gen-charsets.pl revision 04b94745
1#! /usr/bin/env perl
2# $XTermId: gen-charsets.pl,v 1.37 2024/02/09 01:11:52 tom Exp $
3# -----------------------------------------------------------------------------
4# this file is part of xterm
5#
6# Copyright 2018-2023,2024 by Thomas E. Dickey
7#
8#                         All Rights Reserved
9#
10# Permission is hereby granted, free of charge, to any person obtaining a
11# copy of this software and associated documentation files (the
12# "Software"), to deal in the Software without restriction, including
13# without limitation the rights to use, copy, modify, merge, publish,
14# distribute, sublicense, and/or sell copies of the Software, and to
15# permit persons to whom the Software is furnished to do so, subject to
16# the following conditions:
17#
18# The above copyright notice and this permission notice shall be included
19# in all copies or substantial portions of the Software.
20#
21# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
22# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
23# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
24# IN NO EVENT SHALL THE ABOVE LISTED COPYRIGHT HOLDER(S) BE LIABLE FOR ANY
25# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
26# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
27# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
28#
29# Except as contained in this notice, the name(s) of the above copyright
30# holders shall not be used in advertising or otherwise to promote the
31# sale, use or other dealings in this Software without prior written
32# authorization.
33# -----------------------------------------------------------------------------
34# Format/maintain xterm's charsets.h header.
35
36use strict;
37use warnings;
38
39use Getopt::Std;
40
41$| = 1;
42
43our ( $opt_d, $opt_i, $opt_v, $opt_x );
44our $undef     = hex(0x2426);
45our $head_file = "charsets.h";
46our $data_file = "charsets.dat";
47our @import;
48
49our %wide_chars = qw(
50  map_DEC_Cyrillic 1
51  map_DEC_Greek_Supp 1
52  map_DEC_Hebrew_Supp 1
53  map_DEC_Technical 1
54  map_DEC_Turkish_Supp 1
55  map_ISO_Greek_Supp 1
56  map_ISO_Hebrew 1
57  map_ISO_Latin_2 1
58  map_ISO_Latin_5 1
59  map_ISO_Latin_Cyrillic 1
60  map_NRCS_Greek 1
61  map_NRCS_Hebrew 1
62  map_NRCS_Turkish 1
63);
64
65our $note_1 = "\
66#ifndef included_charsets_h
67#define included_charsets_h 1";
68
69our $note_2 = "\
70#ifndef PUA
71#define PUA(n) (0xEEEE + (n))
72#endif
73#define UNDEF  0x2426\t\t/* rendered as a backwards \"?\" */";
74
75our $note_3 = "\
76#if OPT_WIDE_CHARS
77#define begin_CODEPAGE(size) \\
78\tif (!(xw->flags & NATIONAL)) { \\
79\t    screen->utf8_nrc_mode++; \\
80\t}
81#define end_CODEPAGE() \\
82\tif (!(xw->flags & NATIONAL)) { \\
83\t    screen->utf8_nrc_mode--; \\
84\t}
85#else
86#define begin_CODEPAGE(size)\t/* nothing */
87#define end_CODEPAGE()\t\t/* nothing */
88#endif";
89
90sub read_file($) {
91    my $file = shift;
92    open( FP, $file ) || do {
93        print STDERR "Can't open $file: $!\n";
94        return;
95    };
96    my @data = <FP>;
97    close(FP);
98    chomp @data;
99    return @data;
100}
101
102# Translate a Unicode mapping, e.g., for one of the ISO-8859-x codepages,
103# into the form used in charsets.c for converting characters.
104sub do_import($) {
105    my $file = shift;
106    my @data = &read_file($file);
107    my $name = $file;
108    $name =~ s,^.*/,,;
109    $name =~ s/\..*$//;
110    $name =~ s/^(8859)/ISO-$1/;
111    $name =~ s/-/_/g;
112    my @target;
113    my @noteof;
114
115    $import[ $#import + 1 ] = sprintf "map_%s", $name;
116    for my $n ( 0 .. $#data ) {
117        chomp $data[$n];
118        $data[$n] =~ s/^\s*//;
119        $data[$n] =~ s/\s*$//;
120        next if ( $data[$n] =~ /^#/ );
121        next if ( $data[$n] eq "" );
122        if ( $data[$n] !~ /^0x[[:xdigit:]]+\s+0x[[:xdigit:]]+\s*#/i ) {
123            printf STDERR "?? %d:%s\n", $n + 1, $data[$n];
124            next;
125        }
126
127        my $source = $data[$n];
128        $source =~ s/\s.*//;
129        $source = hex($source);
130        next if ( $source < 160 or $source > 255 );
131        $source -= 128;
132
133        my $target = $data[$n];
134        $target =~ s/^[^\s]+\s+(0x[[:xdigit:]]+).*$/$1/i;
135        $target = hex($target);
136
137        my $noteof = $data[$n];
138        $noteof =~ s/^[^#]+#\s*//;
139
140        $target[$source] = $target;
141        $noteof[$source] = $noteof;
142    }
143    my $lo = $target[32]  ? 32  : 33;
144    my $hi = $target[127] ? 127 : 126;
145    for my $n ( $lo .. $hi ) {
146        if ( defined $target[$n] ) {
147            $import[ $#import + 1 ] = sprintf "\t0x%02x\t0x%04x\t\t# %s", $n,
148              $target[$n], $noteof[$n];
149        }
150        else {
151            $import[ $#import + 1 ] = sprintf "\t0x%02x\tUNDEF\t\t# undefined",
152              $n;
153        }
154    }
155    if ($opt_v) {
156        for my $n ( 0 .. $#import ) {
157            printf "%s\n", $import[$n];
158        }
159    }
160}
161
162sub add_text($$) {
163    my @head = @{ $_[0] };
164    my @note = split /\n/, $_[1];
165    for my $n ( 0 .. $#note ) {
166        $head[ $#head + 1 ] = $note[$n];
167    }
168    return @head;
169}
170
171sub end_note($$) {
172    my @head = @{ $_[0] };
173    my $note = $_[1];
174    $head[ $#head + 1 ] = " */";
175    my $notes;
176    if ( $note == 1 ) {
177        $notes = $note_1;
178    }
179    elsif ( $note == 2 ) {
180        $notes = $note_2;
181    }
182    elsif ( $note == 3 ) {
183        $notes = $note_3;
184    }
185    else {
186        $notes = "";
187    }
188    return &add_text( \@head, $notes );
189}
190
191sub hex_of($) {
192    my $text = shift;
193    if ($text) {
194        $text =~ s/^(0x|u\+)//i;
195        $text = "0x" . $text if ( $text =~ /^[[:xdigit:]]+$/ );
196    }
197    return $text;
198}
199
200sub add($$) {
201    my @data = @{ $_[0] };
202    my $text = $_[1];
203    $data[ $#data + 1 ] = $text;
204    return @data;
205}
206
207sub add_unmap($$) {
208    my @head  = @{ $_[0] };
209    my %unmap = %{ $_[1] };
210    my %noted = %{ $_[2] };
211    my $title = $_[3];
212    my $macro = "un$title";
213    $macro .= "(code,dft)" unless ( $macro =~ /\(code/ );
214    $macro =~ s/code\)/code,dft\)/;
215    @head = &add( \@head, "" );
216
217    if (%unmap) {
218        my @codes = sort keys %unmap;
219
220        if ( $#codes > 0 ) {
221            @head = &add( \@head, "#define $macro \\" );
222            @head = &add( \@head, "\tswitch (code) { \\" );
223            for my $code ( sort keys %unmap ) {
224                my $note = $noted{$code};
225                my $pads = " ";
226                if ( $title =~ /_NRCS_/ ) {
227                    $pads = sprintf( "%*s", 17 - length($code), " " );
228                    $note =~ s/\t/ /;
229                }
230                @head = &add(
231                    \@head,
232                    sprintf(
233                        "\t    MAP(%s,%s%s)%s \\",
234                        $code, $pads, $unmap{$code}, $note
235                    )
236                );
237            }
238            @head = &add( \@head, "\t    default: dft; break; \\" );
239            @head = &add( \@head, "\t}" );
240        }
241        else {
242            @head = &add( \@head, "#define $macro /* nothing? */" );
243        }
244    }
245    else {
246        @head = &add( \@head, "#define $macro /* nothing */" );
247    }
248    return @head;
249}
250
251# Read the current charsets data file, and format a new charsets.h file.
252sub do_update($) {
253    my $file = shift;
254    my @data = &read_file($file);
255    return unless ( $#data >= 0 );
256    my @head;
257    my %noted;
258    my %unmap;
259    my $title = "";
260    my $state = 0;
261    my $ended = "";
262    my $extra = "";
263    my $notes = 0;
264    my $codep = 0;
265
266    for my $n ( 0 .. $#data ) {
267        my $data = $data[$n];
268        if ( $data =~ /^\s*#/ ) {
269            @head = &add( \@head, "/*" ) unless ( $state == 1 );
270            $data =~ s/#/ */;
271            @head  = &add( \@head, $data );
272            $state = 1;
273        }
274        elsif ( $data =~ /^\s*$/ ) {
275            @head = &end_note( \@head, $notes++ ) if ( $state == 1 );
276
277            if ( $state >= 2 ) {
278                @head = &add( \@head, $ended );
279                @head = &add_unmap( \@head, \%unmap, \%noted, $title );
280                @head = &add( \@head, $extra ) if ( $extra ne "" );
281            }
282            @head = &add( \@head, "" );
283
284            $title = "";
285            %unmap = ();
286            $state = 0;
287            $ended = "";
288            $extra = "";
289        }
290        elsif ( $data =~ /^map_/ ) {
291            $title = $data;
292            @head  = &end_note( \@head, $notes++ ) if ( $state == 1 );
293            $state = 2;
294            $codep = 0;
295            $codep = 94 if ( $data =~ /_DEC_/ );
296            $codep = 96 if ( $data =~ /_ISO_/ );
297
298            $data .= "(code)" unless ( $data =~ /\(code/ );
299
300            @head = &add( \@head, sprintf( "#define %s \\",           $data ) );
301            @head = &add( \@head, sprintf( "\tbegin_CODEPAGE(%d) \\", $codep ) )
302              if ($codep);
303            @head = &add( \@head, "\tswitch (code) { \\" );
304
305            $ended = $codep ? "\t} \\\n\tend_CODEPAGE()" : "\t}";
306
307            # special case for map_DEC_Supp_Graphic
308            if ( $data =~ /\(code,dft\)/ ) {
309                $ended = "\t    default: dft; break; \\\n" . $ended;
310                $extra = "\n#if OPT_WIDE_CHARS";
311            }
312        }
313        elsif (
314            $data =~ /^\s+(0x)?[[:xdigit:]]{2}
315                            \s+(BLANK
316                               |PUA\(\d\)
317                               |UNDEF
318                               |XK_\w+
319                               |(0x|U\+)?[[:xdigit:]]{1,4})\s*/x
320          )
321        {
322            @head  = &add( \@head, " */" ) if ( $state == 1 );
323            $state = 3;
324            my $note = "";
325            if ( $data =~ /#/ ) {
326                $note = $data;
327                $note =~ s/[^#]*#\s*//;
328                $note = "\t/* $note */" if ( $note ne "" );
329                $data =~ s/\s*#.*//;
330            }
331            $data =~ s/\s+/ /g;
332            $data =~ s/^ //;
333            $data =~ s/ $//;
334            my @fields = split /\s/, $data;
335            my $source = &hex_of( $fields[0] );
336            my $target = &hex_of( $fields[1] );
337            my $intern = &hex_of( $fields[2] );
338            my $macros = "UNI";
339            $macros = "MAP" if ( $target =~ /^XK_/ );
340            $macros = "XXX" if ( $target eq "UNDEF" );
341            $macros = "XXX" if ( $target =~ /PUA\(\d\)/ );
342
343            if ( $target ne $source ) {
344                $intern = $source unless ($intern);
345            }
346            my $item = sprintf( "    %s(%s, %s)", $macros, $source, $target );
347
348            # fix formatting for the XK_-based VT220 definitions
349            if (    $codep == 0
350                and $title !~ /(Greek|Hebrew|Turkish)/
351                and index( $note, "\t/*" ) == 0 )
352            {
353                my $pads = 24 - length($item);
354                $item .= "\t" if ( $pads > 0 );
355            }
356            @head = &add( \@head, sprintf( "\t%s%s \\", $item, $note ) );
357
358            if ( defined $intern ) {
359                if ( $source ne $intern ) {
360                    $unmap{$source} = $intern;
361                    $noted{$source} = $note;
362                }
363            }
364        }
365        else {
366            printf STDERR "? unexpected data:\n\t%s\n", $data;
367        }
368    }
369    if ( $state >= 2 ) {
370        @head = &add( \@head, $ended );
371        @head = &add_unmap( \@head, \%unmap, \%noted, $title );
372        @head = &add( \@head, $extra ) if ( $extra ne "" );
373    }
374    @head = &add( \@head, "#else" );
375    foreach my $key ( sort keys %wide_chars ) {
376        @head =
377          &add( \@head, sprintf( "#define %s(code)\t/* nothing */", $key ) );
378        @head = &add( \@head, sprintf( "#define un%s(code,dft) dft", $key ) );
379    }
380    @head = &add( \@head, "#endif /* OPT_WIDE_CHARS */" );
381    @head = &add( \@head, "" );
382    @head = &add( \@head, "#endif /* included_charsets_h */" );
383
384    my $origin = $file;
385    $origin =~ s/\.dat\b/.h/;
386    my $update = $origin . ".new";
387    unlink $update;
388    open( my $fh, ">", $update )
389      or die "Can't open > $update.txt: $!";
390    for my $n ( 0 .. $#head ) {
391        printf $fh "%s\n", $head[$n];
392    }
393    close $fh;
394
395    system("diff -u $origin $update") if $opt_v;
396    rename $update, $origin if $opt_x;
397    unlink $update;
398}
399
400sub main::HELP_MESSAGE() {
401    printf STDERR <<EOF
402Usage: $0 [options]
403
404Options:
405 -d       debug
406 -i       import charset data from Unicode file
407 -v       verbose
408 -x       update charsets.h from $data_file
409EOF
410      ;
411    exit 1;
412}
413
414$Getopt::Std::STANDARD_HELP_VERSION = 1;
415&getopts('i:vx') || &main::HELP_MESSAGE;
416$#ARGV >= 0 && &main::HELP_MESSAGE;
417
418&do_import($opt_i) if ($opt_i);
419&do_update($data_file);
420
4211;
422