1#! /usr/bin/env perl
2# $XTermId: gen-charsets.pl,v 1.41 2024/10/03 22:01:13 tom Exp $
3# -----------------------------------------------------------------------------
4# this file is part of xterm
5#
6# Copyright 2018-2023,2024 by Thomas E. Dickey
7#
8#                         All Rights Reserved
9#
10# Permission is hereby granted, free of charge, to any person obtaining a
11# copy of this software and associated documentation files (the
12# "Software"), to deal in the Software without restriction, including
13# without limitation the rights to use, copy, modify, merge, publish,
14# distribute, sublicense, and/or sell copies of the Software, and to
15# permit persons to whom the Software is furnished to do so, subject to
16# the following conditions:
17#
18# The above copyright notice and this permission notice shall be included
19# in all copies or substantial portions of the Software.
20#
21# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
22# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
23# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
24# IN NO EVENT SHALL THE ABOVE LISTED COPYRIGHT HOLDER(S) BE LIABLE FOR ANY
25# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
26# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
27# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
28#
29# Except as contained in this notice, the name(s) of the above copyright
30# holders shall not be used in advertising or otherwise to promote the
31# sale, use or other dealings in this Software without prior written
32# authorization.
33# -----------------------------------------------------------------------------
34# Format/maintain xterm's charsets.h header.
35
36use strict;
37use warnings;
38
39use Getopt::Std;
40
41$| = 1;
42
43our ( $opt_i, $opt_v, $opt_x );
44our $undef     = hex(0x2426);
45our $head_file = "charsets.h";
46our $data_file = "charsets.dat";
47our @import;
48
49our %wide_chars = qw(
50  map_DEC_Cyrillic 1
51  map_DEC_Greek_Supp 1
52  map_DEC_Hebrew_Supp 1
53  map_DEC_Technical 1
54  map_DEC_Turkish_Supp 1
55  map_ISO_Greek_Supp 1
56  map_ISO_Hebrew 1
57  map_ISO_Latin_2 1
58  map_ISO_Latin_5 1
59  map_ISO_Latin_Cyrillic 1
60  map_JIS_Roman 1
61  map_JIS_Katakana 1
62  map_NRCS_Greek 1
63  map_NRCS_Hebrew 1
64  map_NRCS_Serbo_Croatian 1
65  map_NRCS_Russian 1
66  map_NRCS_Turkish 1
67);
68
69our $note_1 = "\
70#ifndef included_charsets_h
71#define included_charsets_h 1";
72
73our $note_2 = "\
74#ifndef PUA
75#define PUA(n) (0xEEEE + (n))
76#endif
77#define UNDEF  0x2426\t\t/* rendered as a backwards \"?\" */";
78
79our $note_3 = "\
80#if OPT_WIDE_CHARS
81#define begin_CODEPAGE(size) \\
82\tif (!(xw->flags & NATIONAL)) { \\
83\t    screen->utf8_nrc_mode++; \\
84\t}
85#define end_CODEPAGE() \\
86\tif (!(xw->flags & NATIONAL)) { \\
87\t    screen->utf8_nrc_mode--; \\
88\t}
89#else
90#define begin_CODEPAGE(size)\t/* nothing */
91#define end_CODEPAGE()\t\t/* nothing */
92#endif";
93
94sub read_file($) {
95    my $file = shift;
96    open( FP, $file ) || do {
97        print STDERR "Can't open $file: $!\n";
98        return;
99    };
100    my @data = <FP>;
101    close(FP);
102    chomp @data;
103    return @data;
104}
105
106# Translate a Unicode mapping, e.g., for one of the ISO-8859-x codepages,
107# into the form used in charsets.c for converting characters.
108sub do_import($) {
109    my $file = shift;
110    my @data = &read_file($file);
111    my $name = $file;
112    $name =~ s,^.*/,,;
113    $name =~ s/\..*$//;
114    $name =~ s/^(8859)/ISO-$1/;
115    $name =~ s/-/_/g;
116    my @target;
117    my @noteof;
118
119    $import[ $#import + 1 ] = sprintf "map_%s", $name;
120    for my $n ( 0 .. $#data ) {
121        chomp $data[$n];
122        $data[$n] =~ s/^\s*//;
123        $data[$n] =~ s/\s*$//;
124        next if ( $data[$n] =~ /^#/ );
125        next if ( $data[$n] eq "" );
126        if ( $data[$n] !~ /^0x[[:xdigit:]]+\s+0x[[:xdigit:]]+\s*#/i ) {
127            printf STDERR "?? %d:%s\n", $n + 1, $data[$n];
128            next;
129        }
130
131        my $source = $data[$n];
132        $source =~ s/\s.*//;
133        $source = hex($source);
134        next if ( $source < 160 or $source > 255 );
135        $source -= 128;
136
137        my $target = $data[$n];
138        $target =~ s/^[^\s]+\s+(0x[[:xdigit:]]+).*$/$1/i;
139        $target = hex($target);
140
141        my $noteof = $data[$n];
142        $noteof =~ s/^[^#]+#\s*//;
143
144        $target[$source] = $target;
145        $noteof[$source] = $noteof;
146    }
147    my $lo = $target[32]  ? 32  : 33;
148    my $hi = $target[127] ? 127 : 126;
149    for my $n ( $lo .. $hi ) {
150        if ( defined $target[$n] ) {
151            $import[ $#import + 1 ] = sprintf "\t0x%02x\t0x%04x\t\t# %s", $n,
152              $target[$n], $noteof[$n];
153        }
154        else {
155            $import[ $#import + 1 ] = sprintf "\t0x%02x\tUNDEF\t\t# undefined",
156              $n;
157        }
158    }
159    if ($opt_v) {
160        for my $n ( 0 .. $#import ) {
161            printf "%s\n", $import[$n];
162        }
163    }
164}
165
166sub add_text($$) {
167    my @head = @{ $_[0] };
168    my @note = split /\n/, $_[1];
169    for my $n ( 0 .. $#note ) {
170        $head[ $#head + 1 ] = $note[$n];
171    }
172    return @head;
173}
174
175sub end_note($$) {
176    my @head = @{ $_[0] };
177    my $note = $_[1];
178    $head[ $#head + 1 ] = " */";
179    my $notes;
180    if ( $note == 1 ) {
181        $notes = $note_1;
182    }
183    elsif ( $note == 2 ) {
184        $notes = $note_2;
185    }
186    elsif ( $note == 3 ) {
187        $notes = $note_3;
188    }
189    else {
190        $notes = "";
191    }
192    return &add_text( \@head, $notes );
193}
194
195sub hex_of($) {
196    my $text = shift;
197    if ($text) {
198        $text =~ s/^(0x|u\+)//i;
199        $text = "0x" . $text if ( $text =~ /^[[:xdigit:]]+$/ );
200    }
201    return $text;
202}
203
204sub add($$) {
205    my @data = @{ $_[0] };
206    my $text = $_[1];
207    $data[ $#data + 1 ] = $text;
208    return @data;
209}
210
211sub add_unmap($$) {
212    my @head  = @{ $_[0] };
213    my %unmap = %{ $_[1] };
214    my %noted = %{ $_[2] };
215    my $title = $_[3];
216    my $macro = "un$title";
217    $macro .= "(code,dft)" unless ( $macro =~ /\(code/ );
218    $macro =~ s/code\)/code,dft\)/;
219    @head = &add( \@head, "" );
220
221    if (%unmap) {
222        my @codes = sort keys %unmap;
223
224        if ( $#codes > 0 ) {
225            @head = &add( \@head, "#define $macro \\" );
226            @head = &add( \@head, "\tswitch (code) { \\" );
227            for my $code ( sort keys %unmap ) {
228                my $note = $noted{$code};
229                my $pads = " ";
230                if ( $title =~ /_NRCS_/ ) {
231                    $pads = sprintf( "%*s", 17 - length($code), " " );
232                    $note =~ s/\t/ /;
233                }
234                @head = &add(
235                    \@head,
236                    sprintf(
237                        "\t    MAP(%s,%s%s)%s \\",
238                        $code, $pads, $unmap{$code}, $note
239                    )
240                );
241            }
242            @head = &add( \@head, "\t    default: dft; break; \\" );
243            @head = &add( \@head, "\t}" );
244        }
245        else {
246            @head = &add( \@head, "#define $macro /* nothing? */" );
247        }
248    }
249    else {
250        @head = &add( \@head, "#define $macro /* nothing */" );
251    }
252    return @head;
253}
254
255# Read the current charsets data file, and format a new charsets.h file.
256sub do_update($) {
257    my $file = shift;
258    my @data = &read_file($file);
259    return unless ( $#data >= 0 );
260    my @head;
261    my %noted;
262    my %unmap;
263    my $title = "";
264    my $state = 0;
265    my $ended = "";
266    my $extra = "";
267    my $notes = 0;
268    my $codep = 0;
269
270    for my $n ( 0 .. $#data ) {
271        my $data = $data[$n];
272        if ( $data =~ /^\s*#/ ) {
273            @head = &add( \@head, "/*" ) unless ( $state == 1 );
274            $data =~ s/#/ */;
275            @head  = &add( \@head, $data );
276            $state = 1;
277        }
278        elsif ( $data =~ /^\s*$/ ) {
279            @head = &end_note( \@head, $notes++ ) if ( $state == 1 );
280
281            if ( $state >= 2 ) {
282                @head = &add( \@head, $ended );
283                @head = &add_unmap( \@head, \%unmap, \%noted, $title );
284                @head = &add( \@head, $extra ) if ( $extra ne "" );
285            }
286            @head = &add( \@head, "" );
287
288            $title = "";
289            %unmap = ();
290            $state = 0;
291            $ended = "";
292            $extra = "";
293        }
294        elsif ( $data =~ /^map_/ ) {
295            $title = $data;
296            @head  = &end_note( \@head, $notes++ ) if ( $state == 1 );
297            $state = 2;
298            $codep = 0;
299            $codep = 94 if ( $data =~ /_DEC_/ );
300            $codep = 94 if ( $data =~ /_JIS_/ );
301            $codep = 96 if ( $data =~ /_ISO_/ );
302
303            $data .= "(code)" unless ( $data =~ /\(code/ );
304
305            @head = &add( \@head, sprintf( "#define %s \\",           $data ) );
306            @head = &add( \@head, sprintf( "\tbegin_CODEPAGE(%d) \\", $codep ) )
307              if ($codep);
308            @head = &add( \@head, "\tswitch (code) { \\" );
309
310            $ended = $codep ? "\t} \\\n\tend_CODEPAGE()" : "\t}";
311
312            # special case for map_DEC_Supp_Graphic
313            if ( $data =~ /\(code,dft\)/ ) {
314                $ended = "\t    default: dft; break; \\\n" . $ended;
315                $extra = "\n#if OPT_WIDE_CHARS";
316            }
317        }
318        elsif (
319            $data =~ /^\s+(0x)?[[:xdigit:]]{2}
320                            \s+(BLANK
321                               |PUA\(\d\)
322                               |UNDEF
323                               |XK_\w+
324                               |(0x|U\+)?[[:xdigit:]]{1,4})\s*/x
325          )
326        {
327            @head  = &add( \@head, " */" ) if ( $state == 1 );
328            $state = 3;
329            my $note = "";
330            if ( $data =~ /#/ ) {
331                $note = $data;
332                $note =~ s/[^#]*#\s*//;
333                $note = "\t/* $note */" if ( $note ne "" );
334                $data =~ s/\s*#.*//;
335            }
336            $data =~ s/\s+/ /g;
337            $data =~ s/^ //;
338            $data =~ s/ $//;
339            my @fields = split /\s/, $data;
340            my $source = &hex_of( $fields[0] );
341            my $target = &hex_of( $fields[1] );
342            my $intern = &hex_of( $fields[2] );
343            my $macros = "UNI";
344            $macros = "MAP" if ( $target =~ /^XK_/ );
345            $macros = "XXX" if ( $target eq "UNDEF" );
346            $macros = "XXX" if ( $target =~ /PUA\(\d\)/ );
347
348            if ( $target ne $source ) {
349                $intern = $source unless ($intern);
350            }
351            my $item = sprintf( "    %s(%s, %s)", $macros, $source, $target );
352
353            # fix formatting for the XK_-based VT220 definitions
354            if (    $codep == 0
355                and $title !~ /(Greek|Hebrew|Turkish)/
356                and index( $note, "\t/*" ) == 0 )
357            {
358                my $pads = 24 - length($item);
359                $item .= "\t" if ( $pads > 0 );
360            }
361            @head = &add( \@head, sprintf( "\t%s%s \\", $item, $note ) );
362
363            if ( defined $intern ) {
364                if ( $source ne $intern ) {
365                    $unmap{$source} = $intern;
366                    $noted{$source} = $note;
367                }
368            }
369        }
370        else {
371            printf STDERR "? unexpected data:\n\t%s\n", $data;
372        }
373    }
374    if ( $state >= 2 ) {
375        @head = &add( \@head, $ended );
376        @head = &add_unmap( \@head, \%unmap, \%noted, $title );
377        @head = &add( \@head, $extra ) if ( $extra ne "" );
378    }
379    @head = &add( \@head, "#else" );
380    foreach my $key ( sort keys %wide_chars ) {
381        @head =
382          &add( \@head, sprintf( "#define %s(code)\t/* nothing */", $key ) );
383        @head = &add( \@head, sprintf( "#define un%s(code,dft) dft", $key ) );
384    }
385    @head = &add( \@head, "#endif /* OPT_WIDE_CHARS */" );
386    @head = &add( \@head, "" );
387    @head = &add( \@head, "#endif /* included_charsets_h */" );
388
389    my $origin = $file;
390    $origin =~ s/\.dat\b/.h/;
391    my $update = $origin . ".new";
392    unlink $update;
393    open( my $fh, ">", $update )
394      or die "Can't open > $update.txt: $!";
395    for my $n ( 0 .. $#head ) {
396        printf $fh "%s\n", $head[$n];
397    }
398    close $fh;
399
400    system("diff -u $origin $update") if $opt_v;
401    rename $update, $origin if $opt_x;
402    unlink $update;
403}
404
405sub main::HELP_MESSAGE() {
406    printf STDERR <<EOF
407Usage: $0 [options]
408
409Options:
410 -i       import charset data from Unicode file
411 -v       verbose
412 -x       update charsets.h from $data_file
413EOF
414      ;
415    exit 1;
416}
417
418$Getopt::Std::STANDARD_HELP_VERSION = 1;
419&getopts('i:vx') || &main::HELP_MESSAGE;
420$#ARGV >= 0 && &main::HELP_MESSAGE;
421
422&do_import($opt_i) if ($opt_i);
423&do_update($data_file);
424
4251;
426