104b94745Smrg#! /usr/bin/env perl
25104ee6eSmrg# $XTermId: gen-charsets.pl,v 1.41 2024/10/03 22:01:13 tom Exp $
3f2e35a3aSmrg# -----------------------------------------------------------------------------
4f2e35a3aSmrg# this file is part of xterm
5f2e35a3aSmrg#
604b94745Smrg# Copyright 2018-2023,2024 by Thomas E. Dickey
7f2e35a3aSmrg#
8f2e35a3aSmrg#                         All Rights Reserved
9f2e35a3aSmrg#
10f2e35a3aSmrg# Permission is hereby granted, free of charge, to any person obtaining a
11f2e35a3aSmrg# copy of this software and associated documentation files (the
12f2e35a3aSmrg# "Software"), to deal in the Software without restriction, including
13f2e35a3aSmrg# without limitation the rights to use, copy, modify, merge, publish,
14f2e35a3aSmrg# distribute, sublicense, and/or sell copies of the Software, and to
15f2e35a3aSmrg# permit persons to whom the Software is furnished to do so, subject to
16f2e35a3aSmrg# the following conditions:
17f2e35a3aSmrg#
18f2e35a3aSmrg# The above copyright notice and this permission notice shall be included
19f2e35a3aSmrg# in all copies or substantial portions of the Software.
20f2e35a3aSmrg#
21f2e35a3aSmrg# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
22f2e35a3aSmrg# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
23f2e35a3aSmrg# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
24f2e35a3aSmrg# IN NO EVENT SHALL THE ABOVE LISTED COPYRIGHT HOLDER(S) BE LIABLE FOR ANY
25f2e35a3aSmrg# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
26f2e35a3aSmrg# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
27f2e35a3aSmrg# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
28f2e35a3aSmrg#
29f2e35a3aSmrg# Except as contained in this notice, the name(s) of the above copyright
30f2e35a3aSmrg# holders shall not be used in advertising or otherwise to promote the
31f2e35a3aSmrg# sale, use or other dealings in this Software without prior written
32f2e35a3aSmrg# authorization.
33f2e35a3aSmrg# -----------------------------------------------------------------------------
3404b94745Smrg# Format/maintain xterm's charsets.h header.
3504b94745Smrg
36f2e35a3aSmrguse strict;
3704b94745Smrguse warnings;
3804b94745Smrg
3904b94745Smrguse Getopt::Std;
40f2e35a3aSmrg
41f2e35a3aSmrg$| = 1;
42f2e35a3aSmrg
435104ee6eSmrgour ( $opt_i, $opt_v, $opt_x );
4404b94745Smrgour $undef     = hex(0x2426);
4504b94745Smrgour $head_file = "charsets.h";
4604b94745Smrgour $data_file = "charsets.dat";
4704b94745Smrgour @import;
4804b94745Smrg
4904b94745Smrgour %wide_chars = qw(
5004b94745Smrg  map_DEC_Cyrillic 1
5104b94745Smrg  map_DEC_Greek_Supp 1
5204b94745Smrg  map_DEC_Hebrew_Supp 1
5304b94745Smrg  map_DEC_Technical 1
5404b94745Smrg  map_DEC_Turkish_Supp 1
5504b94745Smrg  map_ISO_Greek_Supp 1
5604b94745Smrg  map_ISO_Hebrew 1
5704b94745Smrg  map_ISO_Latin_2 1
5804b94745Smrg  map_ISO_Latin_5 1
5904b94745Smrg  map_ISO_Latin_Cyrillic 1
605104ee6eSmrg  map_JIS_Roman 1
615104ee6eSmrg  map_JIS_Katakana 1
6204b94745Smrg  map_NRCS_Greek 1
6304b94745Smrg  map_NRCS_Hebrew 1
645104ee6eSmrg  map_NRCS_Serbo_Croatian 1
655104ee6eSmrg  map_NRCS_Russian 1
6604b94745Smrg  map_NRCS_Turkish 1
6704b94745Smrg);
6804b94745Smrg
6904b94745Smrgour $note_1 = "\
7004b94745Smrg#ifndef included_charsets_h
7104b94745Smrg#define included_charsets_h 1";
7204b94745Smrg
7304b94745Smrgour $note_2 = "\
7404b94745Smrg#ifndef PUA
7504b94745Smrg#define PUA(n) (0xEEEE + (n))
7604b94745Smrg#endif
7704b94745Smrg#define UNDEF  0x2426\t\t/* rendered as a backwards \"?\" */";
7804b94745Smrg
7904b94745Smrgour $note_3 = "\
8004b94745Smrg#if OPT_WIDE_CHARS
8104b94745Smrg#define begin_CODEPAGE(size) \\
8204b94745Smrg\tif (!(xw->flags & NATIONAL)) { \\
8304b94745Smrg\t    screen->utf8_nrc_mode++; \\
8404b94745Smrg\t}
8504b94745Smrg#define end_CODEPAGE() \\
8604b94745Smrg\tif (!(xw->flags & NATIONAL)) { \\
8704b94745Smrg\t    screen->utf8_nrc_mode--; \\
8804b94745Smrg\t}
8904b94745Smrg#else
9004b94745Smrg#define begin_CODEPAGE(size)\t/* nothing */
9104b94745Smrg#define end_CODEPAGE()\t\t/* nothing */
9204b94745Smrg#endif";
9304b94745Smrg
9404b94745Smrgsub read_file($) {
9504b94745Smrg    my $file = shift;
96f2e35a3aSmrg    open( FP, $file ) || do {
97f2e35a3aSmrg        print STDERR "Can't open $file: $!\n";
98f2e35a3aSmrg        return;
99f2e35a3aSmrg    };
100f2e35a3aSmrg    my @data = <FP>;
101f2e35a3aSmrg    close(FP);
10204b94745Smrg    chomp @data;
10304b94745Smrg    return @data;
10404b94745Smrg}
10504b94745Smrg
10604b94745Smrg# Translate a Unicode mapping, e.g., for one of the ISO-8859-x codepages,
10704b94745Smrg# into the form used in charsets.c for converting characters.
10804b94745Smrgsub do_import($) {
10904b94745Smrg    my $file = shift;
11004b94745Smrg    my @data = &read_file($file);
111f2e35a3aSmrg    my $name = $file;
112f2e35a3aSmrg    $name =~ s,^.*/,,;
113f2e35a3aSmrg    $name =~ s/\..*$//;
114f2e35a3aSmrg    $name =~ s/^(8859)/ISO-$1/;
115f2e35a3aSmrg    $name =~ s/-/_/g;
116f2e35a3aSmrg    my @target;
117f2e35a3aSmrg    my @noteof;
118f2e35a3aSmrg
11904b94745Smrg    $import[ $#import + 1 ] = sprintf "map_%s", $name;
120f2e35a3aSmrg    for my $n ( 0 .. $#data ) {
121f2e35a3aSmrg        chomp $data[$n];
122f2e35a3aSmrg        $data[$n] =~ s/^\s*//;
123f2e35a3aSmrg        $data[$n] =~ s/\s*$//;
124f2e35a3aSmrg        next if ( $data[$n] =~ /^#/ );
125f2e35a3aSmrg        next if ( $data[$n] eq "" );
126f2e35a3aSmrg        if ( $data[$n] !~ /^0x[[:xdigit:]]+\s+0x[[:xdigit:]]+\s*#/i ) {
127f2e35a3aSmrg            printf STDERR "?? %d:%s\n", $n + 1, $data[$n];
128f2e35a3aSmrg            next;
129f2e35a3aSmrg        }
130f2e35a3aSmrg
131f2e35a3aSmrg        my $source = $data[$n];
132f2e35a3aSmrg        $source =~ s/\s.*//;
133f2e35a3aSmrg        $source = hex($source);
134f2e35a3aSmrg        next if ( $source < 160 or $source > 255 );
135f2e35a3aSmrg        $source -= 128;
136f2e35a3aSmrg
137f2e35a3aSmrg        my $target = $data[$n];
138f2e35a3aSmrg        $target =~ s/^[^\s]+\s+(0x[[:xdigit:]]+).*$/$1/i;
139f2e35a3aSmrg        $target = hex($target);
140f2e35a3aSmrg
141f2e35a3aSmrg        my $noteof = $data[$n];
142f2e35a3aSmrg        $noteof =~ s/^[^#]+#\s*//;
143f2e35a3aSmrg
144f2e35a3aSmrg        $target[$source] = $target;
145f2e35a3aSmrg        $noteof[$source] = $noteof;
146f2e35a3aSmrg    }
147f2e35a3aSmrg    my $lo = $target[32]  ? 32  : 33;
148f2e35a3aSmrg    my $hi = $target[127] ? 127 : 126;
149f2e35a3aSmrg    for my $n ( $lo .. $hi ) {
150f2e35a3aSmrg        if ( defined $target[$n] ) {
15104b94745Smrg            $import[ $#import + 1 ] = sprintf "\t0x%02x\t0x%04x\t\t# %s", $n,
152f2e35a3aSmrg              $target[$n], $noteof[$n];
153f2e35a3aSmrg        }
154f2e35a3aSmrg        else {
15504b94745Smrg            $import[ $#import + 1 ] = sprintf "\t0x%02x\tUNDEF\t\t# undefined",
15604b94745Smrg              $n;
15704b94745Smrg        }
15804b94745Smrg    }
15904b94745Smrg    if ($opt_v) {
16004b94745Smrg        for my $n ( 0 .. $#import ) {
16104b94745Smrg            printf "%s\n", $import[$n];
162f2e35a3aSmrg        }
163f2e35a3aSmrg    }
164f2e35a3aSmrg}
165f2e35a3aSmrg
16604b94745Smrgsub add_text($$) {
16704b94745Smrg    my @head = @{ $_[0] };
16804b94745Smrg    my @note = split /\n/, $_[1];
16904b94745Smrg    for my $n ( 0 .. $#note ) {
17004b94745Smrg        $head[ $#head + 1 ] = $note[$n];
17104b94745Smrg    }
17204b94745Smrg    return @head;
17304b94745Smrg}
17404b94745Smrg
17504b94745Smrgsub end_note($$) {
17604b94745Smrg    my @head = @{ $_[0] };
17704b94745Smrg    my $note = $_[1];
17804b94745Smrg    $head[ $#head + 1 ] = " */";
17904b94745Smrg    my $notes;
18004b94745Smrg    if ( $note == 1 ) {
18104b94745Smrg        $notes = $note_1;
18204b94745Smrg    }
18304b94745Smrg    elsif ( $note == 2 ) {
18404b94745Smrg        $notes = $note_2;
18504b94745Smrg    }
18604b94745Smrg    elsif ( $note == 3 ) {
18704b94745Smrg        $notes = $note_3;
18804b94745Smrg    }
18904b94745Smrg    else {
19004b94745Smrg        $notes = "";
19104b94745Smrg    }
19204b94745Smrg    return &add_text( \@head, $notes );
19304b94745Smrg}
19404b94745Smrg
19504b94745Smrgsub hex_of($) {
19604b94745Smrg    my $text = shift;
19704b94745Smrg    if ($text) {
19804b94745Smrg        $text =~ s/^(0x|u\+)//i;
19904b94745Smrg        $text = "0x" . $text if ( $text =~ /^[[:xdigit:]]+$/ );
20004b94745Smrg    }
20104b94745Smrg    return $text;
202f2e35a3aSmrg}
203f2e35a3aSmrg
20404b94745Smrgsub add($$) {
20504b94745Smrg    my @data = @{ $_[0] };
20604b94745Smrg    my $text = $_[1];
20704b94745Smrg    $data[ $#data + 1 ] = $text;
20804b94745Smrg    return @data;
20904b94745Smrg}
21004b94745Smrg
21104b94745Smrgsub add_unmap($$) {
21204b94745Smrg    my @head  = @{ $_[0] };
21304b94745Smrg    my %unmap = %{ $_[1] };
21404b94745Smrg    my %noted = %{ $_[2] };
21504b94745Smrg    my $title = $_[3];
21604b94745Smrg    my $macro = "un$title";
21704b94745Smrg    $macro .= "(code,dft)" unless ( $macro =~ /\(code/ );
21804b94745Smrg    $macro =~ s/code\)/code,dft\)/;
21904b94745Smrg    @head = &add( \@head, "" );
22004b94745Smrg
22104b94745Smrg    if (%unmap) {
22204b94745Smrg        my @codes = sort keys %unmap;
22304b94745Smrg
22404b94745Smrg        if ( $#codes > 0 ) {
22504b94745Smrg            @head = &add( \@head, "#define $macro \\" );
22604b94745Smrg            @head = &add( \@head, "\tswitch (code) { \\" );
22704b94745Smrg            for my $code ( sort keys %unmap ) {
22804b94745Smrg                my $note = $noted{$code};
22904b94745Smrg                my $pads = " ";
23004b94745Smrg                if ( $title =~ /_NRCS_/ ) {
23104b94745Smrg                    $pads = sprintf( "%*s", 17 - length($code), " " );
23204b94745Smrg                    $note =~ s/\t/ /;
23304b94745Smrg                }
23404b94745Smrg                @head = &add(
23504b94745Smrg                    \@head,
23604b94745Smrg                    sprintf(
23704b94745Smrg                        "\t    MAP(%s,%s%s)%s \\",
23804b94745Smrg                        $code, $pads, $unmap{$code}, $note
23904b94745Smrg                    )
24004b94745Smrg                );
24104b94745Smrg            }
24204b94745Smrg            @head = &add( \@head, "\t    default: dft; break; \\" );
24304b94745Smrg            @head = &add( \@head, "\t}" );
24404b94745Smrg        }
24504b94745Smrg        else {
24604b94745Smrg            @head = &add( \@head, "#define $macro /* nothing? */" );
24704b94745Smrg        }
24804b94745Smrg    }
24904b94745Smrg    else {
25004b94745Smrg        @head = &add( \@head, "#define $macro /* nothing */" );
25104b94745Smrg    }
25204b94745Smrg    return @head;
25304b94745Smrg}
25404b94745Smrg
25504b94745Smrg# Read the current charsets data file, and format a new charsets.h file.
25604b94745Smrgsub do_update($) {
25704b94745Smrg    my $file = shift;
25804b94745Smrg    my @data = &read_file($file);
25904b94745Smrg    return unless ( $#data >= 0 );
26004b94745Smrg    my @head;
26104b94745Smrg    my %noted;
26204b94745Smrg    my %unmap;
26304b94745Smrg    my $title = "";
26404b94745Smrg    my $state = 0;
26504b94745Smrg    my $ended = "";
26604b94745Smrg    my $extra = "";
26704b94745Smrg    my $notes = 0;
26804b94745Smrg    my $codep = 0;
26904b94745Smrg
27004b94745Smrg    for my $n ( 0 .. $#data ) {
27104b94745Smrg        my $data = $data[$n];
27204b94745Smrg        if ( $data =~ /^\s*#/ ) {
27304b94745Smrg            @head = &add( \@head, "/*" ) unless ( $state == 1 );
27404b94745Smrg            $data =~ s/#/ */;
27504b94745Smrg            @head  = &add( \@head, $data );
27604b94745Smrg            $state = 1;
27704b94745Smrg        }
27804b94745Smrg        elsif ( $data =~ /^\s*$/ ) {
27904b94745Smrg            @head = &end_note( \@head, $notes++ ) if ( $state == 1 );
28004b94745Smrg
28104b94745Smrg            if ( $state >= 2 ) {
28204b94745Smrg                @head = &add( \@head, $ended );
28304b94745Smrg                @head = &add_unmap( \@head, \%unmap, \%noted, $title );
28404b94745Smrg                @head = &add( \@head, $extra ) if ( $extra ne "" );
28504b94745Smrg            }
28604b94745Smrg            @head = &add( \@head, "" );
28704b94745Smrg
28804b94745Smrg            $title = "";
28904b94745Smrg            %unmap = ();
29004b94745Smrg            $state = 0;
29104b94745Smrg            $ended = "";
29204b94745Smrg            $extra = "";
29304b94745Smrg        }
29404b94745Smrg        elsif ( $data =~ /^map_/ ) {
29504b94745Smrg            $title = $data;
29604b94745Smrg            @head  = &end_note( \@head, $notes++ ) if ( $state == 1 );
29704b94745Smrg            $state = 2;
29804b94745Smrg            $codep = 0;
29904b94745Smrg            $codep = 94 if ( $data =~ /_DEC_/ );
3005104ee6eSmrg            $codep = 94 if ( $data =~ /_JIS_/ );
30104b94745Smrg            $codep = 96 if ( $data =~ /_ISO_/ );
30204b94745Smrg
30304b94745Smrg            $data .= "(code)" unless ( $data =~ /\(code/ );
30404b94745Smrg
30504b94745Smrg            @head = &add( \@head, sprintf( "#define %s \\",           $data ) );
30604b94745Smrg            @head = &add( \@head, sprintf( "\tbegin_CODEPAGE(%d) \\", $codep ) )
30704b94745Smrg              if ($codep);
30804b94745Smrg            @head = &add( \@head, "\tswitch (code) { \\" );
30904b94745Smrg
31004b94745Smrg            $ended = $codep ? "\t} \\\n\tend_CODEPAGE()" : "\t}";
31104b94745Smrg
31204b94745Smrg            # special case for map_DEC_Supp_Graphic
31304b94745Smrg            if ( $data =~ /\(code,dft\)/ ) {
31404b94745Smrg                $ended = "\t    default: dft; break; \\\n" . $ended;
31504b94745Smrg                $extra = "\n#if OPT_WIDE_CHARS";
31604b94745Smrg            }
31704b94745Smrg        }
31804b94745Smrg        elsif (
31904b94745Smrg            $data =~ /^\s+(0x)?[[:xdigit:]]{2}
32004b94745Smrg                            \s+(BLANK
32104b94745Smrg                               |PUA\(\d\)
32204b94745Smrg                               |UNDEF
32304b94745Smrg                               |XK_\w+
32404b94745Smrg                               |(0x|U\+)?[[:xdigit:]]{1,4})\s*/x
32504b94745Smrg          )
32604b94745Smrg        {
32704b94745Smrg            @head  = &add( \@head, " */" ) if ( $state == 1 );
32804b94745Smrg            $state = 3;
32904b94745Smrg            my $note = "";
33004b94745Smrg            if ( $data =~ /#/ ) {
33104b94745Smrg                $note = $data;
33204b94745Smrg                $note =~ s/[^#]*#\s*//;
33304b94745Smrg                $note = "\t/* $note */" if ( $note ne "" );
33404b94745Smrg                $data =~ s/\s*#.*//;
33504b94745Smrg            }
33604b94745Smrg            $data =~ s/\s+/ /g;
33704b94745Smrg            $data =~ s/^ //;
33804b94745Smrg            $data =~ s/ $//;
33904b94745Smrg            my @fields = split /\s/, $data;
34004b94745Smrg            my $source = &hex_of( $fields[0] );
34104b94745Smrg            my $target = &hex_of( $fields[1] );
34204b94745Smrg            my $intern = &hex_of( $fields[2] );
34304b94745Smrg            my $macros = "UNI";
34404b94745Smrg            $macros = "MAP" if ( $target =~ /^XK_/ );
34504b94745Smrg            $macros = "XXX" if ( $target eq "UNDEF" );
34604b94745Smrg            $macros = "XXX" if ( $target =~ /PUA\(\d\)/ );
34704b94745Smrg
34804b94745Smrg            if ( $target ne $source ) {
34904b94745Smrg                $intern = $source unless ($intern);
35004b94745Smrg            }
35104b94745Smrg            my $item = sprintf( "    %s(%s, %s)", $macros, $source, $target );
35204b94745Smrg
35304b94745Smrg            # fix formatting for the XK_-based VT220 definitions
35404b94745Smrg            if (    $codep == 0
35504b94745Smrg                and $title !~ /(Greek|Hebrew|Turkish)/
35604b94745Smrg                and index( $note, "\t/*" ) == 0 )
35704b94745Smrg            {
35804b94745Smrg                my $pads = 24 - length($item);
35904b94745Smrg                $item .= "\t" if ( $pads > 0 );
36004b94745Smrg            }
36104b94745Smrg            @head = &add( \@head, sprintf( "\t%s%s \\", $item, $note ) );
36204b94745Smrg
36304b94745Smrg            if ( defined $intern ) {
36404b94745Smrg                if ( $source ne $intern ) {
36504b94745Smrg                    $unmap{$source} = $intern;
36604b94745Smrg                    $noted{$source} = $note;
36704b94745Smrg                }
36804b94745Smrg            }
36904b94745Smrg        }
37004b94745Smrg        else {
37104b94745Smrg            printf STDERR "? unexpected data:\n\t%s\n", $data;
37204b94745Smrg        }
37304b94745Smrg    }
37404b94745Smrg    if ( $state >= 2 ) {
37504b94745Smrg        @head = &add( \@head, $ended );
37604b94745Smrg        @head = &add_unmap( \@head, \%unmap, \%noted, $title );
37704b94745Smrg        @head = &add( \@head, $extra ) if ( $extra ne "" );
37804b94745Smrg    }
37904b94745Smrg    @head = &add( \@head, "#else" );
38004b94745Smrg    foreach my $key ( sort keys %wide_chars ) {
38104b94745Smrg        @head =
38204b94745Smrg          &add( \@head, sprintf( "#define %s(code)\t/* nothing */", $key ) );
38304b94745Smrg        @head = &add( \@head, sprintf( "#define un%s(code,dft) dft", $key ) );
38404b94745Smrg    }
38504b94745Smrg    @head = &add( \@head, "#endif /* OPT_WIDE_CHARS */" );
38604b94745Smrg    @head = &add( \@head, "" );
38704b94745Smrg    @head = &add( \@head, "#endif /* included_charsets_h */" );
38804b94745Smrg
38904b94745Smrg    my $origin = $file;
39004b94745Smrg    $origin =~ s/\.dat\b/.h/;
39104b94745Smrg    my $update = $origin . ".new";
39204b94745Smrg    unlink $update;
39304b94745Smrg    open( my $fh, ">", $update )
39404b94745Smrg      or die "Can't open > $update.txt: $!";
39504b94745Smrg    for my $n ( 0 .. $#head ) {
39604b94745Smrg        printf $fh "%s\n", $head[$n];
39704b94745Smrg    }
39804b94745Smrg    close $fh;
39904b94745Smrg
40004b94745Smrg    system("diff -u $origin $update") if $opt_v;
40104b94745Smrg    rename $update, $origin if $opt_x;
40204b94745Smrg    unlink $update;
40304b94745Smrg}
40404b94745Smrg
40504b94745Smrgsub main::HELP_MESSAGE() {
40604b94745Smrg    printf STDERR <<EOF
40704b94745SmrgUsage: $0 [options]
40804b94745Smrg
40904b94745SmrgOptions:
41004b94745Smrg -i       import charset data from Unicode file
41104b94745Smrg -v       verbose
41204b94745Smrg -x       update charsets.h from $data_file
41304b94745SmrgEOF
41404b94745Smrg      ;
41504b94745Smrg    exit 1;
41604b94745Smrg}
41704b94745Smrg
41804b94745Smrg$Getopt::Std::STANDARD_HELP_VERSION = 1;
41904b94745Smrg&getopts('i:vx') || &main::HELP_MESSAGE;
42004b94745Smrg$#ARGV >= 0 && &main::HELP_MESSAGE;
42104b94745Smrg
42204b94745Smrg&do_import($opt_i) if ($opt_i);
42304b94745Smrg&do_update($data_file);
42404b94745Smrg
425f2e35a3aSmrg1;
426