104b94745Smrg#! /usr/bin/env perl 25104ee6eSmrg# $XTermId: gen-charsets.pl,v 1.41 2024/10/03 22:01:13 tom Exp $ 3f2e35a3aSmrg# ----------------------------------------------------------------------------- 4f2e35a3aSmrg# this file is part of xterm 5f2e35a3aSmrg# 604b94745Smrg# Copyright 2018-2023,2024 by Thomas E. Dickey 7f2e35a3aSmrg# 8f2e35a3aSmrg# All Rights Reserved 9f2e35a3aSmrg# 10f2e35a3aSmrg# Permission is hereby granted, free of charge, to any person obtaining a 11f2e35a3aSmrg# copy of this software and associated documentation files (the 12f2e35a3aSmrg# "Software"), to deal in the Software without restriction, including 13f2e35a3aSmrg# without limitation the rights to use, copy, modify, merge, publish, 14f2e35a3aSmrg# distribute, sublicense, and/or sell copies of the Software, and to 15f2e35a3aSmrg# permit persons to whom the Software is furnished to do so, subject to 16f2e35a3aSmrg# the following conditions: 17f2e35a3aSmrg# 18f2e35a3aSmrg# The above copyright notice and this permission notice shall be included 19f2e35a3aSmrg# in all copies or substantial portions of the Software. 20f2e35a3aSmrg# 21f2e35a3aSmrg# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 22f2e35a3aSmrg# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 23f2e35a3aSmrg# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 24f2e35a3aSmrg# IN NO EVENT SHALL THE ABOVE LISTED COPYRIGHT HOLDER(S) BE LIABLE FOR ANY 25f2e35a3aSmrg# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 26f2e35a3aSmrg# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 27f2e35a3aSmrg# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 28f2e35a3aSmrg# 29f2e35a3aSmrg# Except as contained in this notice, the name(s) of the above copyright 30f2e35a3aSmrg# holders shall not be used in advertising or otherwise to promote the 31f2e35a3aSmrg# sale, use or other dealings in this Software without prior written 32f2e35a3aSmrg# authorization. 33f2e35a3aSmrg# ----------------------------------------------------------------------------- 3404b94745Smrg# Format/maintain xterm's charsets.h header. 3504b94745Smrg 36f2e35a3aSmrguse strict; 3704b94745Smrguse warnings; 3804b94745Smrg 3904b94745Smrguse Getopt::Std; 40f2e35a3aSmrg 41f2e35a3aSmrg$| = 1; 42f2e35a3aSmrg 435104ee6eSmrgour ( $opt_i, $opt_v, $opt_x ); 4404b94745Smrgour $undef = hex(0x2426); 4504b94745Smrgour $head_file = "charsets.h"; 4604b94745Smrgour $data_file = "charsets.dat"; 4704b94745Smrgour @import; 4804b94745Smrg 4904b94745Smrgour %wide_chars = qw( 5004b94745Smrg map_DEC_Cyrillic 1 5104b94745Smrg map_DEC_Greek_Supp 1 5204b94745Smrg map_DEC_Hebrew_Supp 1 5304b94745Smrg map_DEC_Technical 1 5404b94745Smrg map_DEC_Turkish_Supp 1 5504b94745Smrg map_ISO_Greek_Supp 1 5604b94745Smrg map_ISO_Hebrew 1 5704b94745Smrg map_ISO_Latin_2 1 5804b94745Smrg map_ISO_Latin_5 1 5904b94745Smrg map_ISO_Latin_Cyrillic 1 605104ee6eSmrg map_JIS_Roman 1 615104ee6eSmrg map_JIS_Katakana 1 6204b94745Smrg map_NRCS_Greek 1 6304b94745Smrg map_NRCS_Hebrew 1 645104ee6eSmrg map_NRCS_Serbo_Croatian 1 655104ee6eSmrg map_NRCS_Russian 1 6604b94745Smrg map_NRCS_Turkish 1 6704b94745Smrg); 6804b94745Smrg 6904b94745Smrgour $note_1 = "\ 7004b94745Smrg#ifndef included_charsets_h 7104b94745Smrg#define included_charsets_h 1"; 7204b94745Smrg 7304b94745Smrgour $note_2 = "\ 7404b94745Smrg#ifndef PUA 7504b94745Smrg#define PUA(n) (0xEEEE + (n)) 7604b94745Smrg#endif 7704b94745Smrg#define UNDEF 0x2426\t\t/* rendered as a backwards \"?\" */"; 7804b94745Smrg 7904b94745Smrgour $note_3 = "\ 8004b94745Smrg#if OPT_WIDE_CHARS 8104b94745Smrg#define begin_CODEPAGE(size) \\ 8204b94745Smrg\tif (!(xw->flags & NATIONAL)) { \\ 8304b94745Smrg\t screen->utf8_nrc_mode++; \\ 8404b94745Smrg\t} 8504b94745Smrg#define end_CODEPAGE() \\ 8604b94745Smrg\tif (!(xw->flags & NATIONAL)) { \\ 8704b94745Smrg\t screen->utf8_nrc_mode--; \\ 8804b94745Smrg\t} 8904b94745Smrg#else 9004b94745Smrg#define begin_CODEPAGE(size)\t/* nothing */ 9104b94745Smrg#define end_CODEPAGE()\t\t/* nothing */ 9204b94745Smrg#endif"; 9304b94745Smrg 9404b94745Smrgsub read_file($) { 9504b94745Smrg my $file = shift; 96f2e35a3aSmrg open( FP, $file ) || do { 97f2e35a3aSmrg print STDERR "Can't open $file: $!\n"; 98f2e35a3aSmrg return; 99f2e35a3aSmrg }; 100f2e35a3aSmrg my @data = <FP>; 101f2e35a3aSmrg close(FP); 10204b94745Smrg chomp @data; 10304b94745Smrg return @data; 10404b94745Smrg} 10504b94745Smrg 10604b94745Smrg# Translate a Unicode mapping, e.g., for one of the ISO-8859-x codepages, 10704b94745Smrg# into the form used in charsets.c for converting characters. 10804b94745Smrgsub do_import($) { 10904b94745Smrg my $file = shift; 11004b94745Smrg my @data = &read_file($file); 111f2e35a3aSmrg my $name = $file; 112f2e35a3aSmrg $name =~ s,^.*/,,; 113f2e35a3aSmrg $name =~ s/\..*$//; 114f2e35a3aSmrg $name =~ s/^(8859)/ISO-$1/; 115f2e35a3aSmrg $name =~ s/-/_/g; 116f2e35a3aSmrg my @target; 117f2e35a3aSmrg my @noteof; 118f2e35a3aSmrg 11904b94745Smrg $import[ $#import + 1 ] = sprintf "map_%s", $name; 120f2e35a3aSmrg for my $n ( 0 .. $#data ) { 121f2e35a3aSmrg chomp $data[$n]; 122f2e35a3aSmrg $data[$n] =~ s/^\s*//; 123f2e35a3aSmrg $data[$n] =~ s/\s*$//; 124f2e35a3aSmrg next if ( $data[$n] =~ /^#/ ); 125f2e35a3aSmrg next if ( $data[$n] eq "" ); 126f2e35a3aSmrg if ( $data[$n] !~ /^0x[[:xdigit:]]+\s+0x[[:xdigit:]]+\s*#/i ) { 127f2e35a3aSmrg printf STDERR "?? %d:%s\n", $n + 1, $data[$n]; 128f2e35a3aSmrg next; 129f2e35a3aSmrg } 130f2e35a3aSmrg 131f2e35a3aSmrg my $source = $data[$n]; 132f2e35a3aSmrg $source =~ s/\s.*//; 133f2e35a3aSmrg $source = hex($source); 134f2e35a3aSmrg next if ( $source < 160 or $source > 255 ); 135f2e35a3aSmrg $source -= 128; 136f2e35a3aSmrg 137f2e35a3aSmrg my $target = $data[$n]; 138f2e35a3aSmrg $target =~ s/^[^\s]+\s+(0x[[:xdigit:]]+).*$/$1/i; 139f2e35a3aSmrg $target = hex($target); 140f2e35a3aSmrg 141f2e35a3aSmrg my $noteof = $data[$n]; 142f2e35a3aSmrg $noteof =~ s/^[^#]+#\s*//; 143f2e35a3aSmrg 144f2e35a3aSmrg $target[$source] = $target; 145f2e35a3aSmrg $noteof[$source] = $noteof; 146f2e35a3aSmrg } 147f2e35a3aSmrg my $lo = $target[32] ? 32 : 33; 148f2e35a3aSmrg my $hi = $target[127] ? 127 : 126; 149f2e35a3aSmrg for my $n ( $lo .. $hi ) { 150f2e35a3aSmrg if ( defined $target[$n] ) { 15104b94745Smrg $import[ $#import + 1 ] = sprintf "\t0x%02x\t0x%04x\t\t# %s", $n, 152f2e35a3aSmrg $target[$n], $noteof[$n]; 153f2e35a3aSmrg } 154f2e35a3aSmrg else { 15504b94745Smrg $import[ $#import + 1 ] = sprintf "\t0x%02x\tUNDEF\t\t# undefined", 15604b94745Smrg $n; 15704b94745Smrg } 15804b94745Smrg } 15904b94745Smrg if ($opt_v) { 16004b94745Smrg for my $n ( 0 .. $#import ) { 16104b94745Smrg printf "%s\n", $import[$n]; 162f2e35a3aSmrg } 163f2e35a3aSmrg } 164f2e35a3aSmrg} 165f2e35a3aSmrg 16604b94745Smrgsub add_text($$) { 16704b94745Smrg my @head = @{ $_[0] }; 16804b94745Smrg my @note = split /\n/, $_[1]; 16904b94745Smrg for my $n ( 0 .. $#note ) { 17004b94745Smrg $head[ $#head + 1 ] = $note[$n]; 17104b94745Smrg } 17204b94745Smrg return @head; 17304b94745Smrg} 17404b94745Smrg 17504b94745Smrgsub end_note($$) { 17604b94745Smrg my @head = @{ $_[0] }; 17704b94745Smrg my $note = $_[1]; 17804b94745Smrg $head[ $#head + 1 ] = " */"; 17904b94745Smrg my $notes; 18004b94745Smrg if ( $note == 1 ) { 18104b94745Smrg $notes = $note_1; 18204b94745Smrg } 18304b94745Smrg elsif ( $note == 2 ) { 18404b94745Smrg $notes = $note_2; 18504b94745Smrg } 18604b94745Smrg elsif ( $note == 3 ) { 18704b94745Smrg $notes = $note_3; 18804b94745Smrg } 18904b94745Smrg else { 19004b94745Smrg $notes = ""; 19104b94745Smrg } 19204b94745Smrg return &add_text( \@head, $notes ); 19304b94745Smrg} 19404b94745Smrg 19504b94745Smrgsub hex_of($) { 19604b94745Smrg my $text = shift; 19704b94745Smrg if ($text) { 19804b94745Smrg $text =~ s/^(0x|u\+)//i; 19904b94745Smrg $text = "0x" . $text if ( $text =~ /^[[:xdigit:]]+$/ ); 20004b94745Smrg } 20104b94745Smrg return $text; 202f2e35a3aSmrg} 203f2e35a3aSmrg 20404b94745Smrgsub add($$) { 20504b94745Smrg my @data = @{ $_[0] }; 20604b94745Smrg my $text = $_[1]; 20704b94745Smrg $data[ $#data + 1 ] = $text; 20804b94745Smrg return @data; 20904b94745Smrg} 21004b94745Smrg 21104b94745Smrgsub add_unmap($$) { 21204b94745Smrg my @head = @{ $_[0] }; 21304b94745Smrg my %unmap = %{ $_[1] }; 21404b94745Smrg my %noted = %{ $_[2] }; 21504b94745Smrg my $title = $_[3]; 21604b94745Smrg my $macro = "un$title"; 21704b94745Smrg $macro .= "(code,dft)" unless ( $macro =~ /\(code/ ); 21804b94745Smrg $macro =~ s/code\)/code,dft\)/; 21904b94745Smrg @head = &add( \@head, "" ); 22004b94745Smrg 22104b94745Smrg if (%unmap) { 22204b94745Smrg my @codes = sort keys %unmap; 22304b94745Smrg 22404b94745Smrg if ( $#codes > 0 ) { 22504b94745Smrg @head = &add( \@head, "#define $macro \\" ); 22604b94745Smrg @head = &add( \@head, "\tswitch (code) { \\" ); 22704b94745Smrg for my $code ( sort keys %unmap ) { 22804b94745Smrg my $note = $noted{$code}; 22904b94745Smrg my $pads = " "; 23004b94745Smrg if ( $title =~ /_NRCS_/ ) { 23104b94745Smrg $pads = sprintf( "%*s", 17 - length($code), " " ); 23204b94745Smrg $note =~ s/\t/ /; 23304b94745Smrg } 23404b94745Smrg @head = &add( 23504b94745Smrg \@head, 23604b94745Smrg sprintf( 23704b94745Smrg "\t MAP(%s,%s%s)%s \\", 23804b94745Smrg $code, $pads, $unmap{$code}, $note 23904b94745Smrg ) 24004b94745Smrg ); 24104b94745Smrg } 24204b94745Smrg @head = &add( \@head, "\t default: dft; break; \\" ); 24304b94745Smrg @head = &add( \@head, "\t}" ); 24404b94745Smrg } 24504b94745Smrg else { 24604b94745Smrg @head = &add( \@head, "#define $macro /* nothing? */" ); 24704b94745Smrg } 24804b94745Smrg } 24904b94745Smrg else { 25004b94745Smrg @head = &add( \@head, "#define $macro /* nothing */" ); 25104b94745Smrg } 25204b94745Smrg return @head; 25304b94745Smrg} 25404b94745Smrg 25504b94745Smrg# Read the current charsets data file, and format a new charsets.h file. 25604b94745Smrgsub do_update($) { 25704b94745Smrg my $file = shift; 25804b94745Smrg my @data = &read_file($file); 25904b94745Smrg return unless ( $#data >= 0 ); 26004b94745Smrg my @head; 26104b94745Smrg my %noted; 26204b94745Smrg my %unmap; 26304b94745Smrg my $title = ""; 26404b94745Smrg my $state = 0; 26504b94745Smrg my $ended = ""; 26604b94745Smrg my $extra = ""; 26704b94745Smrg my $notes = 0; 26804b94745Smrg my $codep = 0; 26904b94745Smrg 27004b94745Smrg for my $n ( 0 .. $#data ) { 27104b94745Smrg my $data = $data[$n]; 27204b94745Smrg if ( $data =~ /^\s*#/ ) { 27304b94745Smrg @head = &add( \@head, "/*" ) unless ( $state == 1 ); 27404b94745Smrg $data =~ s/#/ */; 27504b94745Smrg @head = &add( \@head, $data ); 27604b94745Smrg $state = 1; 27704b94745Smrg } 27804b94745Smrg elsif ( $data =~ /^\s*$/ ) { 27904b94745Smrg @head = &end_note( \@head, $notes++ ) if ( $state == 1 ); 28004b94745Smrg 28104b94745Smrg if ( $state >= 2 ) { 28204b94745Smrg @head = &add( \@head, $ended ); 28304b94745Smrg @head = &add_unmap( \@head, \%unmap, \%noted, $title ); 28404b94745Smrg @head = &add( \@head, $extra ) if ( $extra ne "" ); 28504b94745Smrg } 28604b94745Smrg @head = &add( \@head, "" ); 28704b94745Smrg 28804b94745Smrg $title = ""; 28904b94745Smrg %unmap = (); 29004b94745Smrg $state = 0; 29104b94745Smrg $ended = ""; 29204b94745Smrg $extra = ""; 29304b94745Smrg } 29404b94745Smrg elsif ( $data =~ /^map_/ ) { 29504b94745Smrg $title = $data; 29604b94745Smrg @head = &end_note( \@head, $notes++ ) if ( $state == 1 ); 29704b94745Smrg $state = 2; 29804b94745Smrg $codep = 0; 29904b94745Smrg $codep = 94 if ( $data =~ /_DEC_/ ); 3005104ee6eSmrg $codep = 94 if ( $data =~ /_JIS_/ ); 30104b94745Smrg $codep = 96 if ( $data =~ /_ISO_/ ); 30204b94745Smrg 30304b94745Smrg $data .= "(code)" unless ( $data =~ /\(code/ ); 30404b94745Smrg 30504b94745Smrg @head = &add( \@head, sprintf( "#define %s \\", $data ) ); 30604b94745Smrg @head = &add( \@head, sprintf( "\tbegin_CODEPAGE(%d) \\", $codep ) ) 30704b94745Smrg if ($codep); 30804b94745Smrg @head = &add( \@head, "\tswitch (code) { \\" ); 30904b94745Smrg 31004b94745Smrg $ended = $codep ? "\t} \\\n\tend_CODEPAGE()" : "\t}"; 31104b94745Smrg 31204b94745Smrg # special case for map_DEC_Supp_Graphic 31304b94745Smrg if ( $data =~ /\(code,dft\)/ ) { 31404b94745Smrg $ended = "\t default: dft; break; \\\n" . $ended; 31504b94745Smrg $extra = "\n#if OPT_WIDE_CHARS"; 31604b94745Smrg } 31704b94745Smrg } 31804b94745Smrg elsif ( 31904b94745Smrg $data =~ /^\s+(0x)?[[:xdigit:]]{2} 32004b94745Smrg \s+(BLANK 32104b94745Smrg |PUA\(\d\) 32204b94745Smrg |UNDEF 32304b94745Smrg |XK_\w+ 32404b94745Smrg |(0x|U\+)?[[:xdigit:]]{1,4})\s*/x 32504b94745Smrg ) 32604b94745Smrg { 32704b94745Smrg @head = &add( \@head, " */" ) if ( $state == 1 ); 32804b94745Smrg $state = 3; 32904b94745Smrg my $note = ""; 33004b94745Smrg if ( $data =~ /#/ ) { 33104b94745Smrg $note = $data; 33204b94745Smrg $note =~ s/[^#]*#\s*//; 33304b94745Smrg $note = "\t/* $note */" if ( $note ne "" ); 33404b94745Smrg $data =~ s/\s*#.*//; 33504b94745Smrg } 33604b94745Smrg $data =~ s/\s+/ /g; 33704b94745Smrg $data =~ s/^ //; 33804b94745Smrg $data =~ s/ $//; 33904b94745Smrg my @fields = split /\s/, $data; 34004b94745Smrg my $source = &hex_of( $fields[0] ); 34104b94745Smrg my $target = &hex_of( $fields[1] ); 34204b94745Smrg my $intern = &hex_of( $fields[2] ); 34304b94745Smrg my $macros = "UNI"; 34404b94745Smrg $macros = "MAP" if ( $target =~ /^XK_/ ); 34504b94745Smrg $macros = "XXX" if ( $target eq "UNDEF" ); 34604b94745Smrg $macros = "XXX" if ( $target =~ /PUA\(\d\)/ ); 34704b94745Smrg 34804b94745Smrg if ( $target ne $source ) { 34904b94745Smrg $intern = $source unless ($intern); 35004b94745Smrg } 35104b94745Smrg my $item = sprintf( " %s(%s, %s)", $macros, $source, $target ); 35204b94745Smrg 35304b94745Smrg # fix formatting for the XK_-based VT220 definitions 35404b94745Smrg if ( $codep == 0 35504b94745Smrg and $title !~ /(Greek|Hebrew|Turkish)/ 35604b94745Smrg and index( $note, "\t/*" ) == 0 ) 35704b94745Smrg { 35804b94745Smrg my $pads = 24 - length($item); 35904b94745Smrg $item .= "\t" if ( $pads > 0 ); 36004b94745Smrg } 36104b94745Smrg @head = &add( \@head, sprintf( "\t%s%s \\", $item, $note ) ); 36204b94745Smrg 36304b94745Smrg if ( defined $intern ) { 36404b94745Smrg if ( $source ne $intern ) { 36504b94745Smrg $unmap{$source} = $intern; 36604b94745Smrg $noted{$source} = $note; 36704b94745Smrg } 36804b94745Smrg } 36904b94745Smrg } 37004b94745Smrg else { 37104b94745Smrg printf STDERR "? unexpected data:\n\t%s\n", $data; 37204b94745Smrg } 37304b94745Smrg } 37404b94745Smrg if ( $state >= 2 ) { 37504b94745Smrg @head = &add( \@head, $ended ); 37604b94745Smrg @head = &add_unmap( \@head, \%unmap, \%noted, $title ); 37704b94745Smrg @head = &add( \@head, $extra ) if ( $extra ne "" ); 37804b94745Smrg } 37904b94745Smrg @head = &add( \@head, "#else" ); 38004b94745Smrg foreach my $key ( sort keys %wide_chars ) { 38104b94745Smrg @head = 38204b94745Smrg &add( \@head, sprintf( "#define %s(code)\t/* nothing */", $key ) ); 38304b94745Smrg @head = &add( \@head, sprintf( "#define un%s(code,dft) dft", $key ) ); 38404b94745Smrg } 38504b94745Smrg @head = &add( \@head, "#endif /* OPT_WIDE_CHARS */" ); 38604b94745Smrg @head = &add( \@head, "" ); 38704b94745Smrg @head = &add( \@head, "#endif /* included_charsets_h */" ); 38804b94745Smrg 38904b94745Smrg my $origin = $file; 39004b94745Smrg $origin =~ s/\.dat\b/.h/; 39104b94745Smrg my $update = $origin . ".new"; 39204b94745Smrg unlink $update; 39304b94745Smrg open( my $fh, ">", $update ) 39404b94745Smrg or die "Can't open > $update.txt: $!"; 39504b94745Smrg for my $n ( 0 .. $#head ) { 39604b94745Smrg printf $fh "%s\n", $head[$n]; 39704b94745Smrg } 39804b94745Smrg close $fh; 39904b94745Smrg 40004b94745Smrg system("diff -u $origin $update") if $opt_v; 40104b94745Smrg rename $update, $origin if $opt_x; 40204b94745Smrg unlink $update; 40304b94745Smrg} 40404b94745Smrg 40504b94745Smrgsub main::HELP_MESSAGE() { 40604b94745Smrg printf STDERR <<EOF 40704b94745SmrgUsage: $0 [options] 40804b94745Smrg 40904b94745SmrgOptions: 41004b94745Smrg -i import charset data from Unicode file 41104b94745Smrg -v verbose 41204b94745Smrg -x update charsets.h from $data_file 41304b94745SmrgEOF 41404b94745Smrg ; 41504b94745Smrg exit 1; 41604b94745Smrg} 41704b94745Smrg 41804b94745Smrg$Getopt::Std::STANDARD_HELP_VERSION = 1; 41904b94745Smrg&getopts('i:vx') || &main::HELP_MESSAGE; 42004b94745Smrg$#ARGV >= 0 && &main::HELP_MESSAGE; 42104b94745Smrg 42204b94745Smrg&do_import($opt_i) if ($opt_i); 42304b94745Smrg&do_update($data_file); 42404b94745Smrg 425f2e35a3aSmrg1; 426