1#! /usr/bin/env perl 2# $XTermId: gen-charsets.pl,v 1.41 2024/10/03 22:01:13 tom Exp $ 3# ----------------------------------------------------------------------------- 4# this file is part of xterm 5# 6# Copyright 2018-2023,2024 by Thomas E. Dickey 7# 8# All Rights Reserved 9# 10# Permission is hereby granted, free of charge, to any person obtaining a 11# copy of this software and associated documentation files (the 12# "Software"), to deal in the Software without restriction, including 13# without limitation the rights to use, copy, modify, merge, publish, 14# distribute, sublicense, and/or sell copies of the Software, and to 15# permit persons to whom the Software is furnished to do so, subject to 16# the following conditions: 17# 18# The above copyright notice and this permission notice shall be included 19# in all copies or substantial portions of the Software. 20# 21# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 22# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 23# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 24# IN NO EVENT SHALL THE ABOVE LISTED COPYRIGHT HOLDER(S) BE LIABLE FOR ANY 25# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 26# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 27# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 28# 29# Except as contained in this notice, the name(s) of the above copyright 30# holders shall not be used in advertising or otherwise to promote the 31# sale, use or other dealings in this Software without prior written 32# authorization. 33# ----------------------------------------------------------------------------- 34# Format/maintain xterm's charsets.h header. 35 36use strict; 37use warnings; 38 39use Getopt::Std; 40 41$| = 1; 42 43our ( $opt_i, $opt_v, $opt_x ); 44our $undef = hex(0x2426); 45our $head_file = "charsets.h"; 46our $data_file = "charsets.dat"; 47our @import; 48 49our %wide_chars = qw( 50 map_DEC_Cyrillic 1 51 map_DEC_Greek_Supp 1 52 map_DEC_Hebrew_Supp 1 53 map_DEC_Technical 1 54 map_DEC_Turkish_Supp 1 55 map_ISO_Greek_Supp 1 56 map_ISO_Hebrew 1 57 map_ISO_Latin_2 1 58 map_ISO_Latin_5 1 59 map_ISO_Latin_Cyrillic 1 60 map_JIS_Roman 1 61 map_JIS_Katakana 1 62 map_NRCS_Greek 1 63 map_NRCS_Hebrew 1 64 map_NRCS_Serbo_Croatian 1 65 map_NRCS_Russian 1 66 map_NRCS_Turkish 1 67); 68 69our $note_1 = "\ 70#ifndef included_charsets_h 71#define included_charsets_h 1"; 72 73our $note_2 = "\ 74#ifndef PUA 75#define PUA(n) (0xEEEE + (n)) 76#endif 77#define UNDEF 0x2426\t\t/* rendered as a backwards \"?\" */"; 78 79our $note_3 = "\ 80#if OPT_WIDE_CHARS 81#define begin_CODEPAGE(size) \\ 82\tif (!(xw->flags & NATIONAL)) { \\ 83\t screen->utf8_nrc_mode++; \\ 84\t} 85#define end_CODEPAGE() \\ 86\tif (!(xw->flags & NATIONAL)) { \\ 87\t screen->utf8_nrc_mode--; \\ 88\t} 89#else 90#define begin_CODEPAGE(size)\t/* nothing */ 91#define end_CODEPAGE()\t\t/* nothing */ 92#endif"; 93 94sub read_file($) { 95 my $file = shift; 96 open( FP, $file ) || do { 97 print STDERR "Can't open $file: $!\n"; 98 return; 99 }; 100 my @data = <FP>; 101 close(FP); 102 chomp @data; 103 return @data; 104} 105 106# Translate a Unicode mapping, e.g., for one of the ISO-8859-x codepages, 107# into the form used in charsets.c for converting characters. 108sub do_import($) { 109 my $file = shift; 110 my @data = &read_file($file); 111 my $name = $file; 112 $name =~ s,^.*/,,; 113 $name =~ s/\..*$//; 114 $name =~ s/^(8859)/ISO-$1/; 115 $name =~ s/-/_/g; 116 my @target; 117 my @noteof; 118 119 $import[ $#import + 1 ] = sprintf "map_%s", $name; 120 for my $n ( 0 .. $#data ) { 121 chomp $data[$n]; 122 $data[$n] =~ s/^\s*//; 123 $data[$n] =~ s/\s*$//; 124 next if ( $data[$n] =~ /^#/ ); 125 next if ( $data[$n] eq "" ); 126 if ( $data[$n] !~ /^0x[[:xdigit:]]+\s+0x[[:xdigit:]]+\s*#/i ) { 127 printf STDERR "?? %d:%s\n", $n + 1, $data[$n]; 128 next; 129 } 130 131 my $source = $data[$n]; 132 $source =~ s/\s.*//; 133 $source = hex($source); 134 next if ( $source < 160 or $source > 255 ); 135 $source -= 128; 136 137 my $target = $data[$n]; 138 $target =~ s/^[^\s]+\s+(0x[[:xdigit:]]+).*$/$1/i; 139 $target = hex($target); 140 141 my $noteof = $data[$n]; 142 $noteof =~ s/^[^#]+#\s*//; 143 144 $target[$source] = $target; 145 $noteof[$source] = $noteof; 146 } 147 my $lo = $target[32] ? 32 : 33; 148 my $hi = $target[127] ? 127 : 126; 149 for my $n ( $lo .. $hi ) { 150 if ( defined $target[$n] ) { 151 $import[ $#import + 1 ] = sprintf "\t0x%02x\t0x%04x\t\t# %s", $n, 152 $target[$n], $noteof[$n]; 153 } 154 else { 155 $import[ $#import + 1 ] = sprintf "\t0x%02x\tUNDEF\t\t# undefined", 156 $n; 157 } 158 } 159 if ($opt_v) { 160 for my $n ( 0 .. $#import ) { 161 printf "%s\n", $import[$n]; 162 } 163 } 164} 165 166sub add_text($$) { 167 my @head = @{ $_[0] }; 168 my @note = split /\n/, $_[1]; 169 for my $n ( 0 .. $#note ) { 170 $head[ $#head + 1 ] = $note[$n]; 171 } 172 return @head; 173} 174 175sub end_note($$) { 176 my @head = @{ $_[0] }; 177 my $note = $_[1]; 178 $head[ $#head + 1 ] = " */"; 179 my $notes; 180 if ( $note == 1 ) { 181 $notes = $note_1; 182 } 183 elsif ( $note == 2 ) { 184 $notes = $note_2; 185 } 186 elsif ( $note == 3 ) { 187 $notes = $note_3; 188 } 189 else { 190 $notes = ""; 191 } 192 return &add_text( \@head, $notes ); 193} 194 195sub hex_of($) { 196 my $text = shift; 197 if ($text) { 198 $text =~ s/^(0x|u\+)//i; 199 $text = "0x" . $text if ( $text =~ /^[[:xdigit:]]+$/ ); 200 } 201 return $text; 202} 203 204sub add($$) { 205 my @data = @{ $_[0] }; 206 my $text = $_[1]; 207 $data[ $#data + 1 ] = $text; 208 return @data; 209} 210 211sub add_unmap($$) { 212 my @head = @{ $_[0] }; 213 my %unmap = %{ $_[1] }; 214 my %noted = %{ $_[2] }; 215 my $title = $_[3]; 216 my $macro = "un$title"; 217 $macro .= "(code,dft)" unless ( $macro =~ /\(code/ ); 218 $macro =~ s/code\)/code,dft\)/; 219 @head = &add( \@head, "" ); 220 221 if (%unmap) { 222 my @codes = sort keys %unmap; 223 224 if ( $#codes > 0 ) { 225 @head = &add( \@head, "#define $macro \\" ); 226 @head = &add( \@head, "\tswitch (code) { \\" ); 227 for my $code ( sort keys %unmap ) { 228 my $note = $noted{$code}; 229 my $pads = " "; 230 if ( $title =~ /_NRCS_/ ) { 231 $pads = sprintf( "%*s", 17 - length($code), " " ); 232 $note =~ s/\t/ /; 233 } 234 @head = &add( 235 \@head, 236 sprintf( 237 "\t MAP(%s,%s%s)%s \\", 238 $code, $pads, $unmap{$code}, $note 239 ) 240 ); 241 } 242 @head = &add( \@head, "\t default: dft; break; \\" ); 243 @head = &add( \@head, "\t}" ); 244 } 245 else { 246 @head = &add( \@head, "#define $macro /* nothing? */" ); 247 } 248 } 249 else { 250 @head = &add( \@head, "#define $macro /* nothing */" ); 251 } 252 return @head; 253} 254 255# Read the current charsets data file, and format a new charsets.h file. 256sub do_update($) { 257 my $file = shift; 258 my @data = &read_file($file); 259 return unless ( $#data >= 0 ); 260 my @head; 261 my %noted; 262 my %unmap; 263 my $title = ""; 264 my $state = 0; 265 my $ended = ""; 266 my $extra = ""; 267 my $notes = 0; 268 my $codep = 0; 269 270 for my $n ( 0 .. $#data ) { 271 my $data = $data[$n]; 272 if ( $data =~ /^\s*#/ ) { 273 @head = &add( \@head, "/*" ) unless ( $state == 1 ); 274 $data =~ s/#/ */; 275 @head = &add( \@head, $data ); 276 $state = 1; 277 } 278 elsif ( $data =~ /^\s*$/ ) { 279 @head = &end_note( \@head, $notes++ ) if ( $state == 1 ); 280 281 if ( $state >= 2 ) { 282 @head = &add( \@head, $ended ); 283 @head = &add_unmap( \@head, \%unmap, \%noted, $title ); 284 @head = &add( \@head, $extra ) if ( $extra ne "" ); 285 } 286 @head = &add( \@head, "" ); 287 288 $title = ""; 289 %unmap = (); 290 $state = 0; 291 $ended = ""; 292 $extra = ""; 293 } 294 elsif ( $data =~ /^map_/ ) { 295 $title = $data; 296 @head = &end_note( \@head, $notes++ ) if ( $state == 1 ); 297 $state = 2; 298 $codep = 0; 299 $codep = 94 if ( $data =~ /_DEC_/ ); 300 $codep = 94 if ( $data =~ /_JIS_/ ); 301 $codep = 96 if ( $data =~ /_ISO_/ ); 302 303 $data .= "(code)" unless ( $data =~ /\(code/ ); 304 305 @head = &add( \@head, sprintf( "#define %s \\", $data ) ); 306 @head = &add( \@head, sprintf( "\tbegin_CODEPAGE(%d) \\", $codep ) ) 307 if ($codep); 308 @head = &add( \@head, "\tswitch (code) { \\" ); 309 310 $ended = $codep ? "\t} \\\n\tend_CODEPAGE()" : "\t}"; 311 312 # special case for map_DEC_Supp_Graphic 313 if ( $data =~ /\(code,dft\)/ ) { 314 $ended = "\t default: dft; break; \\\n" . $ended; 315 $extra = "\n#if OPT_WIDE_CHARS"; 316 } 317 } 318 elsif ( 319 $data =~ /^\s+(0x)?[[:xdigit:]]{2} 320 \s+(BLANK 321 |PUA\(\d\) 322 |UNDEF 323 |XK_\w+ 324 |(0x|U\+)?[[:xdigit:]]{1,4})\s*/x 325 ) 326 { 327 @head = &add( \@head, " */" ) if ( $state == 1 ); 328 $state = 3; 329 my $note = ""; 330 if ( $data =~ /#/ ) { 331 $note = $data; 332 $note =~ s/[^#]*#\s*//; 333 $note = "\t/* $note */" if ( $note ne "" ); 334 $data =~ s/\s*#.*//; 335 } 336 $data =~ s/\s+/ /g; 337 $data =~ s/^ //; 338 $data =~ s/ $//; 339 my @fields = split /\s/, $data; 340 my $source = &hex_of( $fields[0] ); 341 my $target = &hex_of( $fields[1] ); 342 my $intern = &hex_of( $fields[2] ); 343 my $macros = "UNI"; 344 $macros = "MAP" if ( $target =~ /^XK_/ ); 345 $macros = "XXX" if ( $target eq "UNDEF" ); 346 $macros = "XXX" if ( $target =~ /PUA\(\d\)/ ); 347 348 if ( $target ne $source ) { 349 $intern = $source unless ($intern); 350 } 351 my $item = sprintf( " %s(%s, %s)", $macros, $source, $target ); 352 353 # fix formatting for the XK_-based VT220 definitions 354 if ( $codep == 0 355 and $title !~ /(Greek|Hebrew|Turkish)/ 356 and index( $note, "\t/*" ) == 0 ) 357 { 358 my $pads = 24 - length($item); 359 $item .= "\t" if ( $pads > 0 ); 360 } 361 @head = &add( \@head, sprintf( "\t%s%s \\", $item, $note ) ); 362 363 if ( defined $intern ) { 364 if ( $source ne $intern ) { 365 $unmap{$source} = $intern; 366 $noted{$source} = $note; 367 } 368 } 369 } 370 else { 371 printf STDERR "? unexpected data:\n\t%s\n", $data; 372 } 373 } 374 if ( $state >= 2 ) { 375 @head = &add( \@head, $ended ); 376 @head = &add_unmap( \@head, \%unmap, \%noted, $title ); 377 @head = &add( \@head, $extra ) if ( $extra ne "" ); 378 } 379 @head = &add( \@head, "#else" ); 380 foreach my $key ( sort keys %wide_chars ) { 381 @head = 382 &add( \@head, sprintf( "#define %s(code)\t/* nothing */", $key ) ); 383 @head = &add( \@head, sprintf( "#define un%s(code,dft) dft", $key ) ); 384 } 385 @head = &add( \@head, "#endif /* OPT_WIDE_CHARS */" ); 386 @head = &add( \@head, "" ); 387 @head = &add( \@head, "#endif /* included_charsets_h */" ); 388 389 my $origin = $file; 390 $origin =~ s/\.dat\b/.h/; 391 my $update = $origin . ".new"; 392 unlink $update; 393 open( my $fh, ">", $update ) 394 or die "Can't open > $update.txt: $!"; 395 for my $n ( 0 .. $#head ) { 396 printf $fh "%s\n", $head[$n]; 397 } 398 close $fh; 399 400 system("diff -u $origin $update") if $opt_v; 401 rename $update, $origin if $opt_x; 402 unlink $update; 403} 404 405sub main::HELP_MESSAGE() { 406 printf STDERR <<EOF 407Usage: $0 [options] 408 409Options: 410 -i import charset data from Unicode file 411 -v verbose 412 -x update charsets.h from $data_file 413EOF 414 ; 415 exit 1; 416} 417 418$Getopt::Std::STANDARD_HELP_VERSION = 1; 419&getopts('i:vx') || &main::HELP_MESSAGE; 420$#ARGV >= 0 && &main::HELP_MESSAGE; 421 422&do_import($opt_i) if ($opt_i); 423&do_update($data_file); 424 4251; 426