gen-charsets.pl revision 04b94745
1#! /usr/bin/env perl 2# $XTermId: gen-charsets.pl,v 1.37 2024/02/09 01:11:52 tom Exp $ 3# ----------------------------------------------------------------------------- 4# this file is part of xterm 5# 6# Copyright 2018-2023,2024 by Thomas E. Dickey 7# 8# All Rights Reserved 9# 10# Permission is hereby granted, free of charge, to any person obtaining a 11# copy of this software and associated documentation files (the 12# "Software"), to deal in the Software without restriction, including 13# without limitation the rights to use, copy, modify, merge, publish, 14# distribute, sublicense, and/or sell copies of the Software, and to 15# permit persons to whom the Software is furnished to do so, subject to 16# the following conditions: 17# 18# The above copyright notice and this permission notice shall be included 19# in all copies or substantial portions of the Software. 20# 21# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 22# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 23# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 24# IN NO EVENT SHALL THE ABOVE LISTED COPYRIGHT HOLDER(S) BE LIABLE FOR ANY 25# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 26# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 27# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 28# 29# Except as contained in this notice, the name(s) of the above copyright 30# holders shall not be used in advertising or otherwise to promote the 31# sale, use or other dealings in this Software without prior written 32# authorization. 33# ----------------------------------------------------------------------------- 34# Format/maintain xterm's charsets.h header. 35 36use strict; 37use warnings; 38 39use Getopt::Std; 40 41$| = 1; 42 43our ( $opt_d, $opt_i, $opt_v, $opt_x ); 44our $undef = hex(0x2426); 45our $head_file = "charsets.h"; 46our $data_file = "charsets.dat"; 47our @import; 48 49our %wide_chars = qw( 50 map_DEC_Cyrillic 1 51 map_DEC_Greek_Supp 1 52 map_DEC_Hebrew_Supp 1 53 map_DEC_Technical 1 54 map_DEC_Turkish_Supp 1 55 map_ISO_Greek_Supp 1 56 map_ISO_Hebrew 1 57 map_ISO_Latin_2 1 58 map_ISO_Latin_5 1 59 map_ISO_Latin_Cyrillic 1 60 map_NRCS_Greek 1 61 map_NRCS_Hebrew 1 62 map_NRCS_Turkish 1 63); 64 65our $note_1 = "\ 66#ifndef included_charsets_h 67#define included_charsets_h 1"; 68 69our $note_2 = "\ 70#ifndef PUA 71#define PUA(n) (0xEEEE + (n)) 72#endif 73#define UNDEF 0x2426\t\t/* rendered as a backwards \"?\" */"; 74 75our $note_3 = "\ 76#if OPT_WIDE_CHARS 77#define begin_CODEPAGE(size) \\ 78\tif (!(xw->flags & NATIONAL)) { \\ 79\t screen->utf8_nrc_mode++; \\ 80\t} 81#define end_CODEPAGE() \\ 82\tif (!(xw->flags & NATIONAL)) { \\ 83\t screen->utf8_nrc_mode--; \\ 84\t} 85#else 86#define begin_CODEPAGE(size)\t/* nothing */ 87#define end_CODEPAGE()\t\t/* nothing */ 88#endif"; 89 90sub read_file($) { 91 my $file = shift; 92 open( FP, $file ) || do { 93 print STDERR "Can't open $file: $!\n"; 94 return; 95 }; 96 my @data = <FP>; 97 close(FP); 98 chomp @data; 99 return @data; 100} 101 102# Translate a Unicode mapping, e.g., for one of the ISO-8859-x codepages, 103# into the form used in charsets.c for converting characters. 104sub do_import($) { 105 my $file = shift; 106 my @data = &read_file($file); 107 my $name = $file; 108 $name =~ s,^.*/,,; 109 $name =~ s/\..*$//; 110 $name =~ s/^(8859)/ISO-$1/; 111 $name =~ s/-/_/g; 112 my @target; 113 my @noteof; 114 115 $import[ $#import + 1 ] = sprintf "map_%s", $name; 116 for my $n ( 0 .. $#data ) { 117 chomp $data[$n]; 118 $data[$n] =~ s/^\s*//; 119 $data[$n] =~ s/\s*$//; 120 next if ( $data[$n] =~ /^#/ ); 121 next if ( $data[$n] eq "" ); 122 if ( $data[$n] !~ /^0x[[:xdigit:]]+\s+0x[[:xdigit:]]+\s*#/i ) { 123 printf STDERR "?? %d:%s\n", $n + 1, $data[$n]; 124 next; 125 } 126 127 my $source = $data[$n]; 128 $source =~ s/\s.*//; 129 $source = hex($source); 130 next if ( $source < 160 or $source > 255 ); 131 $source -= 128; 132 133 my $target = $data[$n]; 134 $target =~ s/^[^\s]+\s+(0x[[:xdigit:]]+).*$/$1/i; 135 $target = hex($target); 136 137 my $noteof = $data[$n]; 138 $noteof =~ s/^[^#]+#\s*//; 139 140 $target[$source] = $target; 141 $noteof[$source] = $noteof; 142 } 143 my $lo = $target[32] ? 32 : 33; 144 my $hi = $target[127] ? 127 : 126; 145 for my $n ( $lo .. $hi ) { 146 if ( defined $target[$n] ) { 147 $import[ $#import + 1 ] = sprintf "\t0x%02x\t0x%04x\t\t# %s", $n, 148 $target[$n], $noteof[$n]; 149 } 150 else { 151 $import[ $#import + 1 ] = sprintf "\t0x%02x\tUNDEF\t\t# undefined", 152 $n; 153 } 154 } 155 if ($opt_v) { 156 for my $n ( 0 .. $#import ) { 157 printf "%s\n", $import[$n]; 158 } 159 } 160} 161 162sub add_text($$) { 163 my @head = @{ $_[0] }; 164 my @note = split /\n/, $_[1]; 165 for my $n ( 0 .. $#note ) { 166 $head[ $#head + 1 ] = $note[$n]; 167 } 168 return @head; 169} 170 171sub end_note($$) { 172 my @head = @{ $_[0] }; 173 my $note = $_[1]; 174 $head[ $#head + 1 ] = " */"; 175 my $notes; 176 if ( $note == 1 ) { 177 $notes = $note_1; 178 } 179 elsif ( $note == 2 ) { 180 $notes = $note_2; 181 } 182 elsif ( $note == 3 ) { 183 $notes = $note_3; 184 } 185 else { 186 $notes = ""; 187 } 188 return &add_text( \@head, $notes ); 189} 190 191sub hex_of($) { 192 my $text = shift; 193 if ($text) { 194 $text =~ s/^(0x|u\+)//i; 195 $text = "0x" . $text if ( $text =~ /^[[:xdigit:]]+$/ ); 196 } 197 return $text; 198} 199 200sub add($$) { 201 my @data = @{ $_[0] }; 202 my $text = $_[1]; 203 $data[ $#data + 1 ] = $text; 204 return @data; 205} 206 207sub add_unmap($$) { 208 my @head = @{ $_[0] }; 209 my %unmap = %{ $_[1] }; 210 my %noted = %{ $_[2] }; 211 my $title = $_[3]; 212 my $macro = "un$title"; 213 $macro .= "(code,dft)" unless ( $macro =~ /\(code/ ); 214 $macro =~ s/code\)/code,dft\)/; 215 @head = &add( \@head, "" ); 216 217 if (%unmap) { 218 my @codes = sort keys %unmap; 219 220 if ( $#codes > 0 ) { 221 @head = &add( \@head, "#define $macro \\" ); 222 @head = &add( \@head, "\tswitch (code) { \\" ); 223 for my $code ( sort keys %unmap ) { 224 my $note = $noted{$code}; 225 my $pads = " "; 226 if ( $title =~ /_NRCS_/ ) { 227 $pads = sprintf( "%*s", 17 - length($code), " " ); 228 $note =~ s/\t/ /; 229 } 230 @head = &add( 231 \@head, 232 sprintf( 233 "\t MAP(%s,%s%s)%s \\", 234 $code, $pads, $unmap{$code}, $note 235 ) 236 ); 237 } 238 @head = &add( \@head, "\t default: dft; break; \\" ); 239 @head = &add( \@head, "\t}" ); 240 } 241 else { 242 @head = &add( \@head, "#define $macro /* nothing? */" ); 243 } 244 } 245 else { 246 @head = &add( \@head, "#define $macro /* nothing */" ); 247 } 248 return @head; 249} 250 251# Read the current charsets data file, and format a new charsets.h file. 252sub do_update($) { 253 my $file = shift; 254 my @data = &read_file($file); 255 return unless ( $#data >= 0 ); 256 my @head; 257 my %noted; 258 my %unmap; 259 my $title = ""; 260 my $state = 0; 261 my $ended = ""; 262 my $extra = ""; 263 my $notes = 0; 264 my $codep = 0; 265 266 for my $n ( 0 .. $#data ) { 267 my $data = $data[$n]; 268 if ( $data =~ /^\s*#/ ) { 269 @head = &add( \@head, "/*" ) unless ( $state == 1 ); 270 $data =~ s/#/ */; 271 @head = &add( \@head, $data ); 272 $state = 1; 273 } 274 elsif ( $data =~ /^\s*$/ ) { 275 @head = &end_note( \@head, $notes++ ) if ( $state == 1 ); 276 277 if ( $state >= 2 ) { 278 @head = &add( \@head, $ended ); 279 @head = &add_unmap( \@head, \%unmap, \%noted, $title ); 280 @head = &add( \@head, $extra ) if ( $extra ne "" ); 281 } 282 @head = &add( \@head, "" ); 283 284 $title = ""; 285 %unmap = (); 286 $state = 0; 287 $ended = ""; 288 $extra = ""; 289 } 290 elsif ( $data =~ /^map_/ ) { 291 $title = $data; 292 @head = &end_note( \@head, $notes++ ) if ( $state == 1 ); 293 $state = 2; 294 $codep = 0; 295 $codep = 94 if ( $data =~ /_DEC_/ ); 296 $codep = 96 if ( $data =~ /_ISO_/ ); 297 298 $data .= "(code)" unless ( $data =~ /\(code/ ); 299 300 @head = &add( \@head, sprintf( "#define %s \\", $data ) ); 301 @head = &add( \@head, sprintf( "\tbegin_CODEPAGE(%d) \\", $codep ) ) 302 if ($codep); 303 @head = &add( \@head, "\tswitch (code) { \\" ); 304 305 $ended = $codep ? "\t} \\\n\tend_CODEPAGE()" : "\t}"; 306 307 # special case for map_DEC_Supp_Graphic 308 if ( $data =~ /\(code,dft\)/ ) { 309 $ended = "\t default: dft; break; \\\n" . $ended; 310 $extra = "\n#if OPT_WIDE_CHARS"; 311 } 312 } 313 elsif ( 314 $data =~ /^\s+(0x)?[[:xdigit:]]{2} 315 \s+(BLANK 316 |PUA\(\d\) 317 |UNDEF 318 |XK_\w+ 319 |(0x|U\+)?[[:xdigit:]]{1,4})\s*/x 320 ) 321 { 322 @head = &add( \@head, " */" ) if ( $state == 1 ); 323 $state = 3; 324 my $note = ""; 325 if ( $data =~ /#/ ) { 326 $note = $data; 327 $note =~ s/[^#]*#\s*//; 328 $note = "\t/* $note */" if ( $note ne "" ); 329 $data =~ s/\s*#.*//; 330 } 331 $data =~ s/\s+/ /g; 332 $data =~ s/^ //; 333 $data =~ s/ $//; 334 my @fields = split /\s/, $data; 335 my $source = &hex_of( $fields[0] ); 336 my $target = &hex_of( $fields[1] ); 337 my $intern = &hex_of( $fields[2] ); 338 my $macros = "UNI"; 339 $macros = "MAP" if ( $target =~ /^XK_/ ); 340 $macros = "XXX" if ( $target eq "UNDEF" ); 341 $macros = "XXX" if ( $target =~ /PUA\(\d\)/ ); 342 343 if ( $target ne $source ) { 344 $intern = $source unless ($intern); 345 } 346 my $item = sprintf( " %s(%s, %s)", $macros, $source, $target ); 347 348 # fix formatting for the XK_-based VT220 definitions 349 if ( $codep == 0 350 and $title !~ /(Greek|Hebrew|Turkish)/ 351 and index( $note, "\t/*" ) == 0 ) 352 { 353 my $pads = 24 - length($item); 354 $item .= "\t" if ( $pads > 0 ); 355 } 356 @head = &add( \@head, sprintf( "\t%s%s \\", $item, $note ) ); 357 358 if ( defined $intern ) { 359 if ( $source ne $intern ) { 360 $unmap{$source} = $intern; 361 $noted{$source} = $note; 362 } 363 } 364 } 365 else { 366 printf STDERR "? unexpected data:\n\t%s\n", $data; 367 } 368 } 369 if ( $state >= 2 ) { 370 @head = &add( \@head, $ended ); 371 @head = &add_unmap( \@head, \%unmap, \%noted, $title ); 372 @head = &add( \@head, $extra ) if ( $extra ne "" ); 373 } 374 @head = &add( \@head, "#else" ); 375 foreach my $key ( sort keys %wide_chars ) { 376 @head = 377 &add( \@head, sprintf( "#define %s(code)\t/* nothing */", $key ) ); 378 @head = &add( \@head, sprintf( "#define un%s(code,dft) dft", $key ) ); 379 } 380 @head = &add( \@head, "#endif /* OPT_WIDE_CHARS */" ); 381 @head = &add( \@head, "" ); 382 @head = &add( \@head, "#endif /* included_charsets_h */" ); 383 384 my $origin = $file; 385 $origin =~ s/\.dat\b/.h/; 386 my $update = $origin . ".new"; 387 unlink $update; 388 open( my $fh, ">", $update ) 389 or die "Can't open > $update.txt: $!"; 390 for my $n ( 0 .. $#head ) { 391 printf $fh "%s\n", $head[$n]; 392 } 393 close $fh; 394 395 system("diff -u $origin $update") if $opt_v; 396 rename $update, $origin if $opt_x; 397 unlink $update; 398} 399 400sub main::HELP_MESSAGE() { 401 printf STDERR <<EOF 402Usage: $0 [options] 403 404Options: 405 -d debug 406 -i import charset data from Unicode file 407 -v verbose 408 -x update charsets.h from $data_file 409EOF 410 ; 411 exit 1; 412} 413 414$Getopt::Std::STANDARD_HELP_VERSION = 1; 415&getopts('i:vx') || &main::HELP_MESSAGE; 416$#ARGV >= 0 && &main::HELP_MESSAGE; 417 418&do_import($opt_i) if ($opt_i); 419&do_update($data_file); 420 4211; 422