Home | History | Annotate | Line # | Download | only in ctype
      1 #!/usr/bin/env perl
      2 #	$OpenBSD: gen_ctype_utf8.pl,v 1.8 2023/02/16 01:06:01 afresh1 Exp $	#
      3 use 5.022;
      4 use warnings;
      5 
      6 # Copyright (c) 2015 Andrew Fresh <afresh1 (at] openbsd.org>
      7 #
      8 # Permission to use, copy, modify, and distribute this software for any
      9 # purpose with or without fee is hereby granted, provided that the above
     10 # copyright notice and this permission notice appear in all copies.
     11 #
     12 # THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
     13 # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
     14 # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
     15 # ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
     16 # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
     17 # ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
     18 # OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
     19 
     20 use Unicode::UCD v0.610 qw( charinfo charprop prop_invmap );
     21 
     22 my @lists = qw(
     23     ALPHA
     24     CONTROL
     25     DIGIT
     26     GRAPH
     27     LOWER
     28     PUNCT
     29     SPACE
     30     UPPER
     31     XDIGIT
     32     BLANK
     33     PRINT
     34     SPECIAL
     35     PHONOGRAM
     36 
     37     SWIDTH0
     38     SWIDTH1
     39     SWIDTH2
     40 );
     41 
     42 my @maps = qw(
     43     MAPUPPER
     44     MAPLOWER
     45     TODIGIT
     46 );
     47 
     48 my ( $blocks_ranges_ref, $blocks_maps_ref ) = prop_invmap("Block");
     49 
     50 print "/*\t\$" . 'NetBSD' . "\$\t*/\n";
     51 print <<'EOL';
     52 
     53 /*
     54  * COPYRIGHT AND PERMISSION NOTICE
     55  *
     56  * Copyright (c) 1991-2021 Unicode, Inc. All rights reserved.
     57  * Distributed under the Terms of Use in
     58  * https://www.unicode.org/copyright.html.
     59  *
     60  * Permission is hereby granted, free of charge, to any person obtaining
     61  * a copy of the Unicode data files and any associated documentation
     62  * (the "Data Files") or Unicode software and any associated documentation
     63  * (the "Software") to deal in the Data Files or Software
     64  * without restriction, including without limitation the rights to use,
     65  * copy, modify, merge, publish, distribute, and/or sell copies of
     66  * the Data Files or Software, and to permit persons to whom the Data Files
     67  * or Software are furnished to do so, provided that either
     68  * (a) this copyright and permission notice appear with all copies
     69  * of the Data Files or Software, or
     70  * (b) this copyright and permission notice appear in associated
     71  * Documentation.
     72  *
     73  * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
     74  * ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
     75  * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
     76  * NONINFRINGEMENT OF THIRD PARTY RIGHTS.
     77  * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
     78  * NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
     79  * DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
     80  * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
     81  * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
     82  * PERFORMANCE OF THE DATA FILES OR SOFTWARE.
     83  *
     84  * Except as contained in this notice, the name of a copyright holder
     85  * shall not be used in advertising or otherwise to promote the sale,
     86  * use or other dealings in these Data Files or Software without prior
     87  * written authorization of the copyright holder.
     88  */
     89 
     90 ENCODING        "UTF8"
     91 VARIABLE        CODESET=UTF-8
     92 
     93 EOL
     94 
     95 print "/* Unicode Version " . Unicode::UCD::UnicodeVersion() . " */\n";
     96 
     97 for my $i ( 0 .. $#{ $blocks_ranges_ref } ) {
     98 	my $start = $blocks_ranges_ref->[ $i ];
     99 	my $end   = ( $blocks_ranges_ref->[ $i + 1 ] || 0 ) - 1;
    100 
    101 	my $descr = sprintf "U+%04X - U+%04X : %s",
    102 	     $start, $end, $blocks_maps_ref->[$i];
    103 
    104 	warn "$descr\n";
    105 	print "\n/*\n * $descr\n */\n\n";
    106 
    107 	last if $end == -1;
    108 	next if $blocks_maps_ref->[$i] eq 'No_Block';
    109 
    110 	my %info;
    111 	categorize( $_, \%info ) for $start .. $end;
    112 	print_info(%info);
    113 }
    114 
    115 # http://www.unicode.org/reports/tr44/tr44-16.html#General_Category_Values
    116 # Table 12. General_Category Values
    117 #
    118 # Abbr         Long                             Description
    119 # Lu   Uppercase_Letter      an uppercase letter
    120 # Ll   Lowercase_Letter      a lowercase letter
    121 # Lt   Titlecase_Letter      a digraphic character, with first part uppercase
    122 # LC   Cased_Letter          Lu | Ll | Lt
    123 # Lm   Modifier_Letter       a modifier letter
    124 # Lo   Other_Letter          other letters, including syllables and ideographs
    125 # L    Letter                Lu | Ll | Lt | Lm | Lo
    126 # Mn   Nonspacing_Mark       a nonspacing combining mark (zero advance width)
    127 # Mc   Spacing_Mark          a spacing combining mark (positive advance width)
    128 # Me   Enclosing_Mark        an enclosing combining mark
    129 # M    Mark                  Mn | Mc | Me
    130 # Nd   Decimal_Number        a decimal digit
    131 # Nl   Letter_Number         a letterlike numeric character
    132 # No   Other_Number          a numeric character of other type
    133 # N    Number                Nd | Nl | No
    134 # Pc   Connector_Punctuation a connecting punctuation mark, like a tie
    135 # Pd   Dash_Punctuation      a dash or hyphen punctuation mark
    136 # Ps   Open_Punctuation      an opening punctuation mark (of a pair)
    137 # Pe   Close_Punctuation     a closing punctuation mark (of a pair)
    138 # Pi   Initial_Punctuation   an initial quotation mark
    139 # Pf   Final_Punctuation     a final quotation mark
    140 # Po   Other_Punctuation     a punctuation mark of other type
    141 # P    Punctuation           Pc | Pd | Ps | Pe | Pi | Pf | Po
    142 # Sm   Math_Symbol           a symbol of mathematical use
    143 # Sc   Currency_Symbol       a currency sign
    144 # Sk   Modifier_Symbol       a non-letterlike modifier symbol
    145 # So   Other_Symbol          a symbol of other type
    146 # S    Symbol                Sm | Sc | Sk | So
    147 # Zs   Space_Separator       a space character (of various non-zero widths)
    148 # Zl   Line_Separator        U+2028 LINE SEPARATOR only
    149 # Zp   Paragraph_Separator   U+2029 PARAGRAPH SEPARATOR only
    150 # Z    Separator             Zs | Zl | Zp
    151 # Cc   Control               a C0 or C1 control code
    152 # Cf   Format                a format control character
    153 # Cs   Surrogate             a surrogate code point
    154 # Co   Private_Use           a private-use character
    155 # Cn   Unassigned            a reserved unassigned code point or a noncharacter
    156 # C    Other                 Cc | Cf | Cs | Co | Cn
    157 
    158 sub categorize
    159 {
    160 	my ( $code, $info ) = @_;
    161 
    162 	# http://www.unicode.org/L2/L2003/03139-posix-classes.htm
    163 	my $charinfo = charinfo($code);
    164 	return unless $charinfo;
    165 	my $general_category = $charinfo->{category};
    166 	my $gc = substr $general_category, 0, 1;
    167 
    168 	my $is_upper = $general_category eq 'Lu';
    169 	my $is_lower = $general_category eq 'Ll';
    170 	my $is_space = charprop( $code, 'Sentence_Break' ) eq 'Sp';
    171 
    172 	my $is_print;
    173 	my $matched;
    174 	if ( $general_category eq 'Nd' ) {
    175 		push @{ $info->{DIGIT} }, $code;
    176 		$is_print = 1;
    177 		$matched  = 1;
    178 	} elsif ( $gc eq 'P' or $gc eq 'S' ) {
    179 		push @{ $info->{PUNCT} }, $code;
    180 		$is_print = 1;
    181 		$matched  = 1;
    182 	} elsif ( charprop( $code, 'White_Space' ) eq 'Yes' ) {
    183 		push @{ $info->{SPACE} }, $code;
    184 		$is_print = 1 if charprop( $code, 'Grapheme_Base' ) eq 'Yes';
    185 		$matched = 1;
    186 	} elsif ( charprop( $code, 'Alphabetic' ) eq 'Yes' ) {
    187 		push @{ $info->{ALPHA} }, $code
    188 		    if charprop( $code, 'Numeric_Type' ) eq 'None';
    189 		push @{ $info->{LOWER} }, $code if $is_lower;
    190 		push @{ $info->{UPPER} }, $code if $is_upper;
    191 		push @{ $info->{PHONOGRAM} }, $code
    192 		    if $charinfo->{name} =~ /SYLLABLE/
    193 		    or $charinfo->{block} =~ /Syllable/i;
    194 		$is_print = 1;
    195 		$matched  = 1;
    196 	}
    197 
    198 	if ( $general_category eq 'Cc'
    199 		or charprop( $code, 'Grapheme_Cluster_Break' ) eq 'Control' )
    200 	{
    201 		push @{ $info->{CONTROL} }, $code;
    202 		$matched = 1;
    203 	}
    204 
    205 	push @{ $info->{BLANK} }, $code if $is_space;
    206 
    207 	if (
    208 		not(
    209 			   $is_space or $general_category eq 'Cc',
    210 			or $general_category eq 'Ss',
    211 			or $general_category eq 'Cn',
    212 		)
    213 	    )
    214 	{
    215 		push @{ $info->{GRAPH} }, $code;
    216 		push @{ $info->{SPECIAL} }, $code unless $matched;
    217 		$is_print = 1;
    218 	}
    219 	push @{ $info->{PRINT} }, $code if $is_print;
    220 
    221 	if ( charprop( $code, 'Hex_Digit' ) eq 'Yes' ) {
    222 		push @{ $info->{XDIGIT} }, $code;
    223 		$info->{TODIGIT}{$code} = hex chr $code
    224 		    if charprop( $code, 'ASCII_Hex_Digit' ) eq 'Yes';
    225 		$matched = 1;
    226 	}
    227 
    228 	if ($is_lower) {
    229 		my $mapping = ord charprop( $code, 'Simple_Uppercase_Mapping' );
    230 		$info->{MAPUPPER}{$code} = $mapping if $mapping != $code;
    231 	}
    232 
    233 	if ($is_upper) {
    234 		my $mapping = ord charprop( $code, 'Simple_Lowercase_Mapping' );
    235 		$info->{MAPLOWER}{$code} = $mapping if $mapping != $code;
    236 	}
    237 
    238 	{
    239 		my $mapping = charprop( $code, 'Numeric_Value' );
    240 		$info->{TODIGIT}{$code} = $mapping
    241 		    if $mapping =~ /^[0-9]+$/ and chr($code) ne $mapping;
    242 	}
    243 
    244 	if ($is_print) {
    245 		my $columns = codepoint_columns( $code, $charinfo );
    246 		push @{ $info->{"SWIDTH$columns"} }, $code if defined $columns;
    247 	}
    248 }
    249 
    250 sub print_info
    251 {
    252 	my (%info) = @_;
    253 
    254 	my $printed = 0;
    255 
    256 	foreach my $list (@lists) {
    257 		next unless $info{$list};
    258 		$printed = 1;
    259 		print_list( $list => $info{$list} );
    260 	}
    261 
    262 	print "\n" if $printed;
    263 
    264 	foreach my $map (@maps) {
    265 		next unless $info{$map};
    266 		print_map( $map => $info{$map} );
    267 	}
    268 }
    269 
    270 sub print_list
    271 {
    272 	my ( $list, $points ) = @_;
    273 
    274 	my @squished = reverse @{ squish_points($points) };
    275 	my $line = sprintf "%-10s%s", $list, pop @squished;
    276 
    277 	while (@squished) {
    278 		my $item = pop @squished;
    279 
    280 		if ( length("$line  $item") > 80 ) {
    281 			say $line;
    282 			$line = sprintf "%-10s%s", $list, $item;
    283 		} else {
    284 			$line .= "  $item";    # two leading spaces on purpose
    285 		}
    286 	}
    287 
    288 	say $line;
    289 }
    290 
    291 sub print_map
    292 {
    293 	my ( $map, $points ) = @_;
    294 	my $single = '< %s %s >';
    295 	my $range  = '< %s : %s >';
    296 
    297 	my %map;
    298 
    299 	my $adjustment;
    300 	my $last_diff = 0;
    301 	my $first_point;
    302 	my $prev_point;
    303 	foreach my $point ( sort { $a <=> $b } keys %{$points} ) {
    304 		my $diff = $point - $points->{$point};
    305 
    306 		if ( $diff != $last_diff
    307 			or
    308 			( defined $prev_point and $point - 1 != $prev_point ) )
    309 		{
    310 			$first_point = undef;
    311 			$adjustment  = undef;
    312 			$last_diff   = undef;
    313 		}
    314 
    315 		$first_point //= $point;
    316 		$adjustment  //= $points->{$point};
    317 		$last_diff   //= $diff;
    318 
    319 		$prev_point = $point;
    320 
    321 		push @{ $map{$first_point}{$adjustment} }, $point;
    322 	}
    323 
    324 	my @ranges;
    325 
    326 	foreach my $point ( keys %map ) {
    327 		foreach my $adjustment ( keys %{ $map{$point} } ) {
    328 			my $adj =
    329 			    $map eq 'TODIGIT'
    330 			    ? ( $adjustment || '0x0000' )
    331 			    : format_point($adjustment);
    332 			foreach (
    333 				@{ squish_points( $map{$point}{$adjustment} ) }
    334 			    )
    335 			{
    336 				my $format = / - / ? $range : $single;
    337 				my $formatted = sprintf $format, $_, $adj;
    338 				push @ranges, $formatted;
    339 			}
    340 		}
    341 	}
    342 
    343 	printf "%-10s%s\n", $map, $_ for sort @ranges;
    344 }
    345 
    346 sub squish_points
    347 {
    348 	my ($points) = @_;
    349 	my @squished;
    350 
    351 	my $start;
    352 	my $last_point = 0;
    353 
    354 	foreach my $i ( 0 .. $#{$points} + 1 ) {
    355 
    356 		my $point = $points->[$i];
    357 
    358 		if ( defined $point and $point - 1 == $last_point ) {
    359 			$last_point = $point;
    360 			next;
    361 		}
    362 
    363 		if ( defined $start ) {
    364 			if ( $start == $i - 1 ) {
    365 				push @squished,
    366 				    format_point( $points->[$start] );
    367 			}
    368 
    369 			# TODO: This is nice, but breaks print_map
    370 			#elsif ( $start == $i - 2 ) {
    371 			#    push @squished, format_point( $points->[$start] ),
    372 			#        format_point( $points->[ $i - 1 ] );
    373 			#}
    374 			else {
    375 				push @squished, join ' - ',
    376 				    format_point( $points->[$start] ),
    377 				    format_point( $points->[ $i - 1 ] );
    378 			}
    379 		}
    380 
    381 		$start      = $i;
    382 		$last_point = $point;
    383 	}
    384 
    385 	return \@squished;
    386 }
    387 
    388 sub format_point
    389 {
    390 	my ($point) = @_;
    391 	state %make_chr;
    392 	%make_chr = map { $_ => 1 } ( 0 .. 9, 'a' .. 'z', 'A' .. 'Z' )
    393 	    unless %make_chr;
    394 
    395 	my $chr = chr $point;
    396 	return "'$chr'" if $make_chr{$chr};
    397 	return sprintf "0x%04x", $point;
    398 }
    399 
    400 sub codepoint_columns
    401 {
    402 	my ( $code, $charinfo ) = @_;
    403 	return undef unless defined $code;
    404 
    405 	# Private use areas are _most likely_ used by one column glyphs
    406 	return 1 if $charinfo->{category} eq 'Co';
    407 
    408 	return 0 if $charinfo->{category} eq 'Mn';
    409 	return 0 if $charinfo->{category} eq 'Me';
    410 	return 0 if index( $charinfo->{category}, 'C' ) == 0;
    411 
    412 	return 2 if $charinfo->{block} eq 'Hangul Jamo';
    413 	return 2 if $charinfo->{block} eq 'Hangul Jamo Extended-B';
    414 
    415 	{
    416 		my $eaw = charprop( $code, 'East_Asian_Width' );
    417 		return 2 if $eaw eq 'Wide' or $eaw eq 'Fullwidth';
    418 	}
    419 
    420 	return 1;
    421 }
    422 
    423 __END__
    424 =head1 NAME
    425 
    426 gen_ctype_utf8.pl - rebuild  src/share/locale/ctype/en_US.UTF-8.src
    427 
    428 =head1 SYNOPSIS
    429 
    430 gen_ctype_utf8.pl > en_US.UTF-8.src
    431 
    432 =head1 DESCRIPTION
    433 
    434 The perl community does a good job of keeping their Unicode tables up to date
    435 we can reuse their hard work to rebuild our tables.
    436 
    437 We don't directly use the files from the Unicode Consortium instead we use
    438 summary files that we generate.
    439 See L<mklocale(1)> for more information about these files.
    440 
    441 =head1 CAVEATS
    442 
    443 Requires perl 5.22 or newer.
    444 
    445 =head1 AUTHOR
    446 
    447 Andrew Fresh <afresh1 (at] openbsd.org>
    448