Home | History | Annotate | Line # | Download | only in libcpp
      1 /* Make uname2c.h from various sources.
      2    Copyright (C) 2005-2024 Free Software Foundation, Inc.
      3    Contributed by Jakub Jelinek <jakub (at) redhat.com>
      4 
      5 This program is free software; you can redistribute it and/or modify it
      6 under the terms of the GNU General Public License as published by the
      7 Free Software Foundation; either version 3, or (at your option) any
      8 later version.
      9 
     10 This program is distributed in the hope that it will be useful,
     11 but WITHOUT ANY WARRANTY; without even the implied warranty of
     12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     13 GNU General Public License for more details.
     14 
     15 You should have received a copy of the GNU General Public License
     16 along with this program; see the file COPYING3.  If not see
     17 <http://www.gnu.org/licenses/>.  */
     18 
     19 /* Run this program as
     20    ./makeuname2c UnicodeData.txt NameAliases.txt > uname2c.h
     21 
     22    This program generates 2 big arrays and 2 small ones.
     23    The large ones are uname2c_dict, initialized by string literal
     24    representing dictionary, and uname2c_tree, which is a space optimized
     25    radix tree.
     26    The format of the radix tree is:
     27    byte 0	either 0x80 + (key[0] - ' ')	(if key_len == 1)
     28 		or key_len			(otherwise)
     29 		either of them ored with 0x40 if it has a codepoint
     30    byte 1	LSB of offset into uname2c_dict for key	(only if key_len > 1)
     31    byte 2	MSB of offset into uname2c_dict for key	(only if key_len > 1)
     32 		if key_len == 1, the above 2 bytes are omitted
     33    byte 3	LSB of codepoint (only if it has a codepoint)
     34    byte 4	middle byte of codepoint (ditto)
     35    byte 5	MSB of codepoint (ditto), ored with 0x80 if node has children
     36 				   ored with 0x40 if it doesn't have siblings
     37 		if it doesn't have a codepoint, the above 3 bytes are omitted
     38 		and we assume that the node has children
     39    byte 6, 7, 8	uleb128 encoded offset to first child relative to the end
     40 		of the uleb128 (only if node has children)
     41    byte 9	0xff (only if node doesn't have a codepoint and doesn't
     42 		      have siblings)
     43 
     44    For prefixes of Unicode NR1 or NR2 rule generated names, on a node
     45    representing end of the prefix codepoint is 0xd800 + index into
     46    uname2c_generated array with indexes into uname2c_pairs array of
     47    code points (low, high) of the ranges terminated by single 0.
     48    0xd800 is NR1 rule (Hangul syllables), rest are NR2 rules.
     49 */
     50 
     51 #include <assert.h>
     52 #include <stdio.h>
     53 #include <string.h>
     54 #include <stdint.h>
     55 #include <ctype.h>
     56 #include <limits.h>
     57 #include <stdarg.h>
     58 #include <stdbool.h>
     59 #include <stdlib.h>
     60 
     61 #define ARRAY_SIZE(a) (sizeof (a) / sizeof ((a)[0]))
     62 
     63 #define NUM_CODE_POINTS 0x110000
     64 #define MAX_CODE_POINT 0x10ffff
     65 #define NO_VALUE 0xdc00
     66 #define GENERATED 0xd800
     67 
     68 struct entry { const char *name; unsigned long codepoint; };
     69 static struct entry *entries;
     70 static unsigned long num_allocated, num_entries;
     71 
     72 /* Unicode 15.1 Table 4-8.  */
     73 struct generated {
     74   const char *prefix;
     75   /* max_high is a workaround for UnicodeData.txt inconsistencies
     76      on a few CJK UNIFIED IDEOGRAPH- ranges where the "*, Last>"
     77      entry is a few code points above the end of the range.  */
     78   unsigned long low, high, max_high;
     79   int idx, ok;
     80 };
     81 static struct generated generated_ranges[] =
     82 { { "HANGUL SYLLABLE ", 0xac00, 0xd7a3, 0, 0, 0 }, /* NR1 rule */
     83   { "CJK UNIFIED IDEOGRAPH-", 0x3400, 0x4dbf, 0, 1, 0 }, /* NR2 rules */
     84   { "CJK UNIFIED IDEOGRAPH-", 0x4e00, 0x9fff, 0, 1, 0 },
     85   { "CJK UNIFIED IDEOGRAPH-", 0x20000, 0x2a6df, 0, 1, 0 },
     86   { "CJK UNIFIED IDEOGRAPH-", 0x2a700, 0x2b739, 0, 1, 0 },
     87   { "CJK UNIFIED IDEOGRAPH-", 0x2b740, 0x2b81d, 0, 1, 0 },
     88   { "CJK UNIFIED IDEOGRAPH-", 0x2b820, 0x2cea1, 0, 1, 0 },
     89   { "CJK UNIFIED IDEOGRAPH-", 0x2ceb0, 0x2ebe0, 0, 1, 0 },
     90   { "CJK UNIFIED IDEOGRAPH-", 0x2ebf0, 0x2ee5d, 0, 1, 0 },
     91   { "CJK UNIFIED IDEOGRAPH-", 0x30000, 0x3134a, 0, 1, 0 },
     92   { "CJK UNIFIED IDEOGRAPH-", 0x31350, 0x323af, 0, 1, 0 },
     93   { "TANGUT IDEOGRAPH-", 0x17000, 0x187f7, 0, 2, 0 },
     94   { "TANGUT IDEOGRAPH-", 0x18d00, 0x18d08, 0, 2, 0 },
     95   { "KHITAN SMALL SCRIPT CHARACTER-", 0x18b00, 0x18cd5, 0, 3, 0 },
     96   { "NUSHU CHARACTER-", 0x1b170, 0x1b2fb, 0, 4, 0 },
     97   { "CJK COMPATIBILITY IDEOGRAPH-", 0xf900, 0xfa6d, 0, 5, 0 },
     98   { "CJK COMPATIBILITY IDEOGRAPH-", 0xfa70, 0xfad9, 0, 5, 0 },
     99   { "CJK COMPATIBILITY IDEOGRAPH-", 0x2f800, 0x2fa1d, 0, 5, 0 }
    100 };
    101 
    102 struct node {
    103   struct node *sibling, *child;
    104   const char *key;
    105   size_t key_len, key_idx, node_size, size_sum, child_off;
    106   unsigned long codepoint;
    107   bool in_dict;
    108 };
    109 static struct node *root, **nodes;
    110 static unsigned long num_nodes;
    111 static size_t dict_size, tree_size, max_entry_len;
    112 static char *dict;
    113 static unsigned char *tree;
    114 
    115 /* Die!  */
    116 
    117 static void
    118 fail (const char *s, ...)
    119 {
    120   va_list ap;
    121 
    122   va_start (ap, s);
    123   vfprintf (stderr, s, ap);
    124   va_end (ap);
    125   fputc ('\n', stderr);
    126   exit (1);
    127 }
    128 
    129 static void *
    130 xmalloc (size_t size)
    131 {
    132   void *ret = malloc (size);
    133 
    134   if (ret == NULL)
    135     fail ("failed to allocate %ld bytes", (long) size);
    136   return ret;
    137 }
    138 
    139 static void *
    140 xrealloc (void *p, size_t size)
    141 {
    142   void *ret = p ? realloc (p, size) : malloc (size);
    143 
    144   if (ret == NULL)
    145     fail ("failed to allocate %ld bytes", (long) size);
    146   return ret;
    147 }
    148 
    149 static int
    150 entrycmp (const void *p1, const void *p2)
    151 {
    152   const struct entry *e1 = (const struct entry *) p1;
    153   const struct entry *e2 = (const struct entry *) p2;
    154   int ret = strcmp (e1->name, e2->name);
    155 
    156   if (ret != 0)
    157     return ret;
    158   if (e1->codepoint < e2->codepoint)
    159     return -1;
    160   if (e1->codepoint > e2->codepoint)
    161     return 1;
    162   return 0;
    163 }
    164 
    165 static int
    166 nodecmp (const void *p1, const void *p2)
    167 {
    168   const struct node *n1 = *(const struct node *const *) p1;
    169   const struct node *n2 = *(const struct node *const *) p2;
    170   if (n1->key_len > n2->key_len)
    171     return -1;
    172   if (n1->key_len < n2->key_len)
    173     return 1;
    174   return memcmp (n1->key, n2->key, n1->key_len);
    175 }
    176 
    177 /* Read UnicodeData.txt and fill in the 'decomp' table to be the
    178    decompositions of characters for which both the character
    179    decomposed and all the code points in the decomposition are valid
    180    for some supported language version, and the 'all_decomp' table to
    181    be the decompositions of all characters without those
    182    constraints.  */
    183 
    184 static void
    185 read_table (char *fname, bool aliases_p)
    186 {
    187   FILE *f = fopen (fname, "r");
    188   const char *sname = aliases_p ? "NameAliases.txt" : "UnicodeData.txt";
    189 
    190   if (!f)
    191     fail ("opening %s", sname);
    192   for (;;)
    193     {
    194       char line[256];
    195       unsigned long codepoint;
    196       const char *name, *aname;
    197       char *l;
    198       size_t i;
    199 
    200       if (!fgets (line, sizeof (line), f))
    201 	break;
    202       codepoint = strtoul (line, &l, 16);
    203       if (l == line && aliases_p)
    204 	{
    205 	  /* NameAliased.txt can contain comments and empty lines.  */
    206 	  if (*line == '#' || *line == '\n')
    207 	    continue;
    208 	}
    209       if (l == line || *l != ';')
    210 	fail ("parsing %s, reading code point", sname);
    211       if (codepoint > MAX_CODE_POINT)
    212 	fail ("parsing %s, code point too large", sname);
    213 
    214       name = l + 1;
    215       do {
    216 	++l;
    217       } while (*l != ';');
    218 
    219       aname = NULL;
    220       if (aliases_p)
    221 	{
    222 	  /* Ignore figment and abbreviation aliases.  */
    223 	  if (strcmp (l + 1, "correction\n") != 0
    224 	      && strcmp (l + 1, "control\n") != 0
    225 	      && strcmp (l + 1, "alternate\n") != 0)
    226 	    continue;
    227 	  i = ARRAY_SIZE (generated_ranges);
    228 	}
    229       else
    230 	{
    231 	  for (i = 0; i < ARRAY_SIZE (generated_ranges); ++i)
    232 	    if (codepoint >= generated_ranges[i].low
    233 		&& codepoint <= generated_ranges[i].max_high)
    234 	      break;
    235 	  if (i != ARRAY_SIZE (generated_ranges))
    236 	    {
    237 	      if (*name == '<' && l[-1] == '>')
    238 		{
    239 		  if (codepoint == generated_ranges[i].low
    240 		      && l - name >= 9
    241 		      && memcmp (l - 8, ", First>", 8) == 0
    242 		      && generated_ranges[i].ok == 0)
    243 		    {
    244 		      generated_ranges[i].ok = INT_MAX - 1;
    245 		      aname = generated_ranges[i].prefix;
    246 		      codepoint = GENERATED + generated_ranges[i].idx;
    247 		    }
    248 		  /* Unfortunately, UnicodeData.txt isn't consistent
    249 		     with the Table 4-8 range endpoints in 3 cases,
    250 		     the ranges are longer there by a few codepoints.
    251 		     So use the max_high hack to avoid verification
    252 		     failures.  */
    253 		  else if (codepoint == generated_ranges[i].max_high
    254 			   && l - name >= 8
    255 			   && memcmp (l - 7, ", Last>", 7) == 0
    256 			   && generated_ranges[i].ok == INT_MAX - 1)
    257 		    {
    258 		      generated_ranges[i].ok = INT_MAX;
    259 		      continue;
    260 		    }
    261 		  else
    262 		    fail ("unexpected generated entry %lx %.*s",
    263 			  codepoint, (int) (l - name), name);
    264 		}
    265 	      else if (codepoint
    266 		       == generated_ranges[i].low + generated_ranges[i].ok
    267 		       && l - name == (strlen (generated_ranges[i].prefix)
    268 				       + (name - 1 - line))
    269 		       && memcmp (name, generated_ranges[i].prefix,
    270 				  strlen (generated_ranges[i].prefix)) == 0
    271 		       && memcmp (name + strlen (generated_ranges[i].prefix),
    272 				  line, name - 1 - line) == 0)
    273 		{
    274 		  ++generated_ranges[i].ok;
    275 		  if (codepoint != generated_ranges[i].low)
    276 		    continue;
    277 		  aname = generated_ranges[i].prefix;
    278 		  codepoint = GENERATED + generated_ranges[i].idx;
    279 		}
    280 	      else
    281 		fail ("unexpected generated entry %lx %.*s",
    282 		      codepoint, (int) (l - name), name);
    283 	      if (aname == generated_ranges[i].prefix)
    284 		{
    285 		  size_t j;
    286 
    287 		  /* Don't add an entry for a generated range where the
    288 		     same prefix has been added already.  */
    289 		  for (j = 0; j < i; ++j)
    290 		    if (generated_ranges[j].idx == generated_ranges[i].idx
    291 			&& generated_ranges[j].ok != 0)
    292 		      break;
    293 		  if (j < i)
    294 		    continue;
    295 		}
    296 	    }
    297 	  else if (*name == '<' && l[-1] == '>')
    298 	    continue;
    299 	}
    300 
    301       if (num_entries == num_allocated)
    302 	{
    303 	  num_allocated = num_allocated ? 2 * num_allocated : 65536;
    304 	  entries = (struct entry *) xrealloc (entries, num_allocated
    305 							* sizeof (entries[0]));
    306 	}
    307 
    308       if (aname == NULL)
    309 	{
    310 	  char *a = (char *) xmalloc (l + 1 - name);
    311 	  if (l - name > max_entry_len)
    312 	    max_entry_len = l - name;
    313 	  memcpy (a, name, l - name);
    314 	  a[l - name] = '\0';
    315 	  aname = a;
    316 	}
    317       entries[num_entries].name = aname;
    318       entries[num_entries++].codepoint = codepoint;
    319     }
    320   if (ferror (f))
    321     fail ("reading %s", sname);
    322   fclose (f);
    323 }
    324 
    325 /* Assumes nodes are added from sorted array, so we never
    326    add any node before existing one, only after it.  */
    327 
    328 static void
    329 node_add (struct node **p, const char *key, size_t key_len,
    330 	  unsigned long codepoint)
    331 {
    332   struct node *n;
    333   size_t i;
    334 
    335   do
    336     {
    337       if (*p == NULL)
    338 	{
    339 	  *p = n = (struct node *) xmalloc (sizeof (struct node));
    340 	  ++num_nodes;
    341 	  assert (key_len);
    342 	  n->sibling = NULL;
    343 	  n->child = NULL;
    344 	  n->key = key;
    345 	  n->key_len = key_len;
    346 	  n->codepoint = codepoint;
    347 	  return;
    348 	}
    349       n = *p;
    350       for (i = 0; i < n->key_len && i < key_len; ++i)
    351 	if (n->key[i] != key[i])
    352 	  break;
    353       if (i == 0)
    354 	{
    355 	  p = &n->sibling;
    356 	  continue;
    357 	}
    358       if (i == n->key_len)
    359 	{
    360 	  assert (key_len > n->key_len);
    361 	  p = &n->child;
    362 	  key += n->key_len;
    363 	  key_len -= n->key_len;
    364 	  continue;
    365 	}
    366       /* Need to split the node.  */
    367       assert (i < key_len);
    368       n = (struct node *) xmalloc (sizeof (struct node));
    369       ++num_nodes;
    370       n->sibling = NULL;
    371       n->child = (*p)->child;
    372       n->key = (*p)->key + i;
    373       n->key_len = (*p)->key_len - i;
    374       n->codepoint = (*p)->codepoint;
    375       (*p)->child = n;
    376       (*p)->key_len = i;
    377       (*p)->codepoint = NO_VALUE;
    378       key += i;
    379       key_len -= i;
    380       p = &n->sibling;
    381     }
    382   while (1);
    383 }
    384 
    385 static void
    386 append_nodes (struct node *n)
    387 {
    388   for (; n; n = n->sibling)
    389     {
    390       nodes[num_nodes++] = n;
    391       append_nodes (n->child);
    392     }
    393 }
    394 
    395 static size_t
    396 sizeof_uleb128 (size_t val)
    397 {
    398   size_t sz = 0;
    399   do
    400     {
    401       val >>= 7;
    402       sz += 1;
    403     }
    404   while (val != 0);
    405   return sz;
    406 }
    407 
    408 static void
    409 size_nodes (struct node *n)
    410 {
    411   if (n->child)
    412     size_nodes (n->child);
    413   if (n->sibling)
    414     size_nodes (n->sibling);
    415   n->node_size = 1 + (n->key_len > 1) * 2;
    416   if (n->codepoint != NO_VALUE)
    417     n->node_size += 3;
    418   else if (n->sibling == NULL)
    419     ++n->node_size;
    420   n->size_sum = 0;
    421   n->child_off = 0;
    422   if (n->sibling)
    423     n->size_sum += n->sibling->size_sum;
    424   if (n->child)
    425     {
    426       n->child_off = n->size_sum + (n->codepoint == NO_VALUE
    427 				    && n->sibling == NULL);
    428       n->node_size += sizeof_uleb128 (n->child_off);
    429     }
    430   n->size_sum += n->node_size;
    431   if (n->child)
    432     n->size_sum += n->child->size_sum;
    433   tree_size += n->node_size;
    434 }
    435 
    436 static void
    437 write_uleb128 (unsigned char *p, size_t val)
    438 {
    439   unsigned char c;
    440   do
    441     {
    442       c = val & 0x7f;
    443       val >>= 7;
    444       if (val)
    445 	c |= 0x80;
    446       *p++ = c;
    447     }
    448   while (val);
    449 }
    450 
    451 static void
    452 write_nodes (struct node *n, size_t off)
    453 {
    454   for (; n; n = n->sibling)
    455     {
    456       assert (off < tree_size && tree[off] == 0);
    457       if (n->key_len > 1)
    458 	{
    459 	  assert (n->key_len < 64);
    460 	  tree[off] = n->key_len;
    461 	}
    462       else
    463 	tree[off] = (n->key[0] - ' ') | 0x80;
    464       assert ((tree[off] & 0x40) == 0);
    465       if (n->codepoint != NO_VALUE)
    466 	tree[off] |= 0x40;
    467       off++;
    468       if (n->key_len > 1)
    469 	{
    470 	  tree[off++] = n->key_idx & 0xff;
    471 	  tree[off++] = (n->key_idx >> 8) & 0xff;
    472 	}
    473       if (n->codepoint != NO_VALUE)
    474 	{
    475 	  assert (n->codepoint < (1L << 21));
    476 	  tree[off++] = n->codepoint & 0xff;
    477 	  tree[off++] = (n->codepoint >> 8) & 0xff;
    478 	  tree[off] = (n->codepoint >> 16) & 0xff;
    479 	  if (n->child)
    480 	    tree[off] |= 0x80;
    481 	  if (!n->sibling)
    482 	    tree[off] |= 0x40;
    483 	  off++;
    484 	}
    485       if (n->child)
    486 	{
    487 	  write_uleb128 (&tree[off], n->child_off);
    488 	  off += sizeof_uleb128 (n->child_off);
    489 	  write_nodes (n->child, off + n->child_off);
    490 	}
    491       if (n->codepoint == NO_VALUE
    492 	  && n->sibling == NULL)
    493 	tree[off++] = 0xff;
    494     }
    495   assert (off <= tree_size);
    496 }
    497 
    498 static void
    499 build_radix_tree (void)
    500 {
    501   size_t i, j, k, key_idx;
    502 
    503   for (i = 0; i < ARRAY_SIZE (generated_ranges); ++i)
    504     if (generated_ranges[i].ok == INT_MAX)
    505       {
    506 	if (generated_ranges[i].max_high - generated_ranges[i].high > 15UL)
    507 	  break;
    508       }
    509     else if (generated_ranges[i].ok == (generated_ranges[i].high
    510 					- generated_ranges[i].low + 1))
    511       {
    512 	if (generated_ranges[i].max_high != generated_ranges[i].high)
    513 	  break;
    514       }
    515     else
    516       break;
    517   if (i < ARRAY_SIZE (generated_ranges))
    518     fail ("uncovered generated range %s %lx %lx",
    519 	  generated_ranges[i].prefix, generated_ranges[i].low,
    520 	  generated_ranges[i].high);
    521   /* Sort entries alphabetically, node_add relies on that.  */
    522   qsort (entries, num_entries, sizeof (struct entry), entrycmp);
    523   for (i = 1; i < num_entries; ++i)
    524     if (i && strcmp (entries[i].name, entries[i - 1].name) == 0)
    525       fail ("multiple entries for name %s", entries[i].name);
    526 
    527   for (i = 0; i < num_entries; ++i)
    528     node_add (&root, entries[i].name, strlen (entries[i].name),
    529 	      entries[i].codepoint);
    530 
    531   nodes = (struct node **) xmalloc (num_nodes * sizeof (struct node *));
    532   i = num_nodes;
    533   num_nodes = 0;
    534   append_nodes (root);
    535   assert (num_nodes == i);
    536   /* Sort node pointers by decreasing string length to handle substrings
    537      right.  */
    538   qsort (nodes, num_nodes, sizeof (struct node *), nodecmp);
    539   if (nodes[0]->key_len >= 64)
    540     /* We could actually encode even 64 and 65, as key_len 0 and 1 will
    541        never appear in the multiple letter key encodings, so could subtract
    542        2.  */
    543     fail ("can't encode key length %d >= 64, so need to split some radix "
    544 	  "tree nodes to ensure length fits", nodes[0]->key_len);
    545 
    546   /* Verify a property charset.cc UAX44-LM2 matching relies on:
    547      if - is at the end of key of some node, then all its siblings
    548      start with alphanumeric characters.
    549      Only 2 character names and 1 alias have - followed by space:
    550      U+0F0A TIBETAN MARK BKA- SHOG YIG MGO
    551      U+0FD0 TIBETAN MARK BKA- SHOG GI MGO RGYAN
    552      U+0FD0 TIBETAN MARK BSKA- SHOG GI MGO RGYAN
    553      so the KA- in there will always be followed at least by SHOG
    554      in the same node.
    555      If this changes, charset.cc needs to change.  */
    556   for (i = 0; i < num_nodes; ++i)
    557     if (nodes[i]->key[nodes[i]->key_len - 1] == '-'
    558 	&& nodes[i]->child)
    559       {
    560 	struct node *n;
    561 
    562 	for (n = nodes[i]->child; n; n = n->sibling)
    563 	  if (n->key[0] == ' ')
    564 	    fail ("node with key %.*s followed by node with key %.*s",
    565 		  (int) nodes[i]->key_len, nodes[i]->key,
    566 		  (int) n->key_len, n->key);
    567       }
    568 
    569   /* This is expensive, O(num_nodes * num_nodes * nodes[0]->key_len), but
    570      fortunately num_nodes is < 64K and key_len < 64.  */
    571   key_idx = 0;
    572   for (i = 0; i < num_nodes; ++i)
    573     {
    574       nodes[i]->key_idx = SIZE_MAX;
    575       nodes[i]->in_dict = false;
    576       if (nodes[i]->key_len > 1)
    577 	{
    578 	  for (j = 0; j < i; ++j)
    579 	    /* Can't rely on memmem unfortunately.  */
    580 	    if (nodes[j]->in_dict)
    581 	      {
    582 		for (k = 0; k <= nodes[j]->key_len - nodes[i]->key_len; ++k)
    583 		  if (nodes[j]->key[k] == nodes[i]->key[0]
    584 		      && memcmp (nodes[j]->key + k + 1, nodes[i]->key + 1,
    585 				 nodes[i]->key_len - 1) == 0)
    586 		    {
    587 		      nodes[i]->key_idx = nodes[j]->key_idx + k;
    588 		      j = i;
    589 		      break;
    590 		    }
    591 		if (j == i)
    592 		  break;
    593 		for (; k < nodes[j]->key_len; ++k)
    594 		  if (nodes[j]->key[k] == nodes[i]->key[0]
    595 		      && memcmp (nodes[j]->key + k + 1, nodes[i]->key + 1,
    596 				 nodes[j]->key_len - 1 - k) == 0)
    597 		    {
    598 		      size_t l;
    599 
    600 		      for (l = j + 1; l < i; ++l)
    601 			if (nodes[l]->in_dict)
    602 			  break;
    603 		      if (l < i
    604 			  && memcmp (nodes[l]->key,
    605 				     nodes[i]->key + (nodes[j]->key_len - k),
    606 				     nodes[i]->key_len
    607 				     - (nodes[j]->key_len - k)) == 0)
    608 			{
    609 			  nodes[i]->key_idx = nodes[j]->key_idx + k;
    610 			  j = i;
    611 			}
    612 		      else
    613 			j = l - 1;
    614 		      break;
    615 		    }
    616 	      }
    617 	  if (nodes[i]->key_idx == SIZE_MAX)
    618 	    {
    619 	      nodes[i]->key_idx = key_idx;
    620 	      nodes[i]->in_dict = true;
    621 	      key_idx += nodes[i]->key_len;
    622 	    }
    623 	}
    624     }
    625   if (key_idx >= 65536)
    626     /* We only use 2 bytes for offsets into the dictionary.
    627        If it grows more, there is e.g. a possibility to replace
    628        most often seen words or substrings in the dictionary
    629        with characters other than [A-Z0-9 -] (say LETTER occurs
    630        in the dictionary almost 197 times and so by using a
    631        instead of LETTER we could save (6 - 1) * 197 bytes,
    632        with some on the side table mapping 'a' to "LETTER".  */
    633     fail ("too large dictionary %ld", (long) key_idx);
    634   dict_size = key_idx;
    635 
    636   size_nodes (root);
    637 
    638   dict = (char *) xmalloc (dict_size + 1);
    639   for (i = 0; i < num_nodes; ++i)
    640     if (nodes[i]->in_dict)
    641       memcpy (dict + nodes[i]->key_idx, nodes[i]->key, nodes[i]->key_len);
    642   dict[dict_size] = '\0';
    643 
    644   tree = (unsigned char *) xmalloc (tree_size);
    645   memset (tree, 0, tree_size);
    646   write_nodes (root, 0);
    647 }
    648 
    649 /* Print out the huge copyright notice.  */
    650 
    651 static void
    652 write_copyright (void)
    653 {
    654   static const char copyright[] = "\
    655 /* Unicode name to codepoint.\n\
    656    Copyright (C) 2005-2024 Free Software Foundation, Inc.\n\
    657 \n\
    658    This program is free software; you can redistribute it and/or modify it\n\
    659    under the terms of the GNU General Public License as published by the\n\
    660    Free Software Foundation; either version 3, or (at your option) any\n\
    661    later version.\n\
    662 \n\
    663    This program is distributed in the hope that it will be useful,\n\
    664    but WITHOUT ANY WARRANTY; without even the implied warranty of\n\
    665    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\n\
    666    GNU General Public License for more details.\n\
    667 \n\
    668    You should have received a copy of the GNU General Public License\n\
    669    along with this program; see the file COPYING3.  If not see\n\
    670    <http://www.gnu.org/licenses/>.\n\
    671 \n\
    672 \n\
    673    Copyright (C) 1991-2023 Unicode, Inc.  All rights reserved.\n\
    674    Distributed under the Terms of Use in\n\
    675    http://www.unicode.org/copyright.html.\n\
    676 \n\
    677    Permission is hereby granted, free of charge, to any person\n\
    678    obtaining a copy of the Unicode data files and any associated\n\
    679    documentation (the \"Data Files\") or Unicode software and any\n\
    680    associated documentation (the \"Software\") to deal in the Data Files\n\
    681    or Software without restriction, including without limitation the\n\
    682    rights to use, copy, modify, merge, publish, distribute, and/or\n\
    683    sell copies of the Data Files or Software, and to permit persons to\n\
    684    whom the Data Files or Software are furnished to do so, provided\n\
    685    that (a) the above copyright notice(s) and this permission notice\n\
    686    appear with all copies of the Data Files or Software, (b) both the\n\
    687    above copyright notice(s) and this permission notice appear in\n\
    688    associated documentation, and (c) there is clear notice in each\n\
    689    modified Data File or in the Software as well as in the\n\
    690    documentation associated with the Data File(s) or Software that the\n\
    691    data or software has been modified.\n\
    692 \n\
    693    THE DATA FILES AND SOFTWARE ARE PROVIDED \"AS IS\", WITHOUT WARRANTY\n\
    694    OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE\n\
    695    WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND\n\
    696    NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE\n\
    697    COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR\n\
    698    ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY\n\
    699    DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,\n\
    700    WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS\n\
    701    ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE\n\
    702    OF THE DATA FILES OR SOFTWARE.\n\
    703 \n\
    704    Except as contained in this notice, the name of a copyright holder\n\
    705    shall not be used in advertising or otherwise to promote the sale,\n\
    706    use or other dealings in these Data Files or Software without prior\n\
    707    written authorization of the copyright holder.  */\n";
    708 
    709    puts (copyright);
    710 }
    711 
    712 static void
    713 write_dict (void)
    714 {
    715   size_t i;
    716 
    717   printf ("static const char uname2c_dict[%ld] =\n", (long) (dict_size + 1));
    718   for (i = 0; i < dict_size; i += 77)
    719     printf ("\"%.77s\"%s\n", dict + i, i + 76 > dict_size ? ";" : "");
    720   puts ("");
    721 }
    722 
    723 static void
    724 write_tree (void)
    725 {
    726   size_t i, j;
    727 
    728   printf ("static const unsigned char uname2c_tree[%ld] = {\n",
    729 	  (long) tree_size);
    730   for (i = 0, j = 0; i < tree_size; ++i)
    731     {
    732       printf ("%s0x%02x%s", j == 0 ? "  " : "", tree[i],
    733 	      i == tree_size - 1 ? " };\n\n" : j == 11 ? ",\n" : ", ");
    734       if (j == 11)
    735 	j = 0;
    736       else
    737 	++j;
    738     }
    739 }
    740 
    741 static void
    742 write_generated (void)
    743 {
    744   size_t i, j;
    745 
    746   puts ("static const cppchar_t uname2c_pairs[] = {");
    747   for (i = 0; i < ARRAY_SIZE (generated_ranges); ++i)
    748     {
    749       if (i == 0)
    750 	;
    751       else if (generated_ranges[i - 1].idx != generated_ranges[i].idx)
    752 	puts (", 0,");
    753       else
    754 	puts (",");
    755       printf ("  0x%lx, 0x%lx /* %s */",
    756 	      generated_ranges[i].low,
    757 	      generated_ranges[i].high,
    758 	      generated_ranges[i].prefix);
    759     }
    760   puts (", 0 };\n");
    761 
    762   puts ("static const unsigned char uname2c_generated[] = {");
    763   for (i = 0, j = -1; i < ARRAY_SIZE (generated_ranges); ++i)
    764     {
    765       if (i == 0 || generated_ranges[i - 1].idx != generated_ranges[i].idx)
    766 	printf ("%s  %d /* %s */", i ? ",\n" : "",
    767 		++j, generated_ranges[i].prefix);
    768       j += 2;
    769     }
    770   puts (" };\n");
    771 }
    772 
    773 /* Main program.  */
    774 
    775 int
    776 main (int argc, char **argv)
    777 {
    778   size_t i;
    779 
    780   if (argc != 3)
    781     fail ("too few arguments to makeradixtree");
    782   for (i = 0; i < ARRAY_SIZE (generated_ranges); ++i)
    783     if (!generated_ranges[i].max_high)
    784       generated_ranges[i].max_high = generated_ranges[i].high;
    785   read_table (argv[1], false);
    786   read_table (argv[2], true);
    787   build_radix_tree ();
    788 
    789   write_copyright ();
    790   write_dict ();
    791   write_tree ();
    792   write_generated ();
    793   printf ("static const unsigned int uname2c_max_name_len = %ld;\n\n", max_entry_len);
    794   return 0;
    795 }
    796