Home | History | Annotate | Line # | Download | only in ucdata
ucgendat.c revision 1.1.1.1.8.2
      1 /* $OpenLDAP: pkg/ldap/libraries/liblunicode/ucdata/ucgendat.c,v 1.39.2.3 2008/02/11 23:26:42 kurt Exp $ */
      2 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
      3  *
      4  * Copyright 1998-2008 The OpenLDAP Foundation.
      5  * All rights reserved.
      6  *
      7  * Redistribution and use in source and binary forms, with or without
      8  * modification, are permitted only as authorized by the OpenLDAP
      9  * Public License.
     10  *
     11  * A copy of this license is available in file LICENSE in the
     12  * top-level directory of the distribution or, alternatively, at
     13  * <http://www.OpenLDAP.org/license.html>.
     14  */
     15 /* Copyright 2001 Computing Research Labs, New Mexico State University
     16  *
     17  * Permission is hereby granted, free of charge, to any person obtaining a
     18  * copy of this software and associated documentation files (the "Software"),
     19  * to deal in the Software without restriction, including without limitation
     20  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
     21  * and/or sell copies of the Software, and to permit persons to whom the
     22  * Software is furnished to do so, subject to the following conditions:
     23  *
     24  * The above copyright notice and this permission notice shall be included in
     25  * all copies or substantial portions of the Software.
     26  *
     27  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     28  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     29  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     30  * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
     31  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
     32  * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
     33  * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
     34  */
     35 /* $Id: ucgendat.c,v 1.1.1.1.8.2 2008/05/22 14:20:37 lukem Exp $" */
     36 
     37 #include "portable.h"
     38 #include "ldap_config.h"
     39 
     40 #include <stdio.h>
     41 #include <ac/ctype.h>
     42 #include <ac/stdlib.h>
     43 #include <ac/string.h>
     44 #include <ac/unistd.h>
     45 
     46 #include <ac/bytes.h>
     47 
     48 #include <lutil.h>
     49 
/*
 * Give HARDCODE_DATA a default value of 1 when the build has not already
 * defined it.
 */
#if !defined(HARDCODE_DATA)
#define HARDCODE_DATA 1
#endif

/*
 * Non-zero when cc is one of the characters 0-9, A-F or a-f.  Any earlier
 * definition of the macro is discarded first.  The argument is evaluated
 * more than once, so it should not have side effects.
 */
#undef ishdigit
#define ishdigit(cc) \
    (((cc) >= '0' && (cc) <= '9') || \
     ((cc) >= 'A' && (cc) <= 'F') || \
     ((cc) >= 'a' && (cc) <= 'f'))
     58 
     59 /*
     60  * A header written to the output file with the byte-order-mark and the number
     61  * of property nodes.
     62  */
     63 static ac_uint2 hdr[2] = {0xfeff, 0};
     64 
/* Number of entries in the props[] name table. */
#define NUMPROPS 50

/*
 * NUMPROPS rounded up to the next multiple of four.  The result is always
 * strictly greater than NUMPROPS, so an exact multiple of four still gains
 * four more.
 */
#define NEEDPROPS (((NUMPROPS) | 3) + 1)
     67 
     68 typedef struct {
     69     char *name;
     70     int len;
     71 } _prop_t;
     72 
     73 /*
     74  * List of properties expected to be found in the Unicode Character Database
     75  * including some implementation specific properties.
     76  *
     77  * The implementation specific properties are:
     78  * Cm = Composed (can be decomposed)
     79  * Nb = Non-breaking
     80  * Sy = Symmetric (has left and right forms)
     81  * Hd = Hex digit
     82  * Qm = Quote marks
     83  * Mr = Mirroring
     84  * Ss = Space, other
     85  * Cp = Defined character
     86  */
     87 static _prop_t props[NUMPROPS] = {
     88     {"Mn", 2}, {"Mc", 2}, {"Me", 2}, {"Nd", 2}, {"Nl", 2}, {"No", 2},
     89     {"Zs", 2}, {"Zl", 2}, {"Zp", 2}, {"Cc", 2}, {"Cf", 2}, {"Cs", 2},
     90     {"Co", 2}, {"Cn", 2}, {"Lu", 2}, {"Ll", 2}, {"Lt", 2}, {"Lm", 2},
     91     {"Lo", 2}, {"Pc", 2}, {"Pd", 2}, {"Ps", 2}, {"Pe", 2}, {"Po", 2},
     92     {"Sm", 2}, {"Sc", 2}, {"Sk", 2}, {"So", 2}, {"L",  1}, {"R",  1},
     93     {"EN", 2}, {"ES", 2}, {"ET", 2}, {"AN", 2}, {"CS", 2}, {"B",  1},
     94     {"S",  1}, {"WS", 2}, {"ON", 2},
     95     {"Cm", 2}, {"Nb", 2}, {"Sy", 2}, {"Hd", 2}, {"Qm", 2}, {"Mr", 2},
     96     {"Ss", 2}, {"Cp", 2}, {"Pi", 2}, {"Pf", 2}, {"AL", 2}
     97 };
     98 
     99 typedef struct {
    100     ac_uint4 *ranges;
    101     ac_uint2 used;
    102     ac_uint2 size;
    103 } _ranges_t;
    104 
    105 static _ranges_t proptbl[NUMPROPS];
    106 
    107 /*
    108  * Make sure this array is sized to be on a 4-byte boundary at compile time.
    109  */
    110 static ac_uint2 propcnt[NEEDPROPS];
    111 
    112 /*
    113  * Array used to collect a decomposition before adding it to the decomposition
    114  * table.
    115  */
    116 static ac_uint4 dectmp[64];
    117 static ac_uint4 dectmp_size;
    118 
    119 typedef struct {
    120     ac_uint4 code;
    121     ac_uint2 size;
    122     ac_uint2 used;
    123     ac_uint4 *decomp;
    124 } _decomp_t;
    125 
    126 /*
    127  * List of decomposition.  Created and expanded in order as the characters are
    128  * encountered. First list contains canonical mappings, second also includes
    129  * compatibility mappings.
    130  */
    131 static _decomp_t *decomps;
    132 static ac_uint4 decomps_used;
    133 static ac_uint4 decomps_size;
    134 
    135 static _decomp_t *kdecomps;
    136 static ac_uint4 kdecomps_used;
    137 static ac_uint4 kdecomps_size;
    138 
    139 /*
    140  * Composition exclusion table stuff.
    141  */
    142 #define COMPEX_SET(c) (compexs[(c) >> 5] |= (1 << ((c) & 31)))
    143 #define COMPEX_TEST(c) (compexs[(c) >> 5] & (1 << ((c) & 31)))
    144 static ac_uint4 compexs[8192];
    145 
    146 /*
    147  * Struct for holding a composition pair, and array of composition pairs
    148  */
    149 typedef struct {
    150     ac_uint4 comp;
    151     ac_uint4 count;
    152     ac_uint4 code1;
    153     ac_uint4 code2;
    154 } _comp_t;
    155 
    156 static _comp_t *comps;
    157 static ac_uint4 comps_used;
    158 
    159 /*
    160  * Types and lists for handling lists of case mappings.
    161  */
    162 typedef struct {
    163     ac_uint4 key;
    164     ac_uint4 other1;
    165     ac_uint4 other2;
    166 } _case_t;
    167 
    168 static _case_t *upper;
    169 static _case_t *lower;
    170 static _case_t *title;
    171 static ac_uint4 upper_used;
    172 static ac_uint4 upper_size;
    173 static ac_uint4 lower_used;
    174 static ac_uint4 lower_size;
    175 static ac_uint4 title_used;
    176 static ac_uint4 title_size;
    177 
    178 /*
    179  * Array used to collect case mappings before adding them to a list.
    180  */
    181 static ac_uint4 cases[3];
    182 
    183 /*
    184  * An array to hold ranges for combining classes.
    185  */
    186 static ac_uint4 *ccl;
    187 static ac_uint4 ccl_used;
    188 static ac_uint4 ccl_size;
    189 
    190 /*
    191  * Structures for handling numbers.
    192  */
    193 typedef struct {
    194     ac_uint4 code;
    195     ac_uint4 idx;
    196 } _codeidx_t;
    197 
    198 typedef struct {
    199     short numerator;
    200     short denominator;
    201 } _num_t;
    202 
    203 /*
    204  * Arrays to hold the mapping of codes to numbers.
    205  */
    206 static _codeidx_t *ncodes;
    207 static ac_uint4 ncodes_used;
    208 static ac_uint4 ncodes_size;
    209 
    210 static _num_t *nums;
    211 static ac_uint4 nums_used;
    212 static ac_uint4 nums_size;
    213 
    214 /*
    215  * Array for holding numbers.
    216  */
    217 static _num_t *nums;
    218 static ac_uint4 nums_used;
    219 static ac_uint4 nums_size;
    220 
    221 static void
    222 add_range(ac_uint4 start, ac_uint4 end, char *p1, char *p2)
    223 {
    224     int i, j, k, len;
    225     _ranges_t *rlp;
    226     char *name;
    227 
    228     for (k = 0; k < 2; k++) {
    229         if (k == 0) {
    230             name = p1;
    231             len = 2;
    232         } else {
    233             if (p2 == 0)
    234               break;
    235 
    236             name = p2;
    237             len = 1;
    238         }
    239 
    240         for (i = 0; i < NUMPROPS; i++) {
    241             if (props[i].len == len && memcmp(props[i].name, name, len) == 0)
    242               break;
    243         }
    244 
    245         if (i == NUMPROPS)
    246           continue;
    247 
    248         rlp = &proptbl[i];
    249 
    250         /*
    251          * Resize the range list if necessary.
    252          */
    253         if (rlp->used == rlp->size) {
    254             if (rlp->size == 0)
    255               rlp->ranges = (ac_uint4 *)
    256                   malloc(sizeof(ac_uint4) << 3);
    257             else
    258               rlp->ranges = (ac_uint4 *)
    259                   realloc((char *) rlp->ranges,
    260                           sizeof(ac_uint4) * (rlp->size + 8));
    261             rlp->size += 8;
    262         }
    263 
    264         /*
    265          * If this is the first code for this property list, just add it
    266          * and return.
    267          */
    268         if (rlp->used == 0) {
    269             rlp->ranges[0] = start;
    270             rlp->ranges[1] = end;
    271             rlp->used += 2;
    272             continue;
    273         }
    274 
    275         /*
    276          * Optimize the case of adding the range to the end.
    277          */
    278         j = rlp->used - 1;
    279         if (start > rlp->ranges[j]) {
    280             j = rlp->used;
    281             rlp->ranges[j++] = start;
    282             rlp->ranges[j++] = end;
    283             rlp->used = j;
    284             continue;
    285         }
    286 
    287         /*
    288          * Need to locate the insertion point.
    289          */
    290         for (i = 0;
    291              i < rlp->used && start > rlp->ranges[i + 1] + 1; i += 2) ;
    292 
    293         /*
    294          * If the start value lies in the current range, then simply set the
    295          * new end point of the range to the end value passed as a parameter.
    296          */
    297         if (rlp->ranges[i] <= start && start <= rlp->ranges[i + 1] + 1) {
    298             rlp->ranges[i + 1] = end;
    299             return;
    300         }
    301 
    302         /*
    303          * Shift following values up by two.
    304          */
    305         for (j = rlp->used; j > i; j -= 2) {
    306             rlp->ranges[j] = rlp->ranges[j - 2];
    307             rlp->ranges[j + 1] = rlp->ranges[j - 1];
    308         }
    309 
    310         /*
    311          * Add the new range at the insertion point.
    312          */
    313         rlp->ranges[i] = start;
    314         rlp->ranges[i + 1] = end;
    315         rlp->used += 2;
    316     }
    317 }
    318 
    319 static void
    320 ordered_range_insert(ac_uint4 c, char *name, int len)
    321 {
    322     int i, j;
    323     ac_uint4 s, e;
    324     _ranges_t *rlp;
    325 
    326     if (len == 0)
    327       return;
    328 
    329     /*
    330      * Deal with directionality codes introduced in Unicode 3.0.
    331      */
    332     if ((len == 2 && memcmp(name, "BN", 2) == 0) ||
    333         (len == 3 &&
    334          (memcmp(name, "NSM", 3) == 0 || memcmp(name, "PDF", 3) == 0 ||
    335           memcmp(name, "LRE", 3) == 0 || memcmp(name, "LRO", 3) == 0 ||
    336           memcmp(name, "RLE", 3) == 0 || memcmp(name, "RLO", 3) == 0))) {
    337         /*
    338          * Mark all of these as Other Neutral to preserve compatibility with
    339          * older versions.
    340          */
    341         len = 2;
    342         name = "ON";
    343     }
    344 
    345     for (i = 0; i < NUMPROPS; i++) {
    346         if (props[i].len == len && memcmp(props[i].name, name, len) == 0)
    347           break;
    348     }
    349 
    350     if (i == NUMPROPS)
    351       return;
    352 
    353     /*
    354      * Have a match, so insert the code in order.
    355      */
    356     rlp = &proptbl[i];
    357 
    358     /*
    359      * Resize the range list if necessary.
    360      */
    361     if (rlp->used == rlp->size) {
    362         if (rlp->size == 0)
    363           rlp->ranges = (ac_uint4 *)
    364               malloc(sizeof(ac_uint4) << 3);
    365         else
    366           rlp->ranges = (ac_uint4 *)
    367               realloc((char *) rlp->ranges,
    368                       sizeof(ac_uint4) * (rlp->size + 8));
    369         rlp->size += 8;
    370     }
    371 
    372     /*
    373      * If this is the first code for this property list, just add it
    374      * and return.
    375      */
    376     if (rlp->used == 0) {
    377         rlp->ranges[0] = rlp->ranges[1] = c;
    378         rlp->used += 2;
    379         return;
    380     }
    381 
    382     /*
    383      * Optimize the cases of extending the last range and adding new ranges to
    384      * the end.
    385      */
    386     j = rlp->used - 1;
    387     e = rlp->ranges[j];
    388     s = rlp->ranges[j - 1];
    389 
    390     if (c == e + 1) {
    391         /*
    392          * Extend the last range.
    393          */
    394         rlp->ranges[j] = c;
    395         return;
    396     }
    397 
    398     if (c > e + 1) {
    399         /*
    400          * Start another range on the end.
    401          */
    402         j = rlp->used;
    403         rlp->ranges[j] = rlp->ranges[j + 1] = c;
    404         rlp->used += 2;
    405         return;
    406     }
    407 
    408     if (c >= s)
    409       /*
    410        * The code is a duplicate of a code in the last range, so just return.
    411        */
    412       return;
    413 
    414     /*
    415      * The code should be inserted somewhere before the last range in the
    416      * list.  Locate the insertion point.
    417      */
    418     for (i = 0;
    419          i < rlp->used && c > rlp->ranges[i + 1] + 1; i += 2) ;
    420 
    421     s = rlp->ranges[i];
    422     e = rlp->ranges[i + 1];
    423 
    424     if (c == e + 1)
    425       /*
    426        * Simply extend the current range.
    427        */
    428       rlp->ranges[i + 1] = c;
    429     else if (c < s) {
    430         /*
    431          * Add a new entry before the current location.  Shift all entries
    432          * before the current one up by one to make room.
    433          */
    434         for (j = rlp->used; j > i; j -= 2) {
    435             rlp->ranges[j] = rlp->ranges[j - 2];
    436             rlp->ranges[j + 1] = rlp->ranges[j - 1];
    437         }
    438         rlp->ranges[i] = rlp->ranges[i + 1] = c;
    439 
    440         rlp->used += 2;
    441     }
    442 }
    443 
    444 static void
    445 add_decomp(ac_uint4 code, short compat)
    446 {
    447     ac_uint4 i, j, size;
    448     _decomp_t **pdecomps;
    449     ac_uint4 *pdecomps_used;
    450     ac_uint4 *pdecomps_size;
    451 
    452     if (compat) {
    453 	pdecomps = &kdecomps;
    454 	pdecomps_used = &kdecomps_used;
    455 	pdecomps_size = &kdecomps_size;
    456     } else {
    457 	pdecomps = &decomps;
    458 	pdecomps_used = &decomps_used;
    459 	pdecomps_size = &decomps_size;
    460     }
    461 
    462     /*
    463      * Add the code to the composite property.
    464      */
    465     if (!compat) {
    466 	ordered_range_insert(code, "Cm", 2);
    467     }
    468 
    469     /*
    470      * Locate the insertion point for the code.
    471      */
    472     for (i = 0; i < *pdecomps_used && code > (*pdecomps)[i].code; i++) ;
    473 
    474     /*
    475      * Allocate space for a new decomposition.
    476      */
    477     if (*pdecomps_used == *pdecomps_size) {
    478         if (*pdecomps_size == 0)
    479           *pdecomps = (_decomp_t *) malloc(sizeof(_decomp_t) << 3);
    480         else
    481           *pdecomps = (_decomp_t *)
    482               realloc((char *) *pdecomps,
    483                       sizeof(_decomp_t) * (*pdecomps_size + 8));
    484         (void) memset((char *) (*pdecomps + *pdecomps_size), '\0',
    485                       sizeof(_decomp_t) << 3);
    486         *pdecomps_size += 8;
    487     }
    488 
    489     if (i < *pdecomps_used && code != (*pdecomps)[i].code) {
    490         /*
    491          * Shift the decomps up by one if the codes don't match.
    492          */
    493         for (j = *pdecomps_used; j > i; j--)
    494           (void) AC_MEMCPY((char *) &(*pdecomps)[j], (char *) &(*pdecomps)[j - 1],
    495                         sizeof(_decomp_t));
    496     }
    497 
    498     /*
    499      * Insert or replace a decomposition.
    500      */
    501     size = dectmp_size + (4 - (dectmp_size & 3));
    502     if ((*pdecomps)[i].size < size) {
    503         if ((*pdecomps)[i].size == 0)
    504           (*pdecomps)[i].decomp = (ac_uint4 *)
    505               malloc(sizeof(ac_uint4) * size);
    506         else
    507           (*pdecomps)[i].decomp = (ac_uint4 *)
    508               realloc((char *) (*pdecomps)[i].decomp,
    509                       sizeof(ac_uint4) * size);
    510         (*pdecomps)[i].size = size;
    511     }
    512 
    513     if ((*pdecomps)[i].code != code)
    514       (*pdecomps_used)++;
    515 
    516     (*pdecomps)[i].code = code;
    517     (*pdecomps)[i].used = dectmp_size;
    518     (void) AC_MEMCPY((char *) (*pdecomps)[i].decomp, (char *) dectmp,
    519                   sizeof(ac_uint4) * dectmp_size);
    520 
    521     /*
    522      * NOTICE: This needs changing later so it is more general than simply
    523      * pairs.  This calculation is done here to simplify allocation elsewhere.
    524      */
    525     if (!compat && dectmp_size == 2)
    526       comps_used++;
    527 }
    528 
    529 static void
    530 add_title(ac_uint4 code)
    531 {
    532     ac_uint4 i, j;
    533 
    534     /*
    535      * Always map the code to itself.
    536      */
    537     cases[2] = code;
    538 
    539     if (title_used == title_size) {
    540         if (title_size == 0)
    541           title = (_case_t *) malloc(sizeof(_case_t) << 3);
    542         else
    543           title = (_case_t *) realloc((char *) title,
    544                                       sizeof(_case_t) * (title_size + 8));
    545         title_size += 8;
    546     }
    547 
    548     /*
    549      * Locate the insertion point.
    550      */
    551     for (i = 0; i < title_used && code > title[i].key; i++) ;
    552 
    553     if (i < title_used) {
    554         /*
    555          * Shift the array up by one.
    556          */
    557         for (j = title_used; j > i; j--)
    558           (void) AC_MEMCPY((char *) &title[j], (char *) &title[j - 1],
    559                         sizeof(_case_t));
    560     }
    561 
    562     title[i].key = cases[2];    /* Title */
    563     title[i].other1 = cases[0]; /* Upper */
    564     title[i].other2 = cases[1]; /* Lower */
    565 
    566     title_used++;
    567 }
    568 
    569 static void
    570 add_upper(ac_uint4 code)
    571 {
    572     ac_uint4 i, j;
    573 
    574     /*
    575      * Always map the code to itself.
    576      */
    577     cases[0] = code;
    578 
    579     /*
    580      * If the title case character is not present, then make it the same as
    581      * the upper case.
    582      */
    583     if (cases[2] == 0)
    584       cases[2] = code;
    585 
    586     if (upper_used == upper_size) {
    587         if (upper_size == 0)
    588           upper = (_case_t *) malloc(sizeof(_case_t) << 3);
    589         else
    590           upper = (_case_t *) realloc((char *) upper,
    591                                       sizeof(_case_t) * (upper_size + 8));
    592         upper_size += 8;
    593     }
    594 
    595     /*
    596      * Locate the insertion point.
    597      */
    598     for (i = 0; i < upper_used && code > upper[i].key; i++) ;
    599 
    600     if (i < upper_used) {
    601         /*
    602          * Shift the array up by one.
    603          */
    604         for (j = upper_used; j > i; j--)
    605           (void) AC_MEMCPY((char *) &upper[j], (char *) &upper[j - 1],
    606                         sizeof(_case_t));
    607     }
    608 
    609     upper[i].key = cases[0];    /* Upper */
    610     upper[i].other1 = cases[1]; /* Lower */
    611     upper[i].other2 = cases[2]; /* Title */
    612 
    613     upper_used++;
    614 }
    615 
    616 static void
    617 add_lower(ac_uint4 code)
    618 {
    619     ac_uint4 i, j;
    620 
    621     /*
    622      * Always map the code to itself.
    623      */
    624     cases[1] = code;
    625 
    626     /*
    627      * If the title case character is empty, then make it the same as the
    628      * upper case.
    629      */
    630     if (cases[2] == 0)
    631       cases[2] = cases[0];
    632 
    633     if (lower_used == lower_size) {
    634         if (lower_size == 0)
    635           lower = (_case_t *) malloc(sizeof(_case_t) << 3);
    636         else
    637           lower = (_case_t *) realloc((char *) lower,
    638                                       sizeof(_case_t) * (lower_size + 8));
    639         lower_size += 8;
    640     }
    641 
    642     /*
    643      * Locate the insertion point.
    644      */
    645     for (i = 0; i < lower_used && code > lower[i].key; i++) ;
    646 
    647     if (i < lower_used) {
    648         /*
    649          * Shift the array up by one.
    650          */
    651         for (j = lower_used; j > i; j--)
    652           (void) AC_MEMCPY((char *) &lower[j], (char *) &lower[j - 1],
    653                         sizeof(_case_t));
    654     }
    655 
    656     lower[i].key = cases[1];    /* Lower */
    657     lower[i].other1 = cases[0]; /* Upper */
    658     lower[i].other2 = cases[2]; /* Title */
    659 
    660     lower_used++;
    661 }
    662 
    663 static void
    664 ordered_ccl_insert(ac_uint4 c, ac_uint4 ccl_code)
    665 {
    666     ac_uint4 i, j;
    667 
    668     if (ccl_used == ccl_size) {
    669         if (ccl_size == 0)
    670           ccl = (ac_uint4 *) malloc(sizeof(ac_uint4) * 24);
    671         else
    672           ccl = (ac_uint4 *)
    673               realloc((char *) ccl, sizeof(ac_uint4) * (ccl_size + 24));
    674         ccl_size += 24;
    675     }
    676 
    677     /*
    678      * Optimize adding the first item.
    679      */
    680     if (ccl_used == 0) {
    681         ccl[0] = ccl[1] = c;
    682         ccl[2] = ccl_code;
    683         ccl_used += 3;
    684         return;
    685     }
    686 
    687     /*
    688      * Handle the special case of extending the range on the end.  This
    689      * requires that the combining class codes are the same.
    690      */
    691     if (ccl_code == ccl[ccl_used - 1] && c == ccl[ccl_used - 2] + 1) {
    692         ccl[ccl_used - 2] = c;
    693         return;
    694     }
    695 
    696     /*
    697      * Handle the special case of adding another range on the end.
    698      */
    699     if (c > ccl[ccl_used - 2] + 1 ||
    700         (c == ccl[ccl_used - 2] + 1 && ccl_code != ccl[ccl_used - 1])) {
    701         ccl[ccl_used++] = c;
    702         ccl[ccl_used++] = c;
    703         ccl[ccl_used++] = ccl_code;
    704         return;
    705     }
    706 
    707     /*
    708      * Locate either the insertion point or range for the code.
    709      */
    710     for (i = 0; i < ccl_used && c > ccl[i + 1] + 1; i += 3) ;
    711 
    712     if (ccl_code == ccl[i + 2] && c == ccl[i + 1] + 1) {
    713         /*
    714          * Extend an existing range.
    715          */
    716         ccl[i + 1] = c;
    717         return;
    718     } else if (c < ccl[i]) {
    719         /*
    720          * Start a new range before the current location.
    721          */
    722         for (j = ccl_used; j > i; j -= 3) {
    723             ccl[j] = ccl[j - 3];
    724             ccl[j - 1] = ccl[j - 4];
    725             ccl[j - 2] = ccl[j - 5];
    726         }
    727         ccl[i] = ccl[i + 1] = c;
    728         ccl[i + 2] = ccl_code;
    729     }
    730 }
    731 
    732 /*
    733  * Adds a number if it does not already exist and returns an index value
    734  * multiplied by 2.
    735  */
    736 static ac_uint4
    737 make_number(short num, short denom)
    738 {
    739     ac_uint4 n;
    740 
    741     /*
    742      * Determine if the number already exists.
    743      */
    744     for (n = 0; n < nums_used; n++) {
    745         if (nums[n].numerator == num && nums[n].denominator == denom)
    746           return n << 1;
    747     }
    748 
    749     if (nums_used == nums_size) {
    750         if (nums_size == 0)
    751           nums = (_num_t *) malloc(sizeof(_num_t) << 3);
    752         else
    753           nums = (_num_t *) realloc((char *) nums,
    754                                     sizeof(_num_t) * (nums_size + 8));
    755         nums_size += 8;
    756     }
    757 
    758     n = nums_used++;
    759     nums[n].numerator = num;
    760     nums[n].denominator = denom;
    761 
    762     return n << 1;
    763 }
    764 
    765 static void
    766 add_number(ac_uint4 code, short num, short denom)
    767 {
    768     ac_uint4 i, j;
    769 
    770     /*
    771      * Insert the code in order.
    772      */
    773     for (i = 0; i < ncodes_used && code > ncodes[i].code; i++) ;
    774 
    775     /*
    776      * Handle the case of the codes matching and simply replace the number
    777      * that was there before.
    778      */
    779     if (i < ncodes_used && code == ncodes[i].code) {
    780         ncodes[i].idx = make_number(num, denom);
    781         return;
    782     }
    783 
    784     /*
    785      * Resize the array if necessary.
    786      */
    787     if (ncodes_used == ncodes_size) {
    788         if (ncodes_size == 0)
    789           ncodes = (_codeidx_t *) malloc(sizeof(_codeidx_t) << 3);
    790         else
    791           ncodes = (_codeidx_t *)
    792               realloc((char *) ncodes, sizeof(_codeidx_t) * (ncodes_size + 8));
    793 
    794         ncodes_size += 8;
    795     }
    796 
    797     /*
    798      * Shift things around to insert the code if necessary.
    799      */
    800     if (i < ncodes_used) {
    801         for (j = ncodes_used; j > i; j--) {
    802             ncodes[j].code = ncodes[j - 1].code;
    803             ncodes[j].idx = ncodes[j - 1].idx;
    804         }
    805     }
    806     ncodes[i].code = code;
    807     ncodes[i].idx = make_number(num, denom);
    808 
    809     ncodes_used++;
    810 }
    811 
    812 /*
    813  * This routine assumes that the line is a valid Unicode Character Database
    814  * entry.
    815  */
    816 static void
    817 read_cdata(FILE *in)
    818 {
    819     ac_uint4 i, lineno, skip, code, ccl_code;
    820     short wnum, neg, number[2], compat;
    821     char line[512], *s, *e;
    822 
    823     lineno = skip = 0;
    824     while (fgets(line, sizeof(line), in)) {
    825 	if( (s=strchr(line, '\n')) ) *s = '\0';
    826         lineno++;
    827 
    828         /*
    829          * Skip blank lines and lines that start with a '#'.
    830          */
    831         if (line[0] == 0 || line[0] == '#')
    832           continue;
    833 
    834         /*
    835          * If lines need to be skipped, do it here.
    836          */
    837         if (skip) {
    838             skip--;
    839             continue;
    840         }
    841 
    842         /*
    843          * Collect the code.  The code can be up to 6 hex digits in length to
    844          * allow surrogates to be specified.
    845          */
    846         for (s = line, i = code = 0; *s != ';' && i < 6; i++, s++) {
    847             code <<= 4;
    848             if (*s >= '0' && *s <= '9')
    849               code += *s - '0';
    850             else if (*s >= 'A' && *s <= 'F')
    851               code += (*s - 'A') + 10;
    852             else if (*s >= 'a' && *s <= 'f')
    853               code += (*s - 'a') + 10;
    854         }
    855 
    856         /*
    857          * Handle the following special cases:
    858          * 1. 4E00-9FA5 CJK Ideographs.
    859          * 2. AC00-D7A3 Hangul Syllables.
    860          * 3. D800-DFFF Surrogates.
    861          * 4. E000-F8FF Private Use Area.
    862          * 5. F900-FA2D Han compatibility.
    863 	 * ...Plus additional ranges in newer Unicode versions...
    864          */
    865         switch (code) {
    866 	  case 0x3400:
    867 	    /* CJK Ideograph Extension A */
    868             add_range(0x3400, 0x4db5, "Lo", "L");
    869 
    870             add_range(0x3400, 0x4db5, "Cp", 0);
    871 
    872 	    skip = 1;
    873 	    break;
    874           case 0x4e00:
    875             /*
    876              * The Han ideographs.
    877              */
    878             add_range(0x4e00, 0x9fff, "Lo", "L");
    879 
    880             /*
    881              * Add the characters to the defined category.
    882              */
    883             add_range(0x4e00, 0x9fa5, "Cp", 0);
    884 
    885             skip = 1;
    886             break;
    887           case 0xac00:
    888             /*
    889              * The Hangul syllables.
    890              */
    891             add_range(0xac00, 0xd7a3, "Lo", "L");
    892 
    893             /*
    894              * Add the characters to the defined category.
    895              */
    896             add_range(0xac00, 0xd7a3, "Cp", 0);
    897 
    898             skip = 1;
    899             break;
    900           case 0xd800:
    901             /*
    902              * Make a range of all surrogates and assume some default
    903              * properties.
    904              */
    905             add_range(0x010000, 0x10ffff, "Cs", "L");
    906             skip = 5;
    907             break;
    908           case 0xe000:
    909             /*
    910              * The Private Use area.  Add with a default set of properties.
    911              */
    912             add_range(0xe000, 0xf8ff, "Co", "L");
    913             skip = 1;
    914             break;
    915           case 0xf900:
    916             /*
    917              * The CJK compatibility area.
    918              */
    919             add_range(0xf900, 0xfaff, "Lo", "L");
    920 
    921             /*
    922              * Add the characters to the defined category.
    923              */
    924             add_range(0xf900, 0xfaff, "Cp", 0);
    925 
    926             skip = 1;
    927 	    break;
    928 	  case 0x20000:
    929 	    /* CJK Ideograph Extension B */
    930             add_range(0x20000, 0x2a6d6, "Lo", "L");
    931 
    932             add_range(0x20000, 0x2a6d6, "Cp", 0);
    933 
    934 	    skip = 1;
    935 	    break;
    936 	  case 0xf0000:
    937 	    /* Plane 15 private use */
    938 	    add_range(0xf0000, 0xffffd, "Co", "L");
    939 	    skip = 1;
    940 	    break;
    941 
    942 	  case 0x100000:
    943 	    /* Plane 16 private use */
    944 	    add_range(0x100000, 0x10fffd, "Co", "L");
    945 	    skip = 1;
    946 	    break;
    947         }
    948 
    949         if (skip)
    950           continue;
    951 
    952         /*
    953          * Add the code to the defined category.
    954          */
    955         ordered_range_insert(code, "Cp", 2);
    956 
    957         /*
    958          * Locate the first character property field.
    959          */
    960         for (i = 0; *s != 0 && i < 2; s++) {
    961             if (*s == ';')
    962               i++;
    963         }
    964         for (e = s; *e && *e != ';'; e++) ;
    965 
    966         ordered_range_insert(code, s, e - s);
    967 
    968         /*
    969          * Locate the combining class code.
    970          */
    971         for (s = e; *s != 0 && i < 3; s++) {
    972             if (*s == ';')
    973               i++;
    974         }
    975 
    976         /*
    977          * Convert the combining class code from decimal.
    978          */
    979         for (ccl_code = 0, e = s; *e && *e != ';'; e++)
    980           ccl_code = (ccl_code * 10) + (*e - '0');
    981 
    982         /*
    983          * Add the code if it not 0.
    984          */
    985         if (ccl_code != 0)
    986           ordered_ccl_insert(code, ccl_code);
    987 
    988         /*
    989          * Locate the second character property field.
    990          */
    991         for (s = e; *s != 0 && i < 4; s++) {
    992             if (*s == ';')
    993               i++;
    994         }
    995         for (e = s; *e && *e != ';'; e++) ;
    996 
    997         ordered_range_insert(code, s, e - s);
    998 
    999         /*
   1000          * Check for a decomposition.
   1001          */
   1002         s = ++e;
   1003         if (*s != ';') {
   1004 	    compat = *s == '<';
   1005 	    if (compat) {
   1006 		/*
   1007 		 * Skip compatibility formatting tag.
   1008 		 */
   1009 		while (*s++ != '>');
   1010 	    }
   1011             /*
   1012              * Collect the codes of the decomposition.
   1013              */
   1014             for (dectmp_size = 0; *s != ';'; ) {
   1015                 /*
   1016                  * Skip all leading non-hex digits.
   1017                  */
   1018                 while (!ishdigit(*s))
   1019  		  s++;
   1020 
   1021                 for (dectmp[dectmp_size] = 0; ishdigit(*s); s++) {
   1022                     dectmp[dectmp_size] <<= 4;
   1023                     if (*s >= '0' && *s <= '9')
   1024                       dectmp[dectmp_size] += *s - '0';
   1025                     else if (*s >= 'A' && *s <= 'F')
   1026                       dectmp[dectmp_size] += (*s - 'A') + 10;
   1027                     else if (*s >= 'a' && *s <= 'f')
   1028                       dectmp[dectmp_size] += (*s - 'a') + 10;
   1029                 }
   1030                 dectmp_size++;
   1031             }
   1032 
   1033             /*
   1034              * If there are any codes in the temporary decomposition array,
   1035              * then add the character with its decomposition.
   1036              */
   1037             if (dectmp_size > 0) {
   1038 		if (!compat) {
   1039 		    add_decomp(code, 0);
   1040 		}
   1041 		add_decomp(code, 1);
   1042 	    }
   1043         }
   1044 
   1045         /*
   1046          * Skip to the number field.
   1047          */
   1048         for (i = 0; i < 3 && *s; s++) {
   1049             if (*s == ';')
   1050               i++;
   1051         }
   1052 
   1053         /*
   1054          * Scan the number in.
   1055          */
   1056         number[0] = number[1] = 0;
   1057         for (e = s, neg = wnum = 0; *e && *e != ';'; e++) {
   1058             if (*e == '-') {
   1059                 neg = 1;
   1060                 continue;
   1061             }
   1062 
   1063             if (*e == '/') {
   1064                 /*
   1065                  * Move the the denominator of the fraction.
   1066                  */
   1067                 if (neg)
   1068                   number[wnum] *= -1;
   1069                 neg = 0;
   1070                 e++;
   1071                 wnum++;
   1072             }
   1073             number[wnum] = (number[wnum] * 10) + (*e - '0');
   1074         }
   1075 
   1076         if (e > s) {
   1077             /*
   1078              * Adjust the denominator in case of integers and add the number.
   1079              */
   1080             if (wnum == 0)
   1081               number[1] = 1;
   1082 
   1083             add_number(code, number[0], number[1]);
   1084         }
   1085 
   1086         /*
   1087          * Skip to the start of the possible case mappings.
   1088          */
   1089         for (s = e, i = 0; i < 4 && *s; s++) {
   1090             if (*s == ';')
   1091               i++;
   1092         }
   1093 
   1094         /*
   1095          * Collect the case mappings.
   1096          */
   1097         cases[0] = cases[1] = cases[2] = 0;
   1098         for (i = 0; i < 3; i++) {
   1099             while (ishdigit(*s)) {
   1100                 cases[i] <<= 4;
   1101                 if (*s >= '0' && *s <= '9')
   1102                   cases[i] += *s - '0';
   1103                 else if (*s >= 'A' && *s <= 'F')
   1104                   cases[i] += (*s - 'A') + 10;
   1105                 else if (*s >= 'a' && *s <= 'f')
   1106                   cases[i] += (*s - 'a') + 10;
   1107                 s++;
   1108             }
   1109             if (*s == ';')
   1110               s++;
   1111         }
   1112         if (cases[0] && cases[1])
   1113           /*
   1114            * Add the upper and lower mappings for a title case character.
   1115            */
   1116           add_title(code);
   1117         else if (cases[1])
   1118           /*
   1119            * Add the lower and title case mappings for the upper case
   1120            * character.
   1121            */
   1122           add_upper(code);
   1123         else if (cases[0])
   1124           /*
   1125            * Add the upper and title case mappings for the lower case
   1126            * character.
   1127            */
   1128           add_lower(code);
   1129     }
   1130 }
   1131 
   1132 static _decomp_t *
   1133 find_decomp(ac_uint4 code, short compat)
   1134 {
   1135     long l, r, m;
   1136     _decomp_t *decs;
   1137 
   1138     l = 0;
   1139     r = (compat ? kdecomps_used : decomps_used) - 1;
   1140     decs = compat ? kdecomps : decomps;
   1141     while (l <= r) {
   1142         m = (l + r) >> 1;
   1143         if (code > decs[m].code)
   1144           l = m + 1;
   1145         else if (code < decs[m].code)
   1146           r = m - 1;
   1147         else
   1148           return &decs[m];
   1149     }
   1150     return 0;
   1151 }
   1152 
   1153 static void
   1154 decomp_it(_decomp_t *d, short compat)
   1155 {
   1156     ac_uint4 i;
   1157     _decomp_t *dp;
   1158 
   1159     for (i = 0; i < d->used; i++) {
   1160         if ((dp = find_decomp(d->decomp[i], compat)) != 0)
   1161           decomp_it(dp, compat);
   1162         else
   1163           dectmp[dectmp_size++] = d->decomp[i];
   1164     }
   1165 }
   1166 
   1167 /*
   1168  * Expand all decompositions by recursively decomposing each character
   1169  * in the decomposition.
   1170  */
   1171 static void
   1172 expand_decomp(void)
   1173 {
   1174     ac_uint4 i;
   1175 
   1176     for (i = 0; i < decomps_used; i++) {
   1177         dectmp_size = 0;
   1178         decomp_it(&decomps[i], 0);
   1179         if (dectmp_size > 0)
   1180           add_decomp(decomps[i].code, 0);
   1181     }
   1182 
   1183     for (i = 0; i < kdecomps_used; i++) {
   1184         dectmp_size = 0;
   1185         decomp_it(&kdecomps[i], 1);
   1186         if (dectmp_size > 0)
   1187           add_decomp(kdecomps[i].code, 1);
   1188     }
   1189 }
   1190 
   1191 static int
   1192 cmpcomps(const void *v_comp1, const void *v_comp2)
   1193 {
   1194 	const _comp_t *comp1 = v_comp1, *comp2 = v_comp2;
   1195     long diff = comp1->code1 - comp2->code1;
   1196 
   1197     if (!diff)
   1198 	diff = comp1->code2 - comp2->code2;
   1199     return (int) diff;
   1200 }
   1201 
   1202 /*
   1203  * Load composition exclusion data
   1204  */
   1205 static void
   1206 read_compexdata(FILE *in)
   1207 {
   1208     ac_uint2 i;
   1209     ac_uint4 code;
   1210     char line[512], *s;
   1211 
   1212     (void) memset((char *) compexs, 0, sizeof(compexs));
   1213 
   1214     while (fgets(line, sizeof(line), in)) {
   1215 	if( (s=strchr(line, '\n')) ) *s = '\0';
   1216         /*
   1217          * Skip blank lines and lines that start with a '#'.
   1218          */
   1219         if (line[0] == 0 || line[0] == '#')
   1220 	    continue;
   1221 
   1222 	/*
   1223          * Collect the code.  Assume max 6 digits
   1224          */
   1225 
   1226 	for (s = line, i = code = 0; *s != '#' && i < 6; i++, s++) {
   1227 	    if (isspace((unsigned char)*s)) break;
   1228             code <<= 4;
   1229             if (*s >= '0' && *s <= '9')
   1230 		code += *s - '0';
   1231             else if (*s >= 'A' && *s <= 'F')
   1232 		code += (*s - 'A') + 10;
   1233             else if (*s >= 'a' && *s <= 'f')
   1234 		code += (*s - 'a') + 10;
   1235         }
   1236         COMPEX_SET(code);
   1237     }
   1238 }
   1239 
   1240 /*
   1241  * Creates array of compositions from decomposition array
   1242  */
   1243 static void
   1244 create_comps(void)
   1245 {
   1246     ac_uint4 i, cu;
   1247 
   1248     comps = (_comp_t *) malloc(comps_used * sizeof(_comp_t));
   1249 
   1250     for (i = cu = 0; i < decomps_used; i++) {
   1251 	if (decomps[i].used != 2 || COMPEX_TEST(decomps[i].code))
   1252 	    continue;
   1253 	comps[cu].comp = decomps[i].code;
   1254 	comps[cu].count = 2;
   1255 	comps[cu].code1 = decomps[i].decomp[0];
   1256 	comps[cu].code2 = decomps[i].decomp[1];
   1257 	cu++;
   1258     }
   1259     comps_used = cu;
   1260     qsort(comps, comps_used, sizeof(_comp_t), cmpcomps);
   1261 }
   1262 
#if HARDCODE_DATA
/*
 * Print `num` entries of the case table `tab` to `out` as C initializer
 * text, three hexadecimal values (key, other1, other2) per line.
 *
 * first - non-zero when nothing has been printed into the enclosing
 *         initializer yet; in that case the separating comma is omitted
 *         before the first entry written here.
 */
static void
write_case(FILE *out, _case_t *tab, int num, int first)
{
    int n = 0;

    while (n < num) {
        _case_t *ent = &tab[n];

        if (!first)
            fprintf(out, ",");
        first = 0;

        fprintf(out, "\n\t0x%08lx, 0x%08lx, 0x%08lx",
                (unsigned long) ent->key, (unsigned long) ent->other1,
                (unsigned long) ent->other2);
        n++;
    }
}

/* Text placed in front of every table declaration written to the header. */
#define PREF "static const "

#endif
   1281 
   1282 static void
   1283 write_cdata(char *opath)
   1284 {
   1285     FILE *out;
   1286 	ac_uint4 bytes;
   1287     ac_uint4 i, idx, nprops;
   1288 #if !(HARDCODE_DATA)
   1289     ac_uint2 casecnt[2];
   1290 #endif
   1291     char path[BUFSIZ];
   1292 #if HARDCODE_DATA
   1293     int j, k;
   1294 
   1295     /*****************************************************************
   1296      *
   1297      * Generate the ctype data.
   1298      *
   1299      *****************************************************************/
   1300 
   1301     /*
   1302      * Open the output file.
   1303      */
   1304     snprintf(path, sizeof path, "%s" LDAP_DIRSEP "uctable.h", opath);
   1305     if ((out = fopen(path, "w")) == 0)
   1306       return;
   1307 #else
   1308     /*
   1309      * Open the ctype.dat file.
   1310      */
   1311     snprintf(path, sizeof path, "%s" LDAP_DIRSEP "ctype.dat", opath);
   1312     if ((out = fopen(path, "wb")) == 0)
   1313       return;
   1314 #endif
   1315 
   1316     /*
   1317      * Collect the offsets for the properties.  The offsets array is
   1318      * on a 4-byte boundary to keep things efficient for architectures
   1319      * that need such a thing.
   1320      */
   1321     for (i = idx = 0; i < NUMPROPS; i++) {
   1322         propcnt[i] = (proptbl[i].used != 0) ? idx : 0xffff;
   1323         idx += proptbl[i].used;
   1324     }
   1325 
   1326     /*
   1327      * Add the sentinel index which is used by the binary search as the upper
   1328      * bound for a search.
   1329      */
   1330     propcnt[i] = idx;
   1331 
   1332     /*
   1333      * Record the actual number of property lists.  This may be different than
   1334      * the number of offsets actually written because of aligning on a 4-byte
   1335      * boundary.
   1336      */
   1337     hdr[1] = NUMPROPS;
   1338 
   1339     /*
   1340      * Calculate the byte count needed and pad the property counts array to a
   1341      * 4-byte boundary.
   1342      */
   1343     if ((bytes = sizeof(ac_uint2) * (NUMPROPS + 1)) & 3)
   1344       bytes += 4 - (bytes & 3);
   1345     nprops = bytes / sizeof(ac_uint2);
   1346     bytes += sizeof(ac_uint4) * idx;
   1347 
   1348 #if HARDCODE_DATA
   1349     fprintf(out, PREF "ac_uint4 _ucprop_size = %d;\n\n", NUMPROPS);
   1350 
   1351     fprintf(out, PREF "ac_uint2 _ucprop_offsets[] = {");
   1352 
   1353     for (i = 0; i<nprops; i++) {
   1354        if (i) fprintf(out, ",");
   1355        if (!(i&7)) fprintf(out, "\n\t");
   1356        else fprintf(out, " ");
   1357        fprintf(out, "0x%04x", propcnt[i]);
   1358     }
   1359     fprintf(out, "\n};\n\n");
   1360 
   1361     fprintf(out, PREF "ac_uint4 _ucprop_ranges[] = {");
   1362 
   1363     k = 0;
   1364     for (i = 0; i < NUMPROPS; i++) {
   1365 	if (proptbl[i].used > 0) {
   1366 	  for (j=0; j<proptbl[i].used; j++) {
   1367 	    if (k) fprintf(out, ",");
   1368 	    if (!(k&3)) fprintf(out,"\n\t");
   1369 	    else fprintf(out, " ");
   1370 	    k++;
   1371 	    fprintf(out, "0x%08lx", (unsigned long) proptbl[i].ranges[j]);
   1372 	  }
   1373 	}
   1374     }
   1375     fprintf(out, "\n};\n\n");
   1376 #else
   1377     /*
   1378      * Write the header.
   1379      */
   1380     fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
   1381 
   1382     /*
   1383      * Write the byte count.
   1384      */
   1385     fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
   1386 
   1387     /*
   1388      * Write the property list counts.
   1389      */
   1390     fwrite((char *) propcnt, sizeof(ac_uint2), nprops, out);
   1391 
   1392     /*
   1393      * Write the property lists.
   1394      */
   1395     for (i = 0; i < NUMPROPS; i++) {
   1396         if (proptbl[i].used > 0)
   1397           fwrite((char *) proptbl[i].ranges, sizeof(ac_uint4),
   1398                  proptbl[i].used, out);
   1399     }
   1400 
   1401     fclose(out);
   1402 #endif
   1403 
   1404     /*****************************************************************
   1405      *
   1406      * Generate the case mapping data.
   1407      *
   1408      *****************************************************************/
   1409 
   1410 #if HARDCODE_DATA
   1411     fprintf(out, PREF "ac_uint4 _uccase_size = %ld;\n\n",
   1412         (long) (upper_used + lower_used + title_used));
   1413 
   1414     fprintf(out, PREF "ac_uint2 _uccase_len[2] = {%ld, %ld};\n\n",
   1415         (long) upper_used, (long) lower_used);
   1416     fprintf(out, PREF "ac_uint4 _uccase_map[] = {");
   1417 
   1418     if (upper_used > 0)
   1419       /*
   1420        * Write the upper case table.
   1421        */
   1422       write_case(out, upper, upper_used, 1);
   1423 
   1424     if (lower_used > 0)
   1425       /*
   1426        * Write the lower case table.
   1427        */
   1428       write_case(out, lower, lower_used, !upper_used);
   1429 
   1430     if (title_used > 0)
   1431       /*
   1432        * Write the title case table.
   1433        */
   1434       write_case(out, title, title_used, !(upper_used||lower_used));
   1435 
   1436     if (!(upper_used || lower_used || title_used))
   1437 	fprintf(out, "\t0");
   1438 
   1439     fprintf(out, "\n};\n\n");
   1440 #else
   1441     /*
   1442      * Open the case.dat file.
   1443      */
   1444     snprintf(path, sizeof path, "%s" LDAP_DIRSEP "case.dat", opath);
   1445     if ((out = fopen(path, "wb")) == 0)
   1446       return;
   1447 
   1448     /*
   1449      * Write the case mapping tables.
   1450      */
   1451     hdr[1] = upper_used + lower_used + title_used;
   1452     casecnt[0] = upper_used;
   1453     casecnt[1] = lower_used;
   1454 
   1455     /*
   1456      * Write the header.
   1457      */
   1458     fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
   1459 
   1460     /*
   1461      * Write the upper and lower case table sizes.
   1462      */
   1463     fwrite((char *) casecnt, sizeof(ac_uint2), 2, out);
   1464 
   1465     if (upper_used > 0)
   1466       /*
   1467        * Write the upper case table.
   1468        */
   1469       fwrite((char *) upper, sizeof(_case_t), upper_used, out);
   1470 
   1471     if (lower_used > 0)
   1472       /*
   1473        * Write the lower case table.
   1474        */
   1475       fwrite((char *) lower, sizeof(_case_t), lower_used, out);
   1476 
   1477     if (title_used > 0)
   1478       /*
   1479        * Write the title case table.
   1480        */
   1481       fwrite((char *) title, sizeof(_case_t), title_used, out);
   1482 
   1483     fclose(out);
   1484 #endif
   1485 
   1486     /*****************************************************************
   1487      *
   1488      * Generate the composition data.
   1489      *
   1490      *****************************************************************/
   1491 
   1492     /*
   1493      * Create compositions from decomposition data
   1494      */
   1495     create_comps();
   1496 
   1497 #if HARDCODE_DATA
   1498     fprintf(out, PREF "ac_uint4 _uccomp_size = %ld;\n\n",
   1499         comps_used * 4L);
   1500 
   1501     fprintf(out, PREF "ac_uint4 _uccomp_data[] = {");
   1502 
   1503      /*
   1504       * Now, if comps exist, write them out.
   1505       */
   1506     if (comps_used > 0) {
   1507 	for (i=0; i<comps_used; i++) {
   1508 	    if (i) fprintf(out, ",");
   1509 	    fprintf(out, "\n\t0x%08lx, 0x%08lx, 0x%08lx, 0x%08lx",
   1510 	        (unsigned long) comps[i].comp, (unsigned long) comps[i].count,
   1511 	        (unsigned long) comps[i].code1, (unsigned long) comps[i].code2);
   1512 	}
   1513     } else {
   1514 	fprintf(out, "\t0");
   1515     }
   1516     fprintf(out, "\n};\n\n");
   1517 #else
   1518     /*
   1519      * Open the comp.dat file.
   1520      */
   1521     snprintf(path, sizeof path, "%s" LDAP_DIRSEP "comp.dat", opath);
   1522     if ((out = fopen(path, "wb")) == 0)
   1523 	return;
   1524 
   1525     /*
   1526      * Write the header.
   1527      */
   1528     hdr[1] = (ac_uint2) comps_used * 4;
   1529     fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
   1530 
   1531     /*
   1532      * Write out the byte count to maintain header size.
   1533      */
   1534     bytes = comps_used * sizeof(_comp_t);
   1535     fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
   1536 
   1537     /*
   1538      * Now, if comps exist, write them out.
   1539      */
   1540     if (comps_used > 0)
   1541         fwrite((char *) comps, sizeof(_comp_t), comps_used, out);
   1542 
   1543     fclose(out);
   1544 #endif
   1545 
   1546     /*****************************************************************
   1547      *
   1548      * Generate the decomposition data.
   1549      *
   1550      *****************************************************************/
   1551 
   1552     /*
   1553      * Fully expand all decompositions before generating the output file.
   1554      */
   1555     expand_decomp();
   1556 
   1557 #if HARDCODE_DATA
   1558     fprintf(out, PREF "ac_uint4 _ucdcmp_size = %ld;\n\n",
   1559         decomps_used * 2L);
   1560 
   1561     fprintf(out, PREF "ac_uint4 _ucdcmp_nodes[] = {");
   1562 
   1563     if (decomps_used) {
   1564 	/*
   1565 	 * Write the list of decomp nodes.
   1566 	 */
   1567 	for (i = idx = 0; i < decomps_used; i++) {
   1568 	    fprintf(out, "\n\t0x%08lx, 0x%08lx,",
   1569 	        (unsigned long) decomps[i].code, (unsigned long) idx);
   1570 	    idx += decomps[i].used;
   1571 	}
   1572 
   1573 	/*
   1574 	 * Write the sentinel index as the last decomp node.
   1575 	 */
   1576 	fprintf(out, "\n\t0x%08lx\n};\n\n", (unsigned long) idx);
   1577 
   1578 	fprintf(out, PREF "ac_uint4 _ucdcmp_decomp[] = {");
   1579 	/*
   1580 	 * Write the decompositions themselves.
   1581 	 */
   1582 	k = 0;
   1583 	for (i = 0; i < decomps_used; i++)
   1584 	  for (j=0; j<decomps[i].used; j++) {
   1585 	    if (k) fprintf(out, ",");
   1586 	    if (!(k&3)) fprintf(out,"\n\t");
   1587 	    else fprintf(out, " ");
   1588 	    k++;
   1589 	    fprintf(out, "0x%08lx", (unsigned long) decomps[i].decomp[j]);
   1590 	  }
   1591 	fprintf(out, "\n};\n\n");
   1592     }
   1593 #else
   1594     /*
   1595      * Open the decomp.dat file.
   1596      */
   1597     snprintf(path, sizeof path, "%s" LDAP_DIRSEP "decomp.dat", opath);
   1598     if ((out = fopen(path, "wb")) == 0)
   1599       return;
   1600 
   1601     hdr[1] = decomps_used;
   1602 
   1603     /*
   1604      * Write the header.
   1605      */
   1606     fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
   1607 
   1608     /*
   1609      * Write a temporary byte count which will be calculated as the
   1610      * decompositions are written out.
   1611      */
   1612     bytes = 0;
   1613     fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
   1614 
   1615     if (decomps_used) {
   1616         /*
   1617          * Write the list of decomp nodes.
   1618          */
   1619         for (i = idx = 0; i < decomps_used; i++) {
   1620             fwrite((char *) &decomps[i].code, sizeof(ac_uint4), 1, out);
   1621             fwrite((char *) &idx, sizeof(ac_uint4), 1, out);
   1622             idx += decomps[i].used;
   1623         }
   1624 
   1625         /*
   1626          * Write the sentinel index as the last decomp node.
   1627          */
   1628         fwrite((char *) &idx, sizeof(ac_uint4), 1, out);
   1629 
   1630         /*
   1631          * Write the decompositions themselves.
   1632          */
   1633         for (i = 0; i < decomps_used; i++)
   1634           fwrite((char *) decomps[i].decomp, sizeof(ac_uint4),
   1635                  decomps[i].used, out);
   1636 
   1637         /*
   1638          * Seek back to the beginning and write the byte count.
   1639          */
   1640         bytes = (sizeof(ac_uint4) * idx) +
   1641             (sizeof(ac_uint4) * ((hdr[1] << 1) + 1));
   1642         fseek(out, sizeof(ac_uint2) << 1, 0L);
   1643         fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
   1644 
   1645         fclose(out);
   1646     }
   1647 #endif
   1648 
   1649 #ifdef HARDCODE_DATA
   1650     fprintf(out, PREF "ac_uint4 _uckdcmp_size = %ld;\n\n",
   1651         kdecomps_used * 2L);
   1652 
   1653     fprintf(out, PREF "ac_uint4 _uckdcmp_nodes[] = {");
   1654 
   1655     if (kdecomps_used) {
   1656 	/*
   1657 	 * Write the list of kdecomp nodes.
   1658 	 */
   1659 	for (i = idx = 0; i < kdecomps_used; i++) {
   1660 	    fprintf(out, "\n\t0x%08lx, 0x%08lx,",
   1661 	        (unsigned long) kdecomps[i].code, (unsigned long) idx);
   1662 	    idx += kdecomps[i].used;
   1663 	}
   1664 
   1665 	/*
   1666 	 * Write the sentinel index as the last decomp node.
   1667 	 */
   1668 	fprintf(out, "\n\t0x%08lx\n};\n\n", (unsigned long) idx);
   1669 
   1670 	fprintf(out, PREF "ac_uint4 _uckdcmp_decomp[] = {");
   1671 
   1672 	/*
   1673 	 * Write the decompositions themselves.
   1674 	 */
   1675 	k = 0;
   1676 	for (i = 0; i < kdecomps_used; i++)
   1677 	  for (j=0; j<kdecomps[i].used; j++) {
   1678 	    if (k) fprintf(out, ",");
   1679 	    if (!(k&3)) fprintf(out,"\n\t");
   1680 	    else fprintf(out, " ");
   1681 	    k++;
   1682 	    fprintf(out, "0x%08lx", (unsigned long) kdecomps[i].decomp[j]);
   1683 	  }
   1684 	fprintf(out, "\n};\n\n");
   1685     }
   1686 #else
   1687     /*
   1688      * Open the kdecomp.dat file.
   1689      */
   1690     snprintf(path, sizeof path, "%s" LDAP_DIRSEP "kdecomp.dat", opath);
   1691     if ((out = fopen(path, "wb")) == 0)
   1692       return;
   1693 
   1694     hdr[1] = kdecomps_used;
   1695 
   1696     /*
   1697      * Write the header.
   1698      */
   1699     fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
   1700 
   1701     /*
   1702      * Write a temporary byte count which will be calculated as the
   1703      * decompositions are written out.
   1704      */
   1705     bytes = 0;
   1706     fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
   1707 
   1708     if (kdecomps_used) {
   1709         /*
   1710          * Write the list of kdecomp nodes.
   1711          */
   1712         for (i = idx = 0; i < kdecomps_used; i++) {
   1713             fwrite((char *) &kdecomps[i].code, sizeof(ac_uint4), 1, out);
   1714             fwrite((char *) &idx, sizeof(ac_uint4), 1, out);
   1715             idx += kdecomps[i].used;
   1716         }
   1717 
   1718         /*
   1719          * Write the sentinel index as the last decomp node.
   1720          */
   1721         fwrite((char *) &idx, sizeof(ac_uint4), 1, out);
   1722 
   1723         /*
   1724          * Write the decompositions themselves.
   1725          */
   1726         for (i = 0; i < kdecomps_used; i++)
   1727           fwrite((char *) kdecomps[i].decomp, sizeof(ac_uint4),
   1728                  kdecomps[i].used, out);
   1729 
   1730         /*
   1731          * Seek back to the beginning and write the byte count.
   1732          */
   1733         bytes = (sizeof(ac_uint4) * idx) +
   1734             (sizeof(ac_uint4) * ((hdr[1] << 1) + 1));
   1735         fseek(out, sizeof(ac_uint2) << 1, 0L);
   1736         fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
   1737 
   1738         fclose(out);
   1739     }
   1740 #endif
   1741 
   1742     /*****************************************************************
   1743      *
   1744      * Generate the combining class data.
   1745      *
   1746      *****************************************************************/
   1747 #ifdef HARDCODE_DATA
   1748     fprintf(out, PREF "ac_uint4 _uccmcl_size = %ld;\n\n", (long) ccl_used);
   1749 
   1750     fprintf(out, PREF "ac_uint4 _uccmcl_nodes[] = {");
   1751 
   1752     if (ccl_used > 0) {
   1753 	/*
   1754 	 * Write the combining class ranges out.
   1755 	 */
   1756 	for (i = 0; i<ccl_used; i++) {
   1757 	    if (i) fprintf(out, ",");
   1758 	    if (!(i&3)) fprintf(out, "\n\t");
   1759 	    else fprintf(out, " ");
   1760 	    fprintf(out, "0x%08lx", (unsigned long) ccl[i]);
   1761 	}
   1762     } else {
   1763 	fprintf(out, "\t0");
   1764     }
   1765     fprintf(out, "\n};\n\n");
   1766 #else
   1767     /*
   1768      * Open the cmbcl.dat file.
   1769      */
   1770     snprintf(path, sizeof path, "%s" LDAP_DIRSEP "cmbcl.dat", opath);
   1771     if ((out = fopen(path, "wb")) == 0)
   1772       return;
   1773 
   1774     /*
   1775      * Set the number of ranges used.  Each range has a combining class which
   1776      * means each entry is a 3-tuple.
   1777      */
   1778     hdr[1] = ccl_used / 3;
   1779 
   1780     /*
   1781      * Write the header.
   1782      */
   1783     fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
   1784 
   1785     /*
   1786      * Write out the byte count to maintain header size.
   1787      */
   1788     bytes = ccl_used * sizeof(ac_uint4);
   1789     fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
   1790 
   1791     if (ccl_used > 0)
   1792       /*
   1793        * Write the combining class ranges out.
   1794        */
   1795       fwrite((char *) ccl, sizeof(ac_uint4), ccl_used, out);
   1796 
   1797     fclose(out);
   1798 #endif
   1799 
   1800     /*****************************************************************
   1801      *
   1802      * Generate the number data.
   1803      *
   1804      *****************************************************************/
   1805 
   1806 #if HARDCODE_DATA
   1807     fprintf(out, PREF "ac_uint4 _ucnum_size = %lu;\n\n",
   1808         (unsigned long)ncodes_used<<1);
   1809 
   1810     fprintf(out, PREF "ac_uint4 _ucnum_nodes[] = {");
   1811 
   1812     /*
   1813      * Now, if number mappings exist, write them out.
   1814      */
   1815     if (ncodes_used > 0) {
   1816 	for (i = 0; i<ncodes_used; i++) {
   1817 	    if (i) fprintf(out, ",");
   1818 	    if (!(i&1)) fprintf(out, "\n\t");
   1819 	    else fprintf(out, " ");
   1820 	    fprintf(out, "0x%08lx, 0x%08lx",
   1821 	        (unsigned long) ncodes[i].code, (unsigned long) ncodes[i].idx);
   1822 	}
   1823 	fprintf(out, "\n};\n\n");
   1824 
   1825 	fprintf(out, PREF "short _ucnum_vals[] = {");
   1826 	for (i = 0; i<nums_used; i++) {
   1827 	    if (i) fprintf(out, ",");
   1828 	    if (!(i&3)) fprintf(out, "\n\t");
   1829 	    else fprintf(out, " ");
   1830 	    if (nums[i].numerator < 0) {
   1831 		fprintf(out, "%6d, 0x%04x",
   1832 		  nums[i].numerator, nums[i].denominator);
   1833 	    } else {
   1834 		fprintf(out, "0x%04x, 0x%04x",
   1835 		  nums[i].numerator, nums[i].denominator);
   1836 	    }
   1837 	}
   1838 	fprintf(out, "\n};\n\n");
   1839     }
   1840 #else
   1841     /*
   1842      * Open the num.dat file.
   1843      */
   1844     snprintf(path, sizeof path, "%s" LDAP_DIRSEP "num.dat", opath);
   1845     if ((out = fopen(path, "wb")) == 0)
   1846       return;
   1847 
   1848     /*
   1849      * The count part of the header will be the total number of codes that
   1850      * have numbers.
   1851      */
   1852     hdr[1] = (ac_uint2) (ncodes_used << 1);
   1853     bytes = (ncodes_used * sizeof(_codeidx_t)) + (nums_used * sizeof(_num_t));
   1854 
   1855     /*
   1856      * Write the header.
   1857      */
   1858     fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
   1859 
   1860     /*
   1861      * Write out the byte count to maintain header size.
   1862      */
   1863     fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
   1864 
   1865     /*
   1866      * Now, if number mappings exist, write them out.
   1867      */
   1868     if (ncodes_used > 0) {
   1869         fwrite((char *) ncodes, sizeof(_codeidx_t), ncodes_used, out);
   1870         fwrite((char *) nums, sizeof(_num_t), nums_used, out);
   1871     }
   1872 #endif
   1873 
   1874     fclose(out);
   1875 }
   1876 
/*
 * Print the command-line synopsis for `prog` on stderr and terminate the
 * program with exit status 1.  This function does not return.
 */
static void
usage(char *prog)
{
    /* Synopsis line; only this piece needs formatting. */
    fprintf(stderr,
            "Usage: %s [-o output-directory|-x composition-exclusions]", prog);
    fputs(" datafile1 datafile2 ...\n\n", stderr);

    /* Description of -o. */
    fputs("-o output-directory\n\t\tWrite the output files to a different",
          stderr);
    fputs(" directory (default: .).\n", stderr);

    /* Description of -x. */
    fputs("-x composition-exclusion\n\t\tFile of composition codes", stderr);
    fputs(" that should be excluded.\n", stderr);

    exit(1);
}
   1891 
/*
 * Program entry point.
 *
 * Arguments are processed left to right:
 *   -o DIR   remember DIR as the output directory (default ".").
 *   -x FILE  load composition exclusions from FILE via read_compexdata().
 *   other    treat the argument as a data file and feed it to read_cdata().
 * Any other option, or -o / -x without its argument, prints the usage text
 * and exits with status 1.
 *
 * A file that cannot be opened is reported on stderr and skipped.  After
 * all arguments have been handled the tables are written with
 * write_cdata().  Returns 0.
 */
int
main(int argc, char *argv[])
{
    FILE *in;
    char *prog, *opath;

    prog = lutil_progname( "ucgendat", argc, argv );

    opath = 0;
    in = stdin;

    argc--;
    argv++;

    while (argc > 0) {
        if (argv[0][0] == '-') {
            switch (argv[0][1]) {
              case 'o':
                /*
                 * The option needs a following argument; without this
                 * check argv[0] would be the terminating null pointer.
                 */
                if (argc < 2)
                  usage(prog);
                argc--;
                argv++;
                opath = argv[0];
                break;
              case 'x':
                if (argc < 2)
                  usage(prog);
                argc--;
                argv++;
                if ((in = fopen(argv[0], "r")) == 0)
                  fprintf(stderr,
                          "%s: unable to open composition exclusion file %s\n",
                          prog, argv[0]);
                else {
                    read_compexdata(in);
                    fclose(in);
                    in = 0;
                }
                break;
              default:
                usage(prog);
            }
        } else {
            /*
             * Release any stream still held before opening the next file.
             */
            if (in != stdin && in != NULL)
              fclose(in);
            if ((in = fopen(argv[0], "r")) == 0)
              fprintf(stderr, "%s: unable to open ctype file %s\n",
                      prog, argv[0]);
            else {
                read_cdata(in);
                fclose(in);
                in = 0;
            }
        }
        argc--;
        argv++;
    }

    if (opath == 0)
      opath = ".";
    write_cdata(opath);

    return 0;
}
   1952