Home | History | Annotate | Line # | Download | only in ucdata
ucgendat.c revision 1.1.1.4
      1 /*	$NetBSD: ucgendat.c,v 1.1.1.4 2014/05/28 09:58:44 tron Exp $	*/
      2 
      3 /* $OpenLDAP$ */
      4 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
      5  *
      6  * Copyright 1998-2014 The OpenLDAP Foundation.
      7  * All rights reserved.
      8  *
      9  * Redistribution and use in source and binary forms, with or without
     10  * modification, are permitted only as authorized by the OpenLDAP
     11  * Public License.
     12  *
     13  * A copy of this license is available in file LICENSE in the
     14  * top-level directory of the distribution or, alternatively, at
     15  * <http://www.OpenLDAP.org/license.html>.
     16  */
     17 /* Copyright 2001 Computing Research Labs, New Mexico State University
     18  *
     19  * Permission is hereby granted, free of charge, to any person obtaining a
     20  * copy of this software and associated documentation files (the "Software"),
     21  * to deal in the Software without restriction, including without limitation
     22  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
     23  * and/or sell copies of the Software, and to permit persons to whom the
     24  * Software is furnished to do so, subject to the following conditions:
     25  *
     26  * The above copyright notice and this permission notice shall be included in
     27  * all copies or substantial portions of the Software.
     28  *
     29  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     30  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     31  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     32  * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
     33  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
     34  * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
     35  * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
     36  */
     37 /* Id: ucgendat.c,v 1.4 2001/01/02 18:46:20 mleisher Exp " */
     38 
     39 #include "portable.h"
     40 #include "ldap_config.h"
     41 
     42 #include <stdio.h>
     43 #include <ac/ctype.h>
     44 #include <ac/stdlib.h>
     45 #include <ac/string.h>
     46 #include <ac/unistd.h>
     47 
     48 #include <ac/bytes.h>
     49 
     50 #include <lutil.h>
     51 
/*
 * HARDCODE_DATA defaults to 1 unless the build has already defined it.
 */
#if !defined(HARDCODE_DATA)
#define HARDCODE_DATA 1
#endif

/*
 * ishdigit(cc) is nonzero when cc is an ASCII hexadecimal digit
 * ('0'-'9', 'A'-'F' or 'a'-'f').  Any previous definition is dropped
 * first.  The argument may be evaluated several times, so it must not
 * have side effects.
 */
#undef ishdigit
#define ishdigit(cc) \
    (((cc) >= '0' && (cc) <= '9') || \
     ((cc) >= 'A' && (cc) <= 'F') || \
     ((cc) >= 'a' && (cc) <= 'f'))
     60 
     61 /*
     62  * A header written to the output file with the byte-order-mark and the number
     63  * of property nodes.
     64  */
     65 static ac_uint2 hdr[2] = {0xfeff, 0};
     66 
/* Number of entries in the props[] table below. */
#define NUMPROPS 50

/* NUMPROPS advanced to the next multiple of four (always adds 1 to 4). */
#define NEEDPROPS (NUMPROPS + (4 - (NUMPROPS & 3)))

/* A property name and the number of characters in it. */
typedef struct {
    char *name;     /* property abbreviation, e.g. "Lu" or "L" */
    int len;        /* length of name in characters (1 or 2 in props[]) */
} _prop_t;

/*
 * Properties expected to be found in the Unicode Character Database, plus
 * some implementation specific ones.  The position of an entry in this
 * table is also its index into proptbl[].
 *
 * The implementation specific properties are:
 *   Cm = Composed (can be decomposed)
 *   Nb = Non-breaking
 *   Sy = Symmetric (has left and right forms)
 *   Hd = Hex digit
 *   Qm = Quote marks
 *   Mr = Mirroring
 *   Ss = Space, other
 *   Cp = Defined character
 */
static _prop_t props[NUMPROPS] = {
    {"Mn", 2}, {"Mc", 2}, {"Me", 2}, {"Nd", 2}, {"Nl", 2},
    {"No", 2}, {"Zs", 2}, {"Zl", 2}, {"Zp", 2}, {"Cc", 2},
    {"Cf", 2}, {"Cs", 2}, {"Co", 2}, {"Cn", 2}, {"Lu", 2},
    {"Ll", 2}, {"Lt", 2}, {"Lm", 2}, {"Lo", 2}, {"Pc", 2},
    {"Pd", 2}, {"Ps", 2}, {"Pe", 2}, {"Po", 2}, {"Sm", 2},
    {"Sc", 2}, {"Sk", 2}, {"So", 2}, {"L",  1}, {"R",  1},
    {"EN", 2}, {"ES", 2}, {"ET", 2}, {"AN", 2}, {"CS", 2},
    {"B",  1}, {"S",  1}, {"WS", 2}, {"ON", 2}, {"Cm", 2},
    {"Nb", 2}, {"Sy", 2}, {"Hd", 2}, {"Qm", 2}, {"Mr", 2},
    {"Ss", 2}, {"Cp", 2}, {"Pi", 2}, {"Pf", 2}, {"AL", 2}
};
    100 
    101 typedef struct {
    102     ac_uint4 *ranges;
    103     ac_uint2 used;
    104     ac_uint2 size;
    105 } _ranges_t;
    106 
    107 static _ranges_t proptbl[NUMPROPS];
    108 
    109 /*
    110  * Make sure this array is sized to be on a 4-byte boundary at compile time.
    111  */
    112 static ac_uint2 propcnt[NEEDPROPS];
    113 
    114 /*
    115  * Array used to collect a decomposition before adding it to the decomposition
    116  * table.
    117  */
    118 static ac_uint4 dectmp[64];
    119 static ac_uint4 dectmp_size;
    120 
    121 typedef struct {
    122     ac_uint4 code;
    123     ac_uint2 size;
    124     ac_uint2 used;
    125     ac_uint4 *decomp;
    126 } _decomp_t;
    127 
    128 /*
    129  * List of decomposition.  Created and expanded in order as the characters are
    130  * encountered. First list contains canonical mappings, second also includes
    131  * compatibility mappings.
    132  */
    133 static _decomp_t *decomps;
    134 static ac_uint4 decomps_used;
    135 static ac_uint4 decomps_size;
    136 
    137 static _decomp_t *kdecomps;
    138 static ac_uint4 kdecomps_used;
    139 static ac_uint4 kdecomps_size;
    140 
    141 /*
    142  * Composition exclusion table stuff.
    143  */
    144 #define COMPEX_SET(c) (compexs[(c) >> 5] |= (1 << ((c) & 31)))
    145 #define COMPEX_TEST(c) (compexs[(c) >> 5] & (1 << ((c) & 31)))
    146 static ac_uint4 compexs[8192];
    147 
    148 /*
    149  * Struct for holding a composition pair, and array of composition pairs
    150  */
    151 typedef struct {
    152     ac_uint4 comp;
    153     ac_uint4 count;
    154     ac_uint4 code1;
    155     ac_uint4 code2;
    156 } _comp_t;
    157 
    158 static _comp_t *comps;
    159 static ac_uint4 comps_used;
    160 
    161 /*
    162  * Types and lists for handling lists of case mappings.
    163  */
    164 typedef struct {
    165     ac_uint4 key;
    166     ac_uint4 other1;
    167     ac_uint4 other2;
    168 } _case_t;
    169 
    170 static _case_t *upper;
    171 static _case_t *lower;
    172 static _case_t *title;
    173 static ac_uint4 upper_used;
    174 static ac_uint4 upper_size;
    175 static ac_uint4 lower_used;
    176 static ac_uint4 lower_size;
    177 static ac_uint4 title_used;
    178 static ac_uint4 title_size;
    179 
    180 /*
    181  * Array used to collect case mappings before adding them to a list.
    182  */
    183 static ac_uint4 cases[3];
    184 
    185 /*
    186  * An array to hold ranges for combining classes.
    187  */
    188 static ac_uint4 *ccl;
    189 static ac_uint4 ccl_used;
    190 static ac_uint4 ccl_size;
    191 
    192 /*
    193  * Structures for handling numbers.
    194  */
    195 typedef struct {
    196     ac_uint4 code;
    197     ac_uint4 idx;
    198 } _codeidx_t;
    199 
    200 typedef struct {
    201     short numerator;
    202     short denominator;
    203 } _num_t;
    204 
    205 /*
    206  * Arrays to hold the mapping of codes to numbers.
    207  */
    208 static _codeidx_t *ncodes;
    209 static ac_uint4 ncodes_used;
    210 static ac_uint4 ncodes_size;
    211 
    212 static _num_t *nums;
    213 static ac_uint4 nums_used;
    214 static ac_uint4 nums_size;
    215 
    216 /*
    217  * Array for holding numbers.
    218  */
    219 static _num_t *nums;
    220 static ac_uint4 nums_used;
    221 static ac_uint4 nums_size;
    222 
    223 static void
    224 add_range(ac_uint4 start, ac_uint4 end, char *p1, char *p2)
    225 {
    226     int i, j, k, len;
    227     _ranges_t *rlp;
    228     char *name;
    229 
    230     for (k = 0; k < 2; k++) {
    231         if (k == 0) {
    232             name = p1;
    233             len = 2;
    234         } else {
    235             if (p2 == 0)
    236               break;
    237 
    238             name = p2;
    239             len = 1;
    240         }
    241 
    242         for (i = 0; i < NUMPROPS; i++) {
    243             if (props[i].len == len && memcmp(props[i].name, name, len) == 0)
    244               break;
    245         }
    246 
    247         if (i == NUMPROPS)
    248           continue;
    249 
    250         rlp = &proptbl[i];
    251 
    252         /*
    253          * Resize the range list if necessary.
    254          */
    255         if (rlp->used == rlp->size) {
    256             if (rlp->size == 0)
    257               rlp->ranges = (ac_uint4 *)
    258                   malloc(sizeof(ac_uint4) << 3);
    259             else
    260               rlp->ranges = (ac_uint4 *)
    261                   realloc((char *) rlp->ranges,
    262                           sizeof(ac_uint4) * (rlp->size + 8));
    263             rlp->size += 8;
    264         }
    265 
    266         /*
    267          * If this is the first code for this property list, just add it
    268          * and return.
    269          */
    270         if (rlp->used == 0) {
    271             rlp->ranges[0] = start;
    272             rlp->ranges[1] = end;
    273             rlp->used += 2;
    274             continue;
    275         }
    276 
    277         /*
    278          * Optimize the case of adding the range to the end.
    279          */
    280         j = rlp->used - 1;
    281         if (start > rlp->ranges[j]) {
    282             j = rlp->used;
    283             rlp->ranges[j++] = start;
    284             rlp->ranges[j++] = end;
    285             rlp->used = j;
    286             continue;
    287         }
    288 
    289         /*
    290          * Need to locate the insertion point.
    291          */
    292         for (i = 0;
    293              i < rlp->used && start > rlp->ranges[i + 1] + 1; i += 2) ;
    294 
    295         /*
    296          * If the start value lies in the current range, then simply set the
    297          * new end point of the range to the end value passed as a parameter.
    298          */
    299         if (rlp->ranges[i] <= start && start <= rlp->ranges[i + 1] + 1) {
    300             rlp->ranges[i + 1] = end;
    301             return;
    302         }
    303 
    304         /*
    305          * Shift following values up by two.
    306          */
    307         for (j = rlp->used; j > i; j -= 2) {
    308             rlp->ranges[j] = rlp->ranges[j - 2];
    309             rlp->ranges[j + 1] = rlp->ranges[j - 1];
    310         }
    311 
    312         /*
    313          * Add the new range at the insertion point.
    314          */
    315         rlp->ranges[i] = start;
    316         rlp->ranges[i + 1] = end;
    317         rlp->used += 2;
    318     }
    319 }
    320 
    321 static void
    322 ordered_range_insert(ac_uint4 c, char *name, int len)
    323 {
    324     int i, j;
    325     ac_uint4 s, e;
    326     _ranges_t *rlp;
    327 
    328     if (len == 0)
    329       return;
    330 
    331     /*
    332      * Deal with directionality codes introduced in Unicode 3.0.
    333      */
    334     if ((len == 2 && memcmp(name, "BN", 2) == 0) ||
    335         (len == 3 &&
    336          (memcmp(name, "NSM", 3) == 0 || memcmp(name, "PDF", 3) == 0 ||
    337           memcmp(name, "LRE", 3) == 0 || memcmp(name, "LRO", 3) == 0 ||
    338           memcmp(name, "RLE", 3) == 0 || memcmp(name, "RLO", 3) == 0))) {
    339         /*
    340          * Mark all of these as Other Neutral to preserve compatibility with
    341          * older versions.
    342          */
    343         len = 2;
    344         name = "ON";
    345     }
    346 
    347     for (i = 0; i < NUMPROPS; i++) {
    348         if (props[i].len == len && memcmp(props[i].name, name, len) == 0)
    349           break;
    350     }
    351 
    352     if (i == NUMPROPS)
    353       return;
    354 
    355     /*
    356      * Have a match, so insert the code in order.
    357      */
    358     rlp = &proptbl[i];
    359 
    360     /*
    361      * Resize the range list if necessary.
    362      */
    363     if (rlp->used == rlp->size) {
    364         if (rlp->size == 0)
    365           rlp->ranges = (ac_uint4 *)
    366               malloc(sizeof(ac_uint4) << 3);
    367         else
    368           rlp->ranges = (ac_uint4 *)
    369               realloc((char *) rlp->ranges,
    370                       sizeof(ac_uint4) * (rlp->size + 8));
    371         rlp->size += 8;
    372     }
    373 
    374     /*
    375      * If this is the first code for this property list, just add it
    376      * and return.
    377      */
    378     if (rlp->used == 0) {
    379         rlp->ranges[0] = rlp->ranges[1] = c;
    380         rlp->used += 2;
    381         return;
    382     }
    383 
    384     /*
    385      * Optimize the cases of extending the last range and adding new ranges to
    386      * the end.
    387      */
    388     j = rlp->used - 1;
    389     e = rlp->ranges[j];
    390     s = rlp->ranges[j - 1];
    391 
    392     if (c == e + 1) {
    393         /*
    394          * Extend the last range.
    395          */
    396         rlp->ranges[j] = c;
    397         return;
    398     }
    399 
    400     if (c > e + 1) {
    401         /*
    402          * Start another range on the end.
    403          */
    404         j = rlp->used;
    405         rlp->ranges[j] = rlp->ranges[j + 1] = c;
    406         rlp->used += 2;
    407         return;
    408     }
    409 
    410     if (c >= s)
    411       /*
    412        * The code is a duplicate of a code in the last range, so just return.
    413        */
    414       return;
    415 
    416     /*
    417      * The code should be inserted somewhere before the last range in the
    418      * list.  Locate the insertion point.
    419      */
    420     for (i = 0;
    421          i < rlp->used && c > rlp->ranges[i + 1] + 1; i += 2) ;
    422 
    423     s = rlp->ranges[i];
    424     e = rlp->ranges[i + 1];
    425 
    426     if (c == e + 1)
    427       /*
    428        * Simply extend the current range.
    429        */
    430       rlp->ranges[i + 1] = c;
    431     else if (c < s) {
    432         /*
    433          * Add a new entry before the current location.  Shift all entries
    434          * before the current one up by one to make room.
    435          */
    436         for (j = rlp->used; j > i; j -= 2) {
    437             rlp->ranges[j] = rlp->ranges[j - 2];
    438             rlp->ranges[j + 1] = rlp->ranges[j - 1];
    439         }
    440         rlp->ranges[i] = rlp->ranges[i + 1] = c;
    441 
    442         rlp->used += 2;
    443     }
    444 }
    445 
    446 static void
    447 add_decomp(ac_uint4 code, short compat)
    448 {
    449     ac_uint4 i, j, size;
    450     _decomp_t **pdecomps;
    451     ac_uint4 *pdecomps_used;
    452     ac_uint4 *pdecomps_size;
    453 
    454     if (compat) {
    455 	pdecomps = &kdecomps;
    456 	pdecomps_used = &kdecomps_used;
    457 	pdecomps_size = &kdecomps_size;
    458     } else {
    459 	pdecomps = &decomps;
    460 	pdecomps_used = &decomps_used;
    461 	pdecomps_size = &decomps_size;
    462     }
    463 
    464     /*
    465      * Add the code to the composite property.
    466      */
    467     if (!compat) {
    468 	ordered_range_insert(code, "Cm", 2);
    469     }
    470 
    471     /*
    472      * Locate the insertion point for the code.
    473      */
    474     for (i = 0; i < *pdecomps_used && code > (*pdecomps)[i].code; i++) ;
    475 
    476     /*
    477      * Allocate space for a new decomposition.
    478      */
    479     if (*pdecomps_used == *pdecomps_size) {
    480         if (*pdecomps_size == 0)
    481           *pdecomps = (_decomp_t *) malloc(sizeof(_decomp_t) << 3);
    482         else
    483           *pdecomps = (_decomp_t *)
    484               realloc((char *) *pdecomps,
    485                       sizeof(_decomp_t) * (*pdecomps_size + 8));
    486         (void) memset((char *) (*pdecomps + *pdecomps_size), '\0',
    487                       sizeof(_decomp_t) << 3);
    488         *pdecomps_size += 8;
    489     }
    490 
    491     if (i < *pdecomps_used && code != (*pdecomps)[i].code) {
    492         /*
    493          * Shift the decomps up by one if the codes don't match.
    494          */
    495         for (j = *pdecomps_used; j > i; j--)
    496           (void) AC_MEMCPY((char *) &(*pdecomps)[j], (char *) &(*pdecomps)[j - 1],
    497                         sizeof(_decomp_t));
    498     }
    499 
    500     /*
    501      * Insert or replace a decomposition.
    502      */
    503     size = dectmp_size + (4 - (dectmp_size & 3));
    504     if ((*pdecomps)[i].size < size) {
    505         if ((*pdecomps)[i].size == 0)
    506           (*pdecomps)[i].decomp = (ac_uint4 *)
    507               malloc(sizeof(ac_uint4) * size);
    508         else
    509           (*pdecomps)[i].decomp = (ac_uint4 *)
    510               realloc((char *) (*pdecomps)[i].decomp,
    511                       sizeof(ac_uint4) * size);
    512         (*pdecomps)[i].size = size;
    513     }
    514 
    515     if ((*pdecomps)[i].code != code)
    516       (*pdecomps_used)++;
    517 
    518     (*pdecomps)[i].code = code;
    519     (*pdecomps)[i].used = dectmp_size;
    520     (void) AC_MEMCPY((char *) (*pdecomps)[i].decomp, (char *) dectmp,
    521                   sizeof(ac_uint4) * dectmp_size);
    522 
    523     /*
    524      * NOTICE: This needs changing later so it is more general than simply
    525      * pairs.  This calculation is done here to simplify allocation elsewhere.
    526      */
    527     if (!compat && dectmp_size == 2)
    528       comps_used++;
    529 }
    530 
    531 static void
    532 add_title(ac_uint4 code)
    533 {
    534     ac_uint4 i, j;
    535 
    536     /*
    537      * Always map the code to itself.
    538      */
    539     cases[2] = code;
    540 
    541     if (title_used == title_size) {
    542         if (title_size == 0)
    543           title = (_case_t *) malloc(sizeof(_case_t) << 3);
    544         else
    545           title = (_case_t *) realloc((char *) title,
    546                                       sizeof(_case_t) * (title_size + 8));
    547         title_size += 8;
    548     }
    549 
    550     /*
    551      * Locate the insertion point.
    552      */
    553     for (i = 0; i < title_used && code > title[i].key; i++) ;
    554 
    555     if (i < title_used) {
    556         /*
    557          * Shift the array up by one.
    558          */
    559         for (j = title_used; j > i; j--)
    560           (void) AC_MEMCPY((char *) &title[j], (char *) &title[j - 1],
    561                         sizeof(_case_t));
    562     }
    563 
    564     title[i].key = cases[2];    /* Title */
    565     title[i].other1 = cases[0]; /* Upper */
    566     title[i].other2 = cases[1]; /* Lower */
    567 
    568     title_used++;
    569 }
    570 
    571 static void
    572 add_upper(ac_uint4 code)
    573 {
    574     ac_uint4 i, j;
    575 
    576     /*
    577      * Always map the code to itself.
    578      */
    579     cases[0] = code;
    580 
    581     /*
    582      * If the title case character is not present, then make it the same as
    583      * the upper case.
    584      */
    585     if (cases[2] == 0)
    586       cases[2] = code;
    587 
    588     if (upper_used == upper_size) {
    589         if (upper_size == 0)
    590           upper = (_case_t *) malloc(sizeof(_case_t) << 3);
    591         else
    592           upper = (_case_t *) realloc((char *) upper,
    593                                       sizeof(_case_t) * (upper_size + 8));
    594         upper_size += 8;
    595     }
    596 
    597     /*
    598      * Locate the insertion point.
    599      */
    600     for (i = 0; i < upper_used && code > upper[i].key; i++) ;
    601 
    602     if (i < upper_used) {
    603         /*
    604          * Shift the array up by one.
    605          */
    606         for (j = upper_used; j > i; j--)
    607           (void) AC_MEMCPY((char *) &upper[j], (char *) &upper[j - 1],
    608                         sizeof(_case_t));
    609     }
    610 
    611     upper[i].key = cases[0];    /* Upper */
    612     upper[i].other1 = cases[1]; /* Lower */
    613     upper[i].other2 = cases[2]; /* Title */
    614 
    615     upper_used++;
    616 }
    617 
    618 static void
    619 add_lower(ac_uint4 code)
    620 {
    621     ac_uint4 i, j;
    622 
    623     /*
    624      * Always map the code to itself.
    625      */
    626     cases[1] = code;
    627 
    628     /*
    629      * If the title case character is empty, then make it the same as the
    630      * upper case.
    631      */
    632     if (cases[2] == 0)
    633       cases[2] = cases[0];
    634 
    635     if (lower_used == lower_size) {
    636         if (lower_size == 0)
    637           lower = (_case_t *) malloc(sizeof(_case_t) << 3);
    638         else
    639           lower = (_case_t *) realloc((char *) lower,
    640                                       sizeof(_case_t) * (lower_size + 8));
    641         lower_size += 8;
    642     }
    643 
    644     /*
    645      * Locate the insertion point.
    646      */
    647     for (i = 0; i < lower_used && code > lower[i].key; i++) ;
    648 
    649     if (i < lower_used) {
    650         /*
    651          * Shift the array up by one.
    652          */
    653         for (j = lower_used; j > i; j--)
    654           (void) AC_MEMCPY((char *) &lower[j], (char *) &lower[j - 1],
    655                         sizeof(_case_t));
    656     }
    657 
    658     lower[i].key = cases[1];    /* Lower */
    659     lower[i].other1 = cases[0]; /* Upper */
    660     lower[i].other2 = cases[2]; /* Title */
    661 
    662     lower_used++;
    663 }
    664 
    665 static void
    666 ordered_ccl_insert(ac_uint4 c, ac_uint4 ccl_code)
    667 {
    668     ac_uint4 i, j;
    669 
    670     if (ccl_used == ccl_size) {
    671         if (ccl_size == 0)
    672           ccl = (ac_uint4 *) malloc(sizeof(ac_uint4) * 24);
    673         else
    674           ccl = (ac_uint4 *)
    675               realloc((char *) ccl, sizeof(ac_uint4) * (ccl_size + 24));
    676         ccl_size += 24;
    677     }
    678 
    679     /*
    680      * Optimize adding the first item.
    681      */
    682     if (ccl_used == 0) {
    683         ccl[0] = ccl[1] = c;
    684         ccl[2] = ccl_code;
    685         ccl_used += 3;
    686         return;
    687     }
    688 
    689     /*
    690      * Handle the special case of extending the range on the end.  This
    691      * requires that the combining class codes are the same.
    692      */
    693     if (ccl_code == ccl[ccl_used - 1] && c == ccl[ccl_used - 2] + 1) {
    694         ccl[ccl_used - 2] = c;
    695         return;
    696     }
    697 
    698     /*
    699      * Handle the special case of adding another range on the end.
    700      */
    701     if (c > ccl[ccl_used - 2] + 1 ||
    702         (c == ccl[ccl_used - 2] + 1 && ccl_code != ccl[ccl_used - 1])) {
    703         ccl[ccl_used++] = c;
    704         ccl[ccl_used++] = c;
    705         ccl[ccl_used++] = ccl_code;
    706         return;
    707     }
    708 
    709     /*
    710      * Locate either the insertion point or range for the code.
    711      */
    712     for (i = 0; i < ccl_used && c > ccl[i + 1] + 1; i += 3) ;
    713 
    714     if (ccl_code == ccl[i + 2] && c == ccl[i + 1] + 1) {
    715         /*
    716          * Extend an existing range.
    717          */
    718         ccl[i + 1] = c;
    719         return;
    720     } else if (c < ccl[i]) {
    721         /*
    722          * Start a new range before the current location.
    723          */
    724         for (j = ccl_used; j > i; j -= 3) {
    725             ccl[j] = ccl[j - 3];
    726             ccl[j - 1] = ccl[j - 4];
    727             ccl[j - 2] = ccl[j - 5];
    728         }
    729         ccl[i] = ccl[i + 1] = c;
    730         ccl[i + 2] = ccl_code;
    731     }
    732 }
    733 
    734 /*
    735  * Adds a number if it does not already exist and returns an index value
    736  * multiplied by 2.
    737  */
    738 static ac_uint4
    739 make_number(short num, short denom)
    740 {
    741     ac_uint4 n;
    742 
    743     /*
    744      * Determine if the number already exists.
    745      */
    746     for (n = 0; n < nums_used; n++) {
    747         if (nums[n].numerator == num && nums[n].denominator == denom)
    748           return n << 1;
    749     }
    750 
    751     if (nums_used == nums_size) {
    752         if (nums_size == 0)
    753           nums = (_num_t *) malloc(sizeof(_num_t) << 3);
    754         else
    755           nums = (_num_t *) realloc((char *) nums,
    756                                     sizeof(_num_t) * (nums_size + 8));
    757         nums_size += 8;
    758     }
    759 
    760     n = nums_used++;
    761     nums[n].numerator = num;
    762     nums[n].denominator = denom;
    763 
    764     return n << 1;
    765 }
    766 
    767 static void
    768 add_number(ac_uint4 code, short num, short denom)
    769 {
    770     ac_uint4 i, j;
    771 
    772     /*
    773      * Insert the code in order.
    774      */
    775     for (i = 0; i < ncodes_used && code > ncodes[i].code; i++) ;
    776 
    777     /*
    778      * Handle the case of the codes matching and simply replace the number
    779      * that was there before.
    780      */
    781     if (i < ncodes_used && code == ncodes[i].code) {
    782         ncodes[i].idx = make_number(num, denom);
    783         return;
    784     }
    785 
    786     /*
    787      * Resize the array if necessary.
    788      */
    789     if (ncodes_used == ncodes_size) {
    790         if (ncodes_size == 0)
    791           ncodes = (_codeidx_t *) malloc(sizeof(_codeidx_t) << 3);
    792         else
    793           ncodes = (_codeidx_t *)
    794               realloc((char *) ncodes, sizeof(_codeidx_t) * (ncodes_size + 8));
    795 
    796         ncodes_size += 8;
    797     }
    798 
    799     /*
    800      * Shift things around to insert the code if necessary.
    801      */
    802     if (i < ncodes_used) {
    803         for (j = ncodes_used; j > i; j--) {
    804             ncodes[j].code = ncodes[j - 1].code;
    805             ncodes[j].idx = ncodes[j - 1].idx;
    806         }
    807     }
    808     ncodes[i].code = code;
    809     ncodes[i].idx = make_number(num, denom);
    810 
    811     ncodes_used++;
    812 }
    813 
    814 /*
    815  * This routine assumes that the line is a valid Unicode Character Database
    816  * entry.
    817  */
    818 static void
    819 read_cdata(FILE *in)
    820 {
    821     ac_uint4 i, lineno, skip, code, ccl_code;
    822     short wnum, neg, number[2], compat;
    823     char line[512], *s, *e;
    824 
    825     lineno = skip = 0;
    826     while (fgets(line, sizeof(line), in)) {
    827 	if( (s=strchr(line, '\n')) ) *s = '\0';
    828         lineno++;
    829 
    830         /*
    831          * Skip blank lines and lines that start with a '#'.
    832          */
    833         if (line[0] == 0 || line[0] == '#')
    834           continue;
    835 
    836         /*
    837          * If lines need to be skipped, do it here.
    838          */
    839         if (skip) {
    840             skip--;
    841             continue;
    842         }
    843 
    844         /*
    845          * Collect the code.  The code can be up to 6 hex digits in length to
    846          * allow surrogates to be specified.
    847          */
    848         for (s = line, i = code = 0; *s != ';' && i < 6; i++, s++) {
    849             code <<= 4;
    850             if (*s >= '0' && *s <= '9')
    851               code += *s - '0';
    852             else if (*s >= 'A' && *s <= 'F')
    853               code += (*s - 'A') + 10;
    854             else if (*s >= 'a' && *s <= 'f')
    855               code += (*s - 'a') + 10;
    856         }
    857 
    858         /*
    859          * Handle the following special cases:
    860          * 1. 4E00-9FA5 CJK Ideographs.
    861          * 2. AC00-D7A3 Hangul Syllables.
    862          * 3. D800-DFFF Surrogates.
    863          * 4. E000-F8FF Private Use Area.
    864          * 5. F900-FA2D Han compatibility.
    865 	 * ...Plus additional ranges in newer Unicode versions...
    866          */
    867         switch (code) {
    868 	  case 0x3400:
    869 	    /* CJK Ideograph Extension A */
    870             add_range(0x3400, 0x4db5, "Lo", "L");
    871 
    872             add_range(0x3400, 0x4db5, "Cp", 0);
    873 
    874 	    skip = 1;
    875 	    break;
    876           case 0x4e00:
    877             /*
    878              * The Han ideographs.
    879              */
    880             add_range(0x4e00, 0x9fff, "Lo", "L");
    881 
    882             /*
    883              * Add the characters to the defined category.
    884              */
    885             add_range(0x4e00, 0x9fa5, "Cp", 0);
    886 
    887             skip = 1;
    888             break;
    889           case 0xac00:
    890             /*
    891              * The Hangul syllables.
    892              */
    893             add_range(0xac00, 0xd7a3, "Lo", "L");
    894 
    895             /*
    896              * Add the characters to the defined category.
    897              */
    898             add_range(0xac00, 0xd7a3, "Cp", 0);
    899 
    900             skip = 1;
    901             break;
    902           case 0xd800:
    903             /*
    904              * Make a range of all surrogates and assume some default
    905              * properties.
    906              */
    907             add_range(0x010000, 0x10ffff, "Cs", "L");
    908             skip = 5;
    909             break;
    910           case 0xe000:
    911             /*
    912              * The Private Use area.  Add with a default set of properties.
    913              */
    914             add_range(0xe000, 0xf8ff, "Co", "L");
    915             skip = 1;
    916             break;
    917           case 0xf900:
    918             /*
    919              * The CJK compatibility area.
    920              */
    921             add_range(0xf900, 0xfaff, "Lo", "L");
    922 
    923             /*
    924              * Add the characters to the defined category.
    925              */
    926             add_range(0xf900, 0xfaff, "Cp", 0);
    927 
    928             skip = 1;
    929 	    break;
    930 	  case 0x20000:
    931 	    /* CJK Ideograph Extension B */
    932             add_range(0x20000, 0x2a6d6, "Lo", "L");
    933 
    934             add_range(0x20000, 0x2a6d6, "Cp", 0);
    935 
    936 	    skip = 1;
    937 	    break;
    938 	  case 0xf0000:
    939 	    /* Plane 15 private use */
    940 	    add_range(0xf0000, 0xffffd, "Co", "L");
    941 	    skip = 1;
    942 	    break;
    943 
    944 	  case 0x100000:
    945 	    /* Plane 16 private use */
    946 	    add_range(0x100000, 0x10fffd, "Co", "L");
    947 	    skip = 1;
    948 	    break;
    949         }
    950 
    951         if (skip)
    952           continue;
    953 
    954         /*
    955          * Add the code to the defined category.
    956          */
    957         ordered_range_insert(code, "Cp", 2);
    958 
    959         /*
    960          * Locate the first character property field.
    961          */
    962         for (i = 0; *s != 0 && i < 2; s++) {
    963             if (*s == ';')
    964               i++;
    965         }
    966         for (e = s; *e && *e != ';'; e++) ;
    967 
    968         ordered_range_insert(code, s, e - s);
    969 
    970         /*
    971          * Locate the combining class code.
    972          */
    973         for (s = e; *s != 0 && i < 3; s++) {
    974             if (*s == ';')
    975               i++;
    976         }
    977 
    978         /*
    979          * Convert the combining class code from decimal.
    980          */
    981         for (ccl_code = 0, e = s; *e && *e != ';'; e++)
    982           ccl_code = (ccl_code * 10) + (*e - '0');
    983 
    984         /*
    985          * Add the code if it not 0.
    986          */
    987         if (ccl_code != 0)
    988           ordered_ccl_insert(code, ccl_code);
    989 
    990         /*
    991          * Locate the second character property field.
    992          */
    993         for (s = e; *s != 0 && i < 4; s++) {
    994             if (*s == ';')
    995               i++;
    996         }
    997         for (e = s; *e && *e != ';'; e++) ;
    998 
    999         ordered_range_insert(code, s, e - s);
   1000 
   1001         /*
   1002          * Check for a decomposition.
   1003          */
   1004         s = ++e;
   1005         if (*s != ';') {
   1006 	    compat = *s == '<';
   1007 	    if (compat) {
   1008 		/*
   1009 		 * Skip compatibility formatting tag.
   1010 		 */
   1011 		while (*s++ != '>');
   1012 	    }
   1013             /*
   1014              * Collect the codes of the decomposition.
   1015              */
   1016             for (dectmp_size = 0; *s != ';'; ) {
   1017                 /*
   1018                  * Skip all leading non-hex digits.
   1019                  */
   1020                 while (!ishdigit(*s))
   1021  		  s++;
   1022 
   1023                 for (dectmp[dectmp_size] = 0; ishdigit(*s); s++) {
   1024                     dectmp[dectmp_size] <<= 4;
   1025                     if (*s >= '0' && *s <= '9')
   1026                       dectmp[dectmp_size] += *s - '0';
   1027                     else if (*s >= 'A' && *s <= 'F')
   1028                       dectmp[dectmp_size] += (*s - 'A') + 10;
   1029                     else if (*s >= 'a' && *s <= 'f')
   1030                       dectmp[dectmp_size] += (*s - 'a') + 10;
   1031                 }
   1032                 dectmp_size++;
   1033             }
   1034 
   1035             /*
   1036              * If there are any codes in the temporary decomposition array,
   1037              * then add the character with its decomposition.
   1038              */
   1039             if (dectmp_size > 0) {
   1040 		if (!compat) {
   1041 		    add_decomp(code, 0);
   1042 		}
   1043 		add_decomp(code, 1);
   1044 	    }
   1045         }
   1046 
   1047         /*
   1048          * Skip to the number field.
   1049          */
   1050         for (i = 0; i < 3 && *s; s++) {
   1051             if (*s == ';')
   1052               i++;
   1053         }
   1054 
   1055         /*
   1056          * Scan the number in.
   1057          */
   1058         number[0] = number[1] = 0;
   1059         for (e = s, neg = wnum = 0; *e && *e != ';'; e++) {
   1060             if (*e == '-') {
   1061                 neg = 1;
   1062                 continue;
   1063             }
   1064 
   1065             if (*e == '/') {
   1066                 /*
   1067                  * Move the the denominator of the fraction.
   1068                  */
   1069                 if (neg)
   1070                   number[wnum] *= -1;
   1071                 neg = 0;
   1072                 e++;
   1073                 wnum++;
   1074             }
   1075             number[wnum] = (number[wnum] * 10) + (*e - '0');
   1076         }
   1077 
   1078         if (e > s) {
   1079             /*
   1080              * Adjust the denominator in case of integers and add the number.
   1081              */
   1082             if (wnum == 0)
   1083               number[1] = 1;
   1084 
   1085             add_number(code, number[0], number[1]);
   1086         }
   1087 
   1088         /*
   1089          * Skip to the start of the possible case mappings.
   1090          */
   1091         for (s = e, i = 0; i < 4 && *s; s++) {
   1092             if (*s == ';')
   1093               i++;
   1094         }
   1095 
   1096         /*
   1097          * Collect the case mappings.
   1098          */
   1099         cases[0] = cases[1] = cases[2] = 0;
   1100         for (i = 0; i < 3; i++) {
   1101             while (ishdigit(*s)) {
   1102                 cases[i] <<= 4;
   1103                 if (*s >= '0' && *s <= '9')
   1104                   cases[i] += *s - '0';
   1105                 else if (*s >= 'A' && *s <= 'F')
   1106                   cases[i] += (*s - 'A') + 10;
   1107                 else if (*s >= 'a' && *s <= 'f')
   1108                   cases[i] += (*s - 'a') + 10;
   1109                 s++;
   1110             }
   1111             if (*s == ';')
   1112               s++;
   1113         }
   1114         if (cases[0] && cases[1])
   1115           /*
   1116            * Add the upper and lower mappings for a title case character.
   1117            */
   1118           add_title(code);
   1119         else if (cases[1])
   1120           /*
   1121            * Add the lower and title case mappings for the upper case
   1122            * character.
   1123            */
   1124           add_upper(code);
   1125         else if (cases[0])
   1126           /*
   1127            * Add the upper and title case mappings for the lower case
   1128            * character.
   1129            */
   1130           add_lower(code);
   1131     }
   1132 }
   1133 
   1134 static _decomp_t *
   1135 find_decomp(ac_uint4 code, short compat)
   1136 {
   1137     long l, r, m;
   1138     _decomp_t *decs;
   1139 
   1140     l = 0;
   1141     r = (compat ? kdecomps_used : decomps_used) - 1;
   1142     decs = compat ? kdecomps : decomps;
   1143     while (l <= r) {
   1144         m = (l + r) >> 1;
   1145         if (code > decs[m].code)
   1146           l = m + 1;
   1147         else if (code < decs[m].code)
   1148           r = m - 1;
   1149         else
   1150           return &decs[m];
   1151     }
   1152     return 0;
   1153 }
   1154 
   1155 static void
   1156 decomp_it(_decomp_t *d, short compat)
   1157 {
   1158     ac_uint4 i;
   1159     _decomp_t *dp;
   1160 
   1161     for (i = 0; i < d->used; i++) {
   1162         if ((dp = find_decomp(d->decomp[i], compat)) != 0)
   1163           decomp_it(dp, compat);
   1164         else
   1165           dectmp[dectmp_size++] = d->decomp[i];
   1166     }
   1167 }
   1168 
   1169 /*
   1170  * Expand all decompositions by recursively decomposing each character
   1171  * in the decomposition.
   1172  */
   1173 static void
   1174 expand_decomp(void)
   1175 {
   1176     ac_uint4 i;
   1177 
   1178     for (i = 0; i < decomps_used; i++) {
   1179         dectmp_size = 0;
   1180         decomp_it(&decomps[i], 0);
   1181         if (dectmp_size > 0)
   1182           add_decomp(decomps[i].code, 0);
   1183     }
   1184 
   1185     for (i = 0; i < kdecomps_used; i++) {
   1186         dectmp_size = 0;
   1187         decomp_it(&kdecomps[i], 1);
   1188         if (dectmp_size > 0)
   1189           add_decomp(kdecomps[i].code, 1);
   1190     }
   1191 }
   1192 
   1193 static int
   1194 cmpcomps(const void *v_comp1, const void *v_comp2)
   1195 {
   1196 	const _comp_t *comp1 = v_comp1, *comp2 = v_comp2;
   1197     long diff = comp1->code1 - comp2->code1;
   1198 
   1199     if (!diff)
   1200 	diff = comp1->code2 - comp2->code2;
   1201     return (int) diff;
   1202 }
   1203 
   1204 /*
   1205  * Load composition exclusion data
   1206  */
   1207 static void
   1208 read_compexdata(FILE *in)
   1209 {
   1210     ac_uint2 i;
   1211     ac_uint4 code;
   1212     char line[512], *s;
   1213 
   1214     (void) memset((char *) compexs, 0, sizeof(compexs));
   1215 
   1216     while (fgets(line, sizeof(line), in)) {
   1217 	if( (s=strchr(line, '\n')) ) *s = '\0';
   1218         /*
   1219          * Skip blank lines and lines that start with a '#'.
   1220          */
   1221         if (line[0] == 0 || line[0] == '#')
   1222 	    continue;
   1223 
   1224 	/*
   1225          * Collect the code.  Assume max 6 digits
   1226          */
   1227 
   1228 	for (s = line, i = code = 0; *s != '#' && i < 6; i++, s++) {
   1229 	    if (isspace((unsigned char)*s)) break;
   1230             code <<= 4;
   1231             if (*s >= '0' && *s <= '9')
   1232 		code += *s - '0';
   1233             else if (*s >= 'A' && *s <= 'F')
   1234 		code += (*s - 'A') + 10;
   1235             else if (*s >= 'a' && *s <= 'f')
   1236 		code += (*s - 'a') + 10;
   1237         }
   1238         COMPEX_SET(code);
   1239     }
   1240 }
   1241 
   1242 /*
   1243  * Creates array of compositions from decomposition array
   1244  */
   1245 static void
   1246 create_comps(void)
   1247 {
   1248     ac_uint4 i, cu;
   1249 
   1250     comps = (_comp_t *) malloc(comps_used * sizeof(_comp_t));
   1251 
   1252     for (i = cu = 0; i < decomps_used; i++) {
   1253 	if (decomps[i].used != 2 || COMPEX_TEST(decomps[i].code))
   1254 	    continue;
   1255 	comps[cu].comp = decomps[i].code;
   1256 	comps[cu].count = 2;
   1257 	comps[cu].code1 = decomps[i].decomp[0];
   1258 	comps[cu].code2 = decomps[i].decomp[1];
   1259 	cu++;
   1260     }
   1261     comps_used = cu;
   1262     qsort(comps, comps_used, sizeof(_comp_t), cmpcomps);
   1263 }
   1264 
   1265 #if HARDCODE_DATA
   1266 static void
   1267 write_case(FILE *out, _case_t *tab, int num, int first)
   1268 {
   1269     int i;
   1270 
   1271     for (i=0; i<num; i++) {
   1272 	if (first) first = 0;
   1273 	else fprintf(out, ",");
   1274 	fprintf(out, "\n\t0x%08lx, 0x%08lx, 0x%08lx",
   1275 		(unsigned long) tab[i].key, (unsigned long) tab[i].other1,
   1276 		(unsigned long) tab[i].other2);
   1277     }
   1278 }
   1279 
   1280 #define PREF "static const "
   1281 
   1282 #endif
   1283 
   1284 static void
   1285 write_cdata(char *opath)
   1286 {
   1287     FILE *out;
   1288 	ac_uint4 bytes;
   1289     ac_uint4 i, idx, nprops;
   1290 #if !(HARDCODE_DATA)
   1291     ac_uint2 casecnt[2];
   1292 #endif
   1293     char path[BUFSIZ];
   1294 #if HARDCODE_DATA
   1295     int j, k;
   1296 
   1297     /*****************************************************************
   1298      *
   1299      * Generate the ctype data.
   1300      *
   1301      *****************************************************************/
   1302 
   1303     /*
   1304      * Open the output file.
   1305      */
   1306     snprintf(path, sizeof path, "%s" LDAP_DIRSEP "uctable.h", opath);
   1307     if ((out = fopen(path, "w")) == 0)
   1308       return;
   1309 #else
   1310     /*
   1311      * Open the ctype.dat file.
   1312      */
   1313     snprintf(path, sizeof path, "%s" LDAP_DIRSEP "ctype.dat", opath);
   1314     if ((out = fopen(path, "wb")) == 0)
   1315       return;
   1316 #endif
   1317 
   1318     /*
   1319      * Collect the offsets for the properties.  The offsets array is
   1320      * on a 4-byte boundary to keep things efficient for architectures
   1321      * that need such a thing.
   1322      */
   1323     for (i = idx = 0; i < NUMPROPS; i++) {
   1324         propcnt[i] = (proptbl[i].used != 0) ? idx : 0xffff;
   1325         idx += proptbl[i].used;
   1326     }
   1327 
   1328     /*
   1329      * Add the sentinel index which is used by the binary search as the upper
   1330      * bound for a search.
   1331      */
   1332     propcnt[i] = idx;
   1333 
   1334     /*
   1335      * Record the actual number of property lists.  This may be different than
   1336      * the number of offsets actually written because of aligning on a 4-byte
   1337      * boundary.
   1338      */
   1339     hdr[1] = NUMPROPS;
   1340 
   1341     /*
   1342      * Calculate the byte count needed and pad the property counts array to a
   1343      * 4-byte boundary.
   1344      */
   1345     if ((bytes = sizeof(ac_uint2) * (NUMPROPS + 1)) & 3)
   1346       bytes += 4 - (bytes & 3);
   1347     nprops = bytes / sizeof(ac_uint2);
   1348     bytes += sizeof(ac_uint4) * idx;
   1349 
   1350 #if HARDCODE_DATA
   1351     fprintf(out, PREF "ac_uint4 _ucprop_size = %d;\n\n", NUMPROPS);
   1352 
   1353     fprintf(out, PREF "ac_uint2 _ucprop_offsets[] = {");
   1354 
   1355     for (i = 0; i<nprops; i++) {
   1356        if (i) fprintf(out, ",");
   1357        if (!(i&7)) fprintf(out, "\n\t");
   1358        else fprintf(out, " ");
   1359        fprintf(out, "0x%04x", propcnt[i]);
   1360     }
   1361     fprintf(out, "\n};\n\n");
   1362 
   1363     fprintf(out, PREF "ac_uint4 _ucprop_ranges[] = {");
   1364 
   1365     k = 0;
   1366     for (i = 0; i < NUMPROPS; i++) {
   1367 	if (proptbl[i].used > 0) {
   1368 	  for (j=0; j<proptbl[i].used; j++) {
   1369 	    if (k) fprintf(out, ",");
   1370 	    if (!(k&3)) fprintf(out,"\n\t");
   1371 	    else fprintf(out, " ");
   1372 	    k++;
   1373 	    fprintf(out, "0x%08lx", (unsigned long) proptbl[i].ranges[j]);
   1374 	  }
   1375 	}
   1376     }
   1377     fprintf(out, "\n};\n\n");
   1378 #else
   1379     /*
   1380      * Write the header.
   1381      */
   1382     fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
   1383 
   1384     /*
   1385      * Write the byte count.
   1386      */
   1387     fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
   1388 
   1389     /*
   1390      * Write the property list counts.
   1391      */
   1392     fwrite((char *) propcnt, sizeof(ac_uint2), nprops, out);
   1393 
   1394     /*
   1395      * Write the property lists.
   1396      */
   1397     for (i = 0; i < NUMPROPS; i++) {
   1398         if (proptbl[i].used > 0)
   1399           fwrite((char *) proptbl[i].ranges, sizeof(ac_uint4),
   1400                  proptbl[i].used, out);
   1401     }
   1402 
   1403     fclose(out);
   1404 #endif
   1405 
   1406     /*****************************************************************
   1407      *
   1408      * Generate the case mapping data.
   1409      *
   1410      *****************************************************************/
   1411 
   1412 #if HARDCODE_DATA
   1413     fprintf(out, PREF "ac_uint4 _uccase_size = %ld;\n\n",
   1414         (long) (upper_used + lower_used + title_used));
   1415 
   1416     fprintf(out, PREF "ac_uint2 _uccase_len[2] = {%ld, %ld};\n\n",
   1417         (long) upper_used, (long) lower_used);
   1418     fprintf(out, PREF "ac_uint4 _uccase_map[] = {");
   1419 
   1420     if (upper_used > 0)
   1421       /*
   1422        * Write the upper case table.
   1423        */
   1424       write_case(out, upper, upper_used, 1);
   1425 
   1426     if (lower_used > 0)
   1427       /*
   1428        * Write the lower case table.
   1429        */
   1430       write_case(out, lower, lower_used, !upper_used);
   1431 
   1432     if (title_used > 0)
   1433       /*
   1434        * Write the title case table.
   1435        */
   1436       write_case(out, title, title_used, !(upper_used||lower_used));
   1437 
   1438     if (!(upper_used || lower_used || title_used))
   1439 	fprintf(out, "\t0");
   1440 
   1441     fprintf(out, "\n};\n\n");
   1442 #else
   1443     /*
   1444      * Open the case.dat file.
   1445      */
   1446     snprintf(path, sizeof path, "%s" LDAP_DIRSEP "case.dat", opath);
   1447     if ((out = fopen(path, "wb")) == 0)
   1448       return;
   1449 
   1450     /*
   1451      * Write the case mapping tables.
   1452      */
   1453     hdr[1] = upper_used + lower_used + title_used;
   1454     casecnt[0] = upper_used;
   1455     casecnt[1] = lower_used;
   1456 
   1457     /*
   1458      * Write the header.
   1459      */
   1460     fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
   1461 
   1462     /*
   1463      * Write the upper and lower case table sizes.
   1464      */
   1465     fwrite((char *) casecnt, sizeof(ac_uint2), 2, out);
   1466 
   1467     if (upper_used > 0)
   1468       /*
   1469        * Write the upper case table.
   1470        */
   1471       fwrite((char *) upper, sizeof(_case_t), upper_used, out);
   1472 
   1473     if (lower_used > 0)
   1474       /*
   1475        * Write the lower case table.
   1476        */
   1477       fwrite((char *) lower, sizeof(_case_t), lower_used, out);
   1478 
   1479     if (title_used > 0)
   1480       /*
   1481        * Write the title case table.
   1482        */
   1483       fwrite((char *) title, sizeof(_case_t), title_used, out);
   1484 
   1485     fclose(out);
   1486 #endif
   1487 
   1488     /*****************************************************************
   1489      *
   1490      * Generate the composition data.
   1491      *
   1492      *****************************************************************/
   1493 
   1494     /*
   1495      * Create compositions from decomposition data
   1496      */
   1497     create_comps();
   1498 
   1499 #if HARDCODE_DATA
   1500     fprintf(out, PREF "ac_uint4 _uccomp_size = %ld;\n\n",
   1501         comps_used * 4L);
   1502 
   1503     fprintf(out, PREF "ac_uint4 _uccomp_data[] = {");
   1504 
   1505      /*
   1506       * Now, if comps exist, write them out.
   1507       */
   1508     if (comps_used > 0) {
   1509 	for (i=0; i<comps_used; i++) {
   1510 	    if (i) fprintf(out, ",");
   1511 	    fprintf(out, "\n\t0x%08lx, 0x%08lx, 0x%08lx, 0x%08lx",
   1512 	        (unsigned long) comps[i].comp, (unsigned long) comps[i].count,
   1513 	        (unsigned long) comps[i].code1, (unsigned long) comps[i].code2);
   1514 	}
   1515     } else {
   1516 	fprintf(out, "\t0");
   1517     }
   1518     fprintf(out, "\n};\n\n");
   1519 #else
   1520     /*
   1521      * Open the comp.dat file.
   1522      */
   1523     snprintf(path, sizeof path, "%s" LDAP_DIRSEP "comp.dat", opath);
   1524     if ((out = fopen(path, "wb")) == 0)
   1525 	return;
   1526 
   1527     /*
   1528      * Write the header.
   1529      */
   1530     hdr[1] = (ac_uint2) comps_used * 4;
   1531     fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
   1532 
   1533     /*
   1534      * Write out the byte count to maintain header size.
   1535      */
   1536     bytes = comps_used * sizeof(_comp_t);
   1537     fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
   1538 
   1539     /*
   1540      * Now, if comps exist, write them out.
   1541      */
   1542     if (comps_used > 0)
   1543         fwrite((char *) comps, sizeof(_comp_t), comps_used, out);
   1544 
   1545     fclose(out);
   1546 #endif
   1547 
   1548     /*****************************************************************
   1549      *
   1550      * Generate the decomposition data.
   1551      *
   1552      *****************************************************************/
   1553 
   1554     /*
   1555      * Fully expand all decompositions before generating the output file.
   1556      */
   1557     expand_decomp();
   1558 
   1559 #if HARDCODE_DATA
   1560     fprintf(out, PREF "ac_uint4 _ucdcmp_size = %ld;\n\n",
   1561         decomps_used * 2L);
   1562 
   1563     fprintf(out, PREF "ac_uint4 _ucdcmp_nodes[] = {");
   1564 
   1565     if (decomps_used) {
   1566 	/*
   1567 	 * Write the list of decomp nodes.
   1568 	 */
   1569 	for (i = idx = 0; i < decomps_used; i++) {
   1570 	    fprintf(out, "\n\t0x%08lx, 0x%08lx,",
   1571 	        (unsigned long) decomps[i].code, (unsigned long) idx);
   1572 	    idx += decomps[i].used;
   1573 	}
   1574 
   1575 	/*
   1576 	 * Write the sentinel index as the last decomp node.
   1577 	 */
   1578 	fprintf(out, "\n\t0x%08lx\n};\n\n", (unsigned long) idx);
   1579 
   1580 	fprintf(out, PREF "ac_uint4 _ucdcmp_decomp[] = {");
   1581 	/*
   1582 	 * Write the decompositions themselves.
   1583 	 */
   1584 	k = 0;
   1585 	for (i = 0; i < decomps_used; i++)
   1586 	  for (j=0; j<decomps[i].used; j++) {
   1587 	    if (k) fprintf(out, ",");
   1588 	    if (!(k&3)) fprintf(out,"\n\t");
   1589 	    else fprintf(out, " ");
   1590 	    k++;
   1591 	    fprintf(out, "0x%08lx", (unsigned long) decomps[i].decomp[j]);
   1592 	  }
   1593 	fprintf(out, "\n};\n\n");
   1594     }
   1595 #else
   1596     /*
   1597      * Open the decomp.dat file.
   1598      */
   1599     snprintf(path, sizeof path, "%s" LDAP_DIRSEP "decomp.dat", opath);
   1600     if ((out = fopen(path, "wb")) == 0)
   1601       return;
   1602 
   1603     hdr[1] = decomps_used;
   1604 
   1605     /*
   1606      * Write the header.
   1607      */
   1608     fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
   1609 
   1610     /*
   1611      * Write a temporary byte count which will be calculated as the
   1612      * decompositions are written out.
   1613      */
   1614     bytes = 0;
   1615     fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
   1616 
   1617     if (decomps_used) {
   1618         /*
   1619          * Write the list of decomp nodes.
   1620          */
   1621         for (i = idx = 0; i < decomps_used; i++) {
   1622             fwrite((char *) &decomps[i].code, sizeof(ac_uint4), 1, out);
   1623             fwrite((char *) &idx, sizeof(ac_uint4), 1, out);
   1624             idx += decomps[i].used;
   1625         }
   1626 
   1627         /*
   1628          * Write the sentinel index as the last decomp node.
   1629          */
   1630         fwrite((char *) &idx, sizeof(ac_uint4), 1, out);
   1631 
   1632         /*
   1633          * Write the decompositions themselves.
   1634          */
   1635         for (i = 0; i < decomps_used; i++)
   1636           fwrite((char *) decomps[i].decomp, sizeof(ac_uint4),
   1637                  decomps[i].used, out);
   1638 
   1639         /*
   1640          * Seek back to the beginning and write the byte count.
   1641          */
   1642         bytes = (sizeof(ac_uint4) * idx) +
   1643             (sizeof(ac_uint4) * ((hdr[1] << 1) + 1));
   1644         fseek(out, sizeof(ac_uint2) << 1, 0L);
   1645         fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
   1646 
   1647         fclose(out);
   1648     }
   1649 #endif
   1650 
   1651 #ifdef HARDCODE_DATA
   1652     fprintf(out, PREF "ac_uint4 _uckdcmp_size = %ld;\n\n",
   1653         kdecomps_used * 2L);
   1654 
   1655     fprintf(out, PREF "ac_uint4 _uckdcmp_nodes[] = {");
   1656 
   1657     if (kdecomps_used) {
   1658 	/*
   1659 	 * Write the list of kdecomp nodes.
   1660 	 */
   1661 	for (i = idx = 0; i < kdecomps_used; i++) {
   1662 	    fprintf(out, "\n\t0x%08lx, 0x%08lx,",
   1663 	        (unsigned long) kdecomps[i].code, (unsigned long) idx);
   1664 	    idx += kdecomps[i].used;
   1665 	}
   1666 
   1667 	/*
   1668 	 * Write the sentinel index as the last decomp node.
   1669 	 */
   1670 	fprintf(out, "\n\t0x%08lx\n};\n\n", (unsigned long) idx);
   1671 
   1672 	fprintf(out, PREF "ac_uint4 _uckdcmp_decomp[] = {");
   1673 
   1674 	/*
   1675 	 * Write the decompositions themselves.
   1676 	 */
   1677 	k = 0;
   1678 	for (i = 0; i < kdecomps_used; i++)
   1679 	  for (j=0; j<kdecomps[i].used; j++) {
   1680 	    if (k) fprintf(out, ",");
   1681 	    if (!(k&3)) fprintf(out,"\n\t");
   1682 	    else fprintf(out, " ");
   1683 	    k++;
   1684 	    fprintf(out, "0x%08lx", (unsigned long) kdecomps[i].decomp[j]);
   1685 	  }
   1686 	fprintf(out, "\n};\n\n");
   1687     }
   1688 #else
   1689     /*
   1690      * Open the kdecomp.dat file.
   1691      */
   1692     snprintf(path, sizeof path, "%s" LDAP_DIRSEP "kdecomp.dat", opath);
   1693     if ((out = fopen(path, "wb")) == 0)
   1694       return;
   1695 
   1696     hdr[1] = kdecomps_used;
   1697 
   1698     /*
   1699      * Write the header.
   1700      */
   1701     fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
   1702 
   1703     /*
   1704      * Write a temporary byte count which will be calculated as the
   1705      * decompositions are written out.
   1706      */
   1707     bytes = 0;
   1708     fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
   1709 
   1710     if (kdecomps_used) {
   1711         /*
   1712          * Write the list of kdecomp nodes.
   1713          */
   1714         for (i = idx = 0; i < kdecomps_used; i++) {
   1715             fwrite((char *) &kdecomps[i].code, sizeof(ac_uint4), 1, out);
   1716             fwrite((char *) &idx, sizeof(ac_uint4), 1, out);
   1717             idx += kdecomps[i].used;
   1718         }
   1719 
   1720         /*
   1721          * Write the sentinel index as the last decomp node.
   1722          */
   1723         fwrite((char *) &idx, sizeof(ac_uint4), 1, out);
   1724 
   1725         /*
   1726          * Write the decompositions themselves.
   1727          */
   1728         for (i = 0; i < kdecomps_used; i++)
   1729           fwrite((char *) kdecomps[i].decomp, sizeof(ac_uint4),
   1730                  kdecomps[i].used, out);
   1731 
   1732         /*
   1733          * Seek back to the beginning and write the byte count.
   1734          */
   1735         bytes = (sizeof(ac_uint4) * idx) +
   1736             (sizeof(ac_uint4) * ((hdr[1] << 1) + 1));
   1737         fseek(out, sizeof(ac_uint2) << 1, 0L);
   1738         fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
   1739 
   1740         fclose(out);
   1741     }
   1742 #endif
   1743 
   1744     /*****************************************************************
   1745      *
   1746      * Generate the combining class data.
   1747      *
   1748      *****************************************************************/
   1749 #ifdef HARDCODE_DATA
   1750     fprintf(out, PREF "ac_uint4 _uccmcl_size = %ld;\n\n", (long) ccl_used);
   1751 
   1752     fprintf(out, PREF "ac_uint4 _uccmcl_nodes[] = {");
   1753 
   1754     if (ccl_used > 0) {
   1755 	/*
   1756 	 * Write the combining class ranges out.
   1757 	 */
   1758 	for (i = 0; i<ccl_used; i++) {
   1759 	    if (i) fprintf(out, ",");
   1760 	    if (!(i&3)) fprintf(out, "\n\t");
   1761 	    else fprintf(out, " ");
   1762 	    fprintf(out, "0x%08lx", (unsigned long) ccl[i]);
   1763 	}
   1764     } else {
   1765 	fprintf(out, "\t0");
   1766     }
   1767     fprintf(out, "\n};\n\n");
   1768 #else
   1769     /*
   1770      * Open the cmbcl.dat file.
   1771      */
   1772     snprintf(path, sizeof path, "%s" LDAP_DIRSEP "cmbcl.dat", opath);
   1773     if ((out = fopen(path, "wb")) == 0)
   1774       return;
   1775 
   1776     /*
   1777      * Set the number of ranges used.  Each range has a combining class which
   1778      * means each entry is a 3-tuple.
   1779      */
   1780     hdr[1] = ccl_used / 3;
   1781 
   1782     /*
   1783      * Write the header.
   1784      */
   1785     fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
   1786 
   1787     /*
   1788      * Write out the byte count to maintain header size.
   1789      */
   1790     bytes = ccl_used * sizeof(ac_uint4);
   1791     fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
   1792 
   1793     if (ccl_used > 0)
   1794       /*
   1795        * Write the combining class ranges out.
   1796        */
   1797       fwrite((char *) ccl, sizeof(ac_uint4), ccl_used, out);
   1798 
   1799     fclose(out);
   1800 #endif
   1801 
   1802     /*****************************************************************
   1803      *
   1804      * Generate the number data.
   1805      *
   1806      *****************************************************************/
   1807 
   1808 #if HARDCODE_DATA
   1809     fprintf(out, PREF "ac_uint4 _ucnum_size = %lu;\n\n",
   1810         (unsigned long)ncodes_used<<1);
   1811 
   1812     fprintf(out, PREF "ac_uint4 _ucnum_nodes[] = {");
   1813 
   1814     /*
   1815      * Now, if number mappings exist, write them out.
   1816      */
   1817     if (ncodes_used > 0) {
   1818 	for (i = 0; i<ncodes_used; i++) {
   1819 	    if (i) fprintf(out, ",");
   1820 	    if (!(i&1)) fprintf(out, "\n\t");
   1821 	    else fprintf(out, " ");
   1822 	    fprintf(out, "0x%08lx, 0x%08lx",
   1823 	        (unsigned long) ncodes[i].code, (unsigned long) ncodes[i].idx);
   1824 	}
   1825 	fprintf(out, "\n};\n\n");
   1826 
   1827 	fprintf(out, PREF "short _ucnum_vals[] = {");
   1828 	for (i = 0; i<nums_used; i++) {
   1829 	    if (i) fprintf(out, ",");
   1830 	    if (!(i&3)) fprintf(out, "\n\t");
   1831 	    else fprintf(out, " ");
   1832 	    if (nums[i].numerator < 0) {
   1833 		fprintf(out, "%6d, 0x%04x",
   1834 		  nums[i].numerator, nums[i].denominator);
   1835 	    } else {
   1836 		fprintf(out, "0x%04x, 0x%04x",
   1837 		  nums[i].numerator, nums[i].denominator);
   1838 	    }
   1839 	}
   1840 	fprintf(out, "\n};\n\n");
   1841     }
   1842 #else
   1843     /*
   1844      * Open the num.dat file.
   1845      */
   1846     snprintf(path, sizeof path, "%s" LDAP_DIRSEP "num.dat", opath);
   1847     if ((out = fopen(path, "wb")) == 0)
   1848       return;
   1849 
   1850     /*
   1851      * The count part of the header will be the total number of codes that
   1852      * have numbers.
   1853      */
   1854     hdr[1] = (ac_uint2) (ncodes_used << 1);
   1855     bytes = (ncodes_used * sizeof(_codeidx_t)) + (nums_used * sizeof(_num_t));
   1856 
   1857     /*
   1858      * Write the header.
   1859      */
   1860     fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
   1861 
   1862     /*
   1863      * Write out the byte count to maintain header size.
   1864      */
   1865     fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
   1866 
   1867     /*
   1868      * Now, if number mappings exist, write them out.
   1869      */
   1870     if (ncodes_used > 0) {
   1871         fwrite((char *) ncodes, sizeof(_codeidx_t), ncodes_used, out);
   1872         fwrite((char *) nums, sizeof(_num_t), nums_used, out);
   1873     }
   1874 #endif
   1875 
   1876     fclose(out);
   1877 }
   1878 
/*
 * Print the command-line synopsis and a description of the -o and -x
 * options to stderr, then terminate the program with exit status 1.
 *
 * prog - program name shown in the synopsis line.
 */
static void
usage(char *prog)
{
    /* Synopsis line. */
    fprintf(stderr,
            "Usage: %s [-o output-directory|-x composition-exclusions]", prog);
    fputs(" datafile1 datafile2 ...\n\n", stderr);

    /* Option descriptions. */
    fputs("-o output-directory\n\t\tWrite the output files to a different",
          stderr);
    fputs(" directory (default: .).\n", stderr);
    fputs("-x composition-exclusion\n\t\tFile of composition codes", stderr);
    fputs(" that should be excluded.\n", stderr);

    exit(1);
}
   1893 
/*
 * Program entry point.
 *
 * Arguments are processed left to right:
 *   -o DIR   remember DIR as the output directory (default ".").
 *   -x FILE  load composition exclusions from FILE via read_compexdata().
 *   other    treated as a character data file and loaded via read_cdata().
 * An unknown option, or -o / -x without a following argument, prints the
 * usage message and exits.  A file that cannot be opened is reported on
 * stderr and skipped.  After all arguments are handled the tables are
 * written with write_cdata().
 *
 * Always returns 0 when it returns at all.
 */
int
main(int argc, char *argv[])
{
    FILE *in;
    char *prog, *opath;

    prog = lutil_progname( "ucgendat", argc, argv );

    opath = 0;
    in = stdin;

    argc--;
    argv++;

    while (argc > 0) {
        if (argv[0][0] == '-') {
            switch (argv[0][1]) {
              case 'o':
                /*
                 * The option needs a value; without this check a trailing
                 * "-o" would step past the end of the argument list.
                 */
                if (argc < 2)
                  usage(prog);
                argc--;
                argv++;
                opath = argv[0];
                break;
              case 'x':
                /*
                 * Same requirement as -o: a trailing "-x" would otherwise
                 * hand a null file name to fopen().
                 */
                if (argc < 2)
                  usage(prog);
                argc--;
                argv++;
                if ((in = fopen(argv[0], "r")) == 0)
                  fprintf(stderr,
                          "%s: unable to open composition exclusion file %s\n",
                          prog, argv[0]);
                else {
                    read_compexdata(in);
                    fclose(in);
                    in = 0;
                }
                break;
              default:
                usage(prog);
            }
        } else {
            if (in != stdin && in != NULL)
              fclose(in);
            if ((in = fopen(argv[0], "r")) == 0)
              fprintf(stderr, "%s: unable to open ctype file %s\n",
                      prog, argv[0]);
            else {
                read_cdata(in);
                fclose(in);
                in = 0;
	    }
        }
        argc--;
        argv++;
    }

    if (opath == 0)
      opath = ".";
    write_cdata(opath);

    return 0;
}
   1954