/*	$NetBSD: ucgendat.c,v 1.1.1.3.12.1 2014/08/19 23:52:00 tls Exp $	*/
2
3 /* $OpenLDAP$ */
4 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
5 *
6 * Copyright 1998-2014 The OpenLDAP Foundation.
7 * All rights reserved.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted only as authorized by the OpenLDAP
11 * Public License.
12 *
13 * A copy of this license is available in file LICENSE in the
14 * top-level directory of the distribution or, alternatively, at
15 * <http://www.OpenLDAP.org/license.html>.
16 */
17 /* Copyright 2001 Computing Research Labs, New Mexico State University
18 *
19 * Permission is hereby granted, free of charge, to any person obtaining a
20 * copy of this software and associated documentation files (the "Software"),
21 * to deal in the Software without restriction, including without limitation
22 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
23 * and/or sell copies of the Software, and to permit persons to whom the
24 * Software is furnished to do so, subject to the following conditions:
25 *
26 * The above copyright notice and this permission notice shall be included in
27 * all copies or substantial portions of the Software.
28 *
29 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
30 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
31 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
32 * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
33 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
34 * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
35 * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
36 */
37 /* Id: ucgendat.c,v 1.4 2001/01/02 18:46:20 mleisher Exp " */
38
39 #include "portable.h"
40 #include "ldap_config.h"
41
42 #include <stdio.h>
43 #include <ac/ctype.h>
44 #include <ac/stdlib.h>
45 #include <ac/string.h>
46 #include <ac/unistd.h>
47
48 #include <ac/bytes.h>
49
50 #include <lutil.h>
51
/*
 * HARDCODE_DATA defaults to 1 unless the build has already defined it.
 */
#if !defined(HARDCODE_DATA)
#define HARDCODE_DATA 1
#endif

/*
 * True when cc is one of 0-9, A-F or a-f.  The test compares against the
 * character constants directly.  The argument is evaluated several times,
 * so it must not have side effects.
 */
#undef ishdigit
#define ishdigit(cc) \
    (((cc) >= '0' && (cc) <= '9') || \
     ((cc) >= 'A' && (cc) <= 'F') || \
     ((cc) >= 'a' && (cc) <= 'f'))
60
61 /*
62 * A header written to the output file with the byte-order-mark and the number
63 * of property nodes.
64 */
65 static ac_uint2 hdr[2] = {0xfeff, 0};
66
67 #define NUMPROPS 50
68 #define NEEDPROPS (NUMPROPS + (4 - (NUMPROPS & 3)))
69
70 typedef struct {
71 char *name;
72 int len;
73 } _prop_t;
74
75 /*
76 * List of properties expected to be found in the Unicode Character Database
77 * including some implementation specific properties.
78 *
79 * The implementation specific properties are:
80 * Cm = Composed (can be decomposed)
81 * Nb = Non-breaking
82 * Sy = Symmetric (has left and right forms)
83 * Hd = Hex digit
84 * Qm = Quote marks
85 * Mr = Mirroring
86 * Ss = Space, other
87 * Cp = Defined character
88 */
89 static _prop_t props[NUMPROPS] = {
90 {"Mn", 2}, {"Mc", 2}, {"Me", 2}, {"Nd", 2}, {"Nl", 2}, {"No", 2},
91 {"Zs", 2}, {"Zl", 2}, {"Zp", 2}, {"Cc", 2}, {"Cf", 2}, {"Cs", 2},
92 {"Co", 2}, {"Cn", 2}, {"Lu", 2}, {"Ll", 2}, {"Lt", 2}, {"Lm", 2},
93 {"Lo", 2}, {"Pc", 2}, {"Pd", 2}, {"Ps", 2}, {"Pe", 2}, {"Po", 2},
94 {"Sm", 2}, {"Sc", 2}, {"Sk", 2}, {"So", 2}, {"L", 1}, {"R", 1},
95 {"EN", 2}, {"ES", 2}, {"ET", 2}, {"AN", 2}, {"CS", 2}, {"B", 1},
96 {"S", 1}, {"WS", 2}, {"ON", 2},
97 {"Cm", 2}, {"Nb", 2}, {"Sy", 2}, {"Hd", 2}, {"Qm", 2}, {"Mr", 2},
98 {"Ss", 2}, {"Cp", 2}, {"Pi", 2}, {"Pf", 2}, {"AL", 2}
99 };
100
101 typedef struct {
102 ac_uint4 *ranges;
103 ac_uint2 used;
104 ac_uint2 size;
105 } _ranges_t;
106
107 static _ranges_t proptbl[NUMPROPS];
108
109 /*
110 * Make sure this array is sized to be on a 4-byte boundary at compile time.
111 */
112 static ac_uint2 propcnt[NEEDPROPS];
113
114 /*
115 * Array used to collect a decomposition before adding it to the decomposition
116 * table.
117 */
118 static ac_uint4 dectmp[64];
119 static ac_uint4 dectmp_size;
120
121 typedef struct {
122 ac_uint4 code;
123 ac_uint2 size;
124 ac_uint2 used;
125 ac_uint4 *decomp;
126 } _decomp_t;
127
128 /*
129 * List of decomposition. Created and expanded in order as the characters are
130 * encountered. First list contains canonical mappings, second also includes
131 * compatibility mappings.
132 */
133 static _decomp_t *decomps;
134 static ac_uint4 decomps_used;
135 static ac_uint4 decomps_size;
136
137 static _decomp_t *kdecomps;
138 static ac_uint4 kdecomps_used;
139 static ac_uint4 kdecomps_size;
140
141 /*
142 * Composition exclusion table stuff.
143 */
144 #define COMPEX_SET(c) (compexs[(c) >> 5] |= (1 << ((c) & 31)))
145 #define COMPEX_TEST(c) (compexs[(c) >> 5] & (1 << ((c) & 31)))
146 static ac_uint4 compexs[8192];
147
148 /*
149 * Struct for holding a composition pair, and array of composition pairs
150 */
151 typedef struct {
152 ac_uint4 comp;
153 ac_uint4 count;
154 ac_uint4 code1;
155 ac_uint4 code2;
156 } _comp_t;
157
158 static _comp_t *comps;
159 static ac_uint4 comps_used;
160
161 /*
162 * Types and lists for handling lists of case mappings.
163 */
164 typedef struct {
165 ac_uint4 key;
166 ac_uint4 other1;
167 ac_uint4 other2;
168 } _case_t;
169
170 static _case_t *upper;
171 static _case_t *lower;
172 static _case_t *title;
173 static ac_uint4 upper_used;
174 static ac_uint4 upper_size;
175 static ac_uint4 lower_used;
176 static ac_uint4 lower_size;
177 static ac_uint4 title_used;
178 static ac_uint4 title_size;
179
180 /*
181 * Array used to collect case mappings before adding them to a list.
182 */
183 static ac_uint4 cases[3];
184
185 /*
186 * An array to hold ranges for combining classes.
187 */
188 static ac_uint4 *ccl;
189 static ac_uint4 ccl_used;
190 static ac_uint4 ccl_size;
191
192 /*
193 * Structures for handling numbers.
194 */
195 typedef struct {
196 ac_uint4 code;
197 ac_uint4 idx;
198 } _codeidx_t;
199
200 typedef struct {
201 short numerator;
202 short denominator;
203 } _num_t;
204
205 /*
206 * Arrays to hold the mapping of codes to numbers.
207 */
208 static _codeidx_t *ncodes;
209 static ac_uint4 ncodes_used;
210 static ac_uint4 ncodes_size;
211
212 static _num_t *nums;
213 static ac_uint4 nums_used;
214 static ac_uint4 nums_size;
215
216 /*
217 * Array for holding numbers.
218 */
219 static _num_t *nums;
220 static ac_uint4 nums_used;
221 static ac_uint4 nums_size;
222
223 static void
224 add_range(ac_uint4 start, ac_uint4 end, char *p1, char *p2)
225 {
226 int i, j, k, len;
227 _ranges_t *rlp;
228 char *name;
229
230 for (k = 0; k < 2; k++) {
231 if (k == 0) {
232 name = p1;
233 len = 2;
234 } else {
235 if (p2 == 0)
236 break;
237
238 name = p2;
239 len = 1;
240 }
241
242 for (i = 0; i < NUMPROPS; i++) {
243 if (props[i].len == len && memcmp(props[i].name, name, len) == 0)
244 break;
245 }
246
247 if (i == NUMPROPS)
248 continue;
249
250 rlp = &proptbl[i];
251
252 /*
253 * Resize the range list if necessary.
254 */
255 if (rlp->used == rlp->size) {
256 if (rlp->size == 0)
257 rlp->ranges = (ac_uint4 *)
258 malloc(sizeof(ac_uint4) << 3);
259 else
260 rlp->ranges = (ac_uint4 *)
261 realloc((char *) rlp->ranges,
262 sizeof(ac_uint4) * (rlp->size + 8));
263 rlp->size += 8;
264 }
265
266 /*
267 * If this is the first code for this property list, just add it
268 * and return.
269 */
270 if (rlp->used == 0) {
271 rlp->ranges[0] = start;
272 rlp->ranges[1] = end;
273 rlp->used += 2;
274 continue;
275 }
276
277 /*
278 * Optimize the case of adding the range to the end.
279 */
280 j = rlp->used - 1;
281 if (start > rlp->ranges[j]) {
282 j = rlp->used;
283 rlp->ranges[j++] = start;
284 rlp->ranges[j++] = end;
285 rlp->used = j;
286 continue;
287 }
288
289 /*
290 * Need to locate the insertion point.
291 */
292 for (i = 0;
293 i < rlp->used && start > rlp->ranges[i + 1] + 1; i += 2) ;
294
295 /*
296 * If the start value lies in the current range, then simply set the
297 * new end point of the range to the end value passed as a parameter.
298 */
299 if (rlp->ranges[i] <= start && start <= rlp->ranges[i + 1] + 1) {
300 rlp->ranges[i + 1] = end;
301 return;
302 }
303
304 /*
305 * Shift following values up by two.
306 */
307 for (j = rlp->used; j > i; j -= 2) {
308 rlp->ranges[j] = rlp->ranges[j - 2];
309 rlp->ranges[j + 1] = rlp->ranges[j - 1];
310 }
311
312 /*
313 * Add the new range at the insertion point.
314 */
315 rlp->ranges[i] = start;
316 rlp->ranges[i + 1] = end;
317 rlp->used += 2;
318 }
319 }
320
321 static void
322 ordered_range_insert(ac_uint4 c, char *name, int len)
323 {
324 int i, j;
325 ac_uint4 s, e;
326 _ranges_t *rlp;
327
328 if (len == 0)
329 return;
330
331 /*
332 * Deal with directionality codes introduced in Unicode 3.0.
333 */
334 if ((len == 2 && memcmp(name, "BN", 2) == 0) ||
335 (len == 3 &&
336 (memcmp(name, "NSM", 3) == 0 || memcmp(name, "PDF", 3) == 0 ||
337 memcmp(name, "LRE", 3) == 0 || memcmp(name, "LRO", 3) == 0 ||
338 memcmp(name, "RLE", 3) == 0 || memcmp(name, "RLO", 3) == 0))) {
339 /*
340 * Mark all of these as Other Neutral to preserve compatibility with
341 * older versions.
342 */
343 len = 2;
344 name = "ON";
345 }
346
347 for (i = 0; i < NUMPROPS; i++) {
348 if (props[i].len == len && memcmp(props[i].name, name, len) == 0)
349 break;
350 }
351
352 if (i == NUMPROPS)
353 return;
354
355 /*
356 * Have a match, so insert the code in order.
357 */
358 rlp = &proptbl[i];
359
360 /*
361 * Resize the range list if necessary.
362 */
363 if (rlp->used == rlp->size) {
364 if (rlp->size == 0)
365 rlp->ranges = (ac_uint4 *)
366 malloc(sizeof(ac_uint4) << 3);
367 else
368 rlp->ranges = (ac_uint4 *)
369 realloc((char *) rlp->ranges,
370 sizeof(ac_uint4) * (rlp->size + 8));
371 rlp->size += 8;
372 }
373
374 /*
375 * If this is the first code for this property list, just add it
376 * and return.
377 */
378 if (rlp->used == 0) {
379 rlp->ranges[0] = rlp->ranges[1] = c;
380 rlp->used += 2;
381 return;
382 }
383
384 /*
385 * Optimize the cases of extending the last range and adding new ranges to
386 * the end.
387 */
388 j = rlp->used - 1;
389 e = rlp->ranges[j];
390 s = rlp->ranges[j - 1];
391
392 if (c == e + 1) {
393 /*
394 * Extend the last range.
395 */
396 rlp->ranges[j] = c;
397 return;
398 }
399
400 if (c > e + 1) {
401 /*
402 * Start another range on the end.
403 */
404 j = rlp->used;
405 rlp->ranges[j] = rlp->ranges[j + 1] = c;
406 rlp->used += 2;
407 return;
408 }
409
410 if (c >= s)
411 /*
412 * The code is a duplicate of a code in the last range, so just return.
413 */
414 return;
415
416 /*
417 * The code should be inserted somewhere before the last range in the
418 * list. Locate the insertion point.
419 */
420 for (i = 0;
421 i < rlp->used && c > rlp->ranges[i + 1] + 1; i += 2) ;
422
423 s = rlp->ranges[i];
424 e = rlp->ranges[i + 1];
425
426 if (c == e + 1)
427 /*
428 * Simply extend the current range.
429 */
430 rlp->ranges[i + 1] = c;
431 else if (c < s) {
432 /*
433 * Add a new entry before the current location. Shift all entries
434 * before the current one up by one to make room.
435 */
436 for (j = rlp->used; j > i; j -= 2) {
437 rlp->ranges[j] = rlp->ranges[j - 2];
438 rlp->ranges[j + 1] = rlp->ranges[j - 1];
439 }
440 rlp->ranges[i] = rlp->ranges[i + 1] = c;
441
442 rlp->used += 2;
443 }
444 }
445
446 static void
447 add_decomp(ac_uint4 code, short compat)
448 {
449 ac_uint4 i, j, size;
450 _decomp_t **pdecomps;
451 ac_uint4 *pdecomps_used;
452 ac_uint4 *pdecomps_size;
453
454 if (compat) {
455 pdecomps = &kdecomps;
456 pdecomps_used = &kdecomps_used;
457 pdecomps_size = &kdecomps_size;
458 } else {
459 pdecomps = &decomps;
460 pdecomps_used = &decomps_used;
461 pdecomps_size = &decomps_size;
462 }
463
464 /*
465 * Add the code to the composite property.
466 */
467 if (!compat) {
468 ordered_range_insert(code, "Cm", 2);
469 }
470
471 /*
472 * Locate the insertion point for the code.
473 */
474 for (i = 0; i < *pdecomps_used && code > (*pdecomps)[i].code; i++) ;
475
476 /*
477 * Allocate space for a new decomposition.
478 */
479 if (*pdecomps_used == *pdecomps_size) {
480 if (*pdecomps_size == 0)
481 *pdecomps = (_decomp_t *) malloc(sizeof(_decomp_t) << 3);
482 else
483 *pdecomps = (_decomp_t *)
484 realloc((char *) *pdecomps,
485 sizeof(_decomp_t) * (*pdecomps_size + 8));
486 (void) memset((char *) (*pdecomps + *pdecomps_size), '\0',
487 sizeof(_decomp_t) << 3);
488 *pdecomps_size += 8;
489 }
490
491 if (i < *pdecomps_used && code != (*pdecomps)[i].code) {
492 /*
493 * Shift the decomps up by one if the codes don't match.
494 */
495 for (j = *pdecomps_used; j > i; j--)
496 (void) AC_MEMCPY((char *) &(*pdecomps)[j], (char *) &(*pdecomps)[j - 1],
497 sizeof(_decomp_t));
498 }
499
500 /*
501 * Insert or replace a decomposition.
502 */
503 size = dectmp_size + (4 - (dectmp_size & 3));
504 if ((*pdecomps)[i].size < size) {
505 if ((*pdecomps)[i].size == 0)
506 (*pdecomps)[i].decomp = (ac_uint4 *)
507 malloc(sizeof(ac_uint4) * size);
508 else
509 (*pdecomps)[i].decomp = (ac_uint4 *)
510 realloc((char *) (*pdecomps)[i].decomp,
511 sizeof(ac_uint4) * size);
512 (*pdecomps)[i].size = size;
513 }
514
515 if ((*pdecomps)[i].code != code)
516 (*pdecomps_used)++;
517
518 (*pdecomps)[i].code = code;
519 (*pdecomps)[i].used = dectmp_size;
520 (void) AC_MEMCPY((char *) (*pdecomps)[i].decomp, (char *) dectmp,
521 sizeof(ac_uint4) * dectmp_size);
522
523 /*
524 * NOTICE: This needs changing later so it is more general than simply
525 * pairs. This calculation is done here to simplify allocation elsewhere.
526 */
527 if (!compat && dectmp_size == 2)
528 comps_used++;
529 }
530
531 static void
532 add_title(ac_uint4 code)
533 {
534 ac_uint4 i, j;
535
536 /*
537 * Always map the code to itself.
538 */
539 cases[2] = code;
540
541 if (title_used == title_size) {
542 if (title_size == 0)
543 title = (_case_t *) malloc(sizeof(_case_t) << 3);
544 else
545 title = (_case_t *) realloc((char *) title,
546 sizeof(_case_t) * (title_size + 8));
547 title_size += 8;
548 }
549
550 /*
551 * Locate the insertion point.
552 */
553 for (i = 0; i < title_used && code > title[i].key; i++) ;
554
555 if (i < title_used) {
556 /*
557 * Shift the array up by one.
558 */
559 for (j = title_used; j > i; j--)
560 (void) AC_MEMCPY((char *) &title[j], (char *) &title[j - 1],
561 sizeof(_case_t));
562 }
563
564 title[i].key = cases[2]; /* Title */
565 title[i].other1 = cases[0]; /* Upper */
566 title[i].other2 = cases[1]; /* Lower */
567
568 title_used++;
569 }
570
571 static void
572 add_upper(ac_uint4 code)
573 {
574 ac_uint4 i, j;
575
576 /*
577 * Always map the code to itself.
578 */
579 cases[0] = code;
580
581 /*
582 * If the title case character is not present, then make it the same as
583 * the upper case.
584 */
585 if (cases[2] == 0)
586 cases[2] = code;
587
588 if (upper_used == upper_size) {
589 if (upper_size == 0)
590 upper = (_case_t *) malloc(sizeof(_case_t) << 3);
591 else
592 upper = (_case_t *) realloc((char *) upper,
593 sizeof(_case_t) * (upper_size + 8));
594 upper_size += 8;
595 }
596
597 /*
598 * Locate the insertion point.
599 */
600 for (i = 0; i < upper_used && code > upper[i].key; i++) ;
601
602 if (i < upper_used) {
603 /*
604 * Shift the array up by one.
605 */
606 for (j = upper_used; j > i; j--)
607 (void) AC_MEMCPY((char *) &upper[j], (char *) &upper[j - 1],
608 sizeof(_case_t));
609 }
610
611 upper[i].key = cases[0]; /* Upper */
612 upper[i].other1 = cases[1]; /* Lower */
613 upper[i].other2 = cases[2]; /* Title */
614
615 upper_used++;
616 }
617
618 static void
619 add_lower(ac_uint4 code)
620 {
621 ac_uint4 i, j;
622
623 /*
624 * Always map the code to itself.
625 */
626 cases[1] = code;
627
628 /*
629 * If the title case character is empty, then make it the same as the
630 * upper case.
631 */
632 if (cases[2] == 0)
633 cases[2] = cases[0];
634
635 if (lower_used == lower_size) {
636 if (lower_size == 0)
637 lower = (_case_t *) malloc(sizeof(_case_t) << 3);
638 else
639 lower = (_case_t *) realloc((char *) lower,
640 sizeof(_case_t) * (lower_size + 8));
641 lower_size += 8;
642 }
643
644 /*
645 * Locate the insertion point.
646 */
647 for (i = 0; i < lower_used && code > lower[i].key; i++) ;
648
649 if (i < lower_used) {
650 /*
651 * Shift the array up by one.
652 */
653 for (j = lower_used; j > i; j--)
654 (void) AC_MEMCPY((char *) &lower[j], (char *) &lower[j - 1],
655 sizeof(_case_t));
656 }
657
658 lower[i].key = cases[1]; /* Lower */
659 lower[i].other1 = cases[0]; /* Upper */
660 lower[i].other2 = cases[2]; /* Title */
661
662 lower_used++;
663 }
664
665 static void
666 ordered_ccl_insert(ac_uint4 c, ac_uint4 ccl_code)
667 {
668 ac_uint4 i, j;
669
670 if (ccl_used == ccl_size) {
671 if (ccl_size == 0)
672 ccl = (ac_uint4 *) malloc(sizeof(ac_uint4) * 24);
673 else
674 ccl = (ac_uint4 *)
675 realloc((char *) ccl, sizeof(ac_uint4) * (ccl_size + 24));
676 ccl_size += 24;
677 }
678
679 /*
680 * Optimize adding the first item.
681 */
682 if (ccl_used == 0) {
683 ccl[0] = ccl[1] = c;
684 ccl[2] = ccl_code;
685 ccl_used += 3;
686 return;
687 }
688
689 /*
690 * Handle the special case of extending the range on the end. This
691 * requires that the combining class codes are the same.
692 */
693 if (ccl_code == ccl[ccl_used - 1] && c == ccl[ccl_used - 2] + 1) {
694 ccl[ccl_used - 2] = c;
695 return;
696 }
697
698 /*
699 * Handle the special case of adding another range on the end.
700 */
701 if (c > ccl[ccl_used - 2] + 1 ||
702 (c == ccl[ccl_used - 2] + 1 && ccl_code != ccl[ccl_used - 1])) {
703 ccl[ccl_used++] = c;
704 ccl[ccl_used++] = c;
705 ccl[ccl_used++] = ccl_code;
706 return;
707 }
708
709 /*
710 * Locate either the insertion point or range for the code.
711 */
712 for (i = 0; i < ccl_used && c > ccl[i + 1] + 1; i += 3) ;
713
714 if (ccl_code == ccl[i + 2] && c == ccl[i + 1] + 1) {
715 /*
716 * Extend an existing range.
717 */
718 ccl[i + 1] = c;
719 return;
720 } else if (c < ccl[i]) {
721 /*
722 * Start a new range before the current location.
723 */
724 for (j = ccl_used; j > i; j -= 3) {
725 ccl[j] = ccl[j - 3];
726 ccl[j - 1] = ccl[j - 4];
727 ccl[j - 2] = ccl[j - 5];
728 }
729 ccl[i] = ccl[i + 1] = c;
730 ccl[i + 2] = ccl_code;
731 }
732 }
733
734 /*
735 * Adds a number if it does not already exist and returns an index value
736 * multiplied by 2.
737 */
738 static ac_uint4
739 make_number(short num, short denom)
740 {
741 ac_uint4 n;
742
743 /*
744 * Determine if the number already exists.
745 */
746 for (n = 0; n < nums_used; n++) {
747 if (nums[n].numerator == num && nums[n].denominator == denom)
748 return n << 1;
749 }
750
751 if (nums_used == nums_size) {
752 if (nums_size == 0)
753 nums = (_num_t *) malloc(sizeof(_num_t) << 3);
754 else
755 nums = (_num_t *) realloc((char *) nums,
756 sizeof(_num_t) * (nums_size + 8));
757 nums_size += 8;
758 }
759
760 n = nums_used++;
761 nums[n].numerator = num;
762 nums[n].denominator = denom;
763
764 return n << 1;
765 }
766
767 static void
768 add_number(ac_uint4 code, short num, short denom)
769 {
770 ac_uint4 i, j;
771
772 /*
773 * Insert the code in order.
774 */
775 for (i = 0; i < ncodes_used && code > ncodes[i].code; i++) ;
776
777 /*
778 * Handle the case of the codes matching and simply replace the number
779 * that was there before.
780 */
781 if (i < ncodes_used && code == ncodes[i].code) {
782 ncodes[i].idx = make_number(num, denom);
783 return;
784 }
785
786 /*
787 * Resize the array if necessary.
788 */
789 if (ncodes_used == ncodes_size) {
790 if (ncodes_size == 0)
791 ncodes = (_codeidx_t *) malloc(sizeof(_codeidx_t) << 3);
792 else
793 ncodes = (_codeidx_t *)
794 realloc((char *) ncodes, sizeof(_codeidx_t) * (ncodes_size + 8));
795
796 ncodes_size += 8;
797 }
798
799 /*
800 * Shift things around to insert the code if necessary.
801 */
802 if (i < ncodes_used) {
803 for (j = ncodes_used; j > i; j--) {
804 ncodes[j].code = ncodes[j - 1].code;
805 ncodes[j].idx = ncodes[j - 1].idx;
806 }
807 }
808 ncodes[i].code = code;
809 ncodes[i].idx = make_number(num, denom);
810
811 ncodes_used++;
812 }
813
814 /*
815 * This routine assumes that the line is a valid Unicode Character Database
816 * entry.
817 */
818 static void
819 read_cdata(FILE *in)
820 {
821 ac_uint4 i, lineno, skip, code, ccl_code;
822 short wnum, neg, number[2], compat;
823 char line[512], *s, *e;
824
825 lineno = skip = 0;
826 while (fgets(line, sizeof(line), in)) {
827 if( (s=strchr(line, '\n')) ) *s = '\0';
828 lineno++;
829
830 /*
831 * Skip blank lines and lines that start with a '#'.
832 */
833 if (line[0] == 0 || line[0] == '#')
834 continue;
835
836 /*
837 * If lines need to be skipped, do it here.
838 */
839 if (skip) {
840 skip--;
841 continue;
842 }
843
844 /*
845 * Collect the code. The code can be up to 6 hex digits in length to
846 * allow surrogates to be specified.
847 */
848 for (s = line, i = code = 0; *s != ';' && i < 6; i++, s++) {
849 code <<= 4;
850 if (*s >= '0' && *s <= '9')
851 code += *s - '0';
852 else if (*s >= 'A' && *s <= 'F')
853 code += (*s - 'A') + 10;
854 else if (*s >= 'a' && *s <= 'f')
855 code += (*s - 'a') + 10;
856 }
857
858 /*
859 * Handle the following special cases:
860 * 1. 4E00-9FA5 CJK Ideographs.
861 * 2. AC00-D7A3 Hangul Syllables.
862 * 3. D800-DFFF Surrogates.
863 * 4. E000-F8FF Private Use Area.
864 * 5. F900-FA2D Han compatibility.
865 * ...Plus additional ranges in newer Unicode versions...
866 */
867 switch (code) {
868 case 0x3400:
869 /* CJK Ideograph Extension A */
870 add_range(0x3400, 0x4db5, "Lo", "L");
871
872 add_range(0x3400, 0x4db5, "Cp", 0);
873
874 skip = 1;
875 break;
876 case 0x4e00:
877 /*
878 * The Han ideographs.
879 */
880 add_range(0x4e00, 0x9fff, "Lo", "L");
881
882 /*
883 * Add the characters to the defined category.
884 */
885 add_range(0x4e00, 0x9fa5, "Cp", 0);
886
887 skip = 1;
888 break;
889 case 0xac00:
890 /*
891 * The Hangul syllables.
892 */
893 add_range(0xac00, 0xd7a3, "Lo", "L");
894
895 /*
896 * Add the characters to the defined category.
897 */
898 add_range(0xac00, 0xd7a3, "Cp", 0);
899
900 skip = 1;
901 break;
902 case 0xd800:
903 /*
904 * Make a range of all surrogates and assume some default
905 * properties.
906 */
907 add_range(0x010000, 0x10ffff, "Cs", "L");
908 skip = 5;
909 break;
910 case 0xe000:
911 /*
912 * The Private Use area. Add with a default set of properties.
913 */
914 add_range(0xe000, 0xf8ff, "Co", "L");
915 skip = 1;
916 break;
917 case 0xf900:
918 /*
919 * The CJK compatibility area.
920 */
921 add_range(0xf900, 0xfaff, "Lo", "L");
922
923 /*
924 * Add the characters to the defined category.
925 */
926 add_range(0xf900, 0xfaff, "Cp", 0);
927
928 skip = 1;
929 break;
930 case 0x20000:
931 /* CJK Ideograph Extension B */
932 add_range(0x20000, 0x2a6d6, "Lo", "L");
933
934 add_range(0x20000, 0x2a6d6, "Cp", 0);
935
936 skip = 1;
937 break;
938 case 0xf0000:
939 /* Plane 15 private use */
940 add_range(0xf0000, 0xffffd, "Co", "L");
941 skip = 1;
942 break;
943
944 case 0x100000:
945 /* Plane 16 private use */
946 add_range(0x100000, 0x10fffd, "Co", "L");
947 skip = 1;
948 break;
949 }
950
951 if (skip)
952 continue;
953
954 /*
955 * Add the code to the defined category.
956 */
957 ordered_range_insert(code, "Cp", 2);
958
959 /*
960 * Locate the first character property field.
961 */
962 for (i = 0; *s != 0 && i < 2; s++) {
963 if (*s == ';')
964 i++;
965 }
966 for (e = s; *e && *e != ';'; e++) ;
967
968 ordered_range_insert(code, s, e - s);
969
970 /*
971 * Locate the combining class code.
972 */
973 for (s = e; *s != 0 && i < 3; s++) {
974 if (*s == ';')
975 i++;
976 }
977
978 /*
979 * Convert the combining class code from decimal.
980 */
981 for (ccl_code = 0, e = s; *e && *e != ';'; e++)
982 ccl_code = (ccl_code * 10) + (*e - '0');
983
984 /*
985 * Add the code if it not 0.
986 */
987 if (ccl_code != 0)
988 ordered_ccl_insert(code, ccl_code);
989
990 /*
991 * Locate the second character property field.
992 */
993 for (s = e; *s != 0 && i < 4; s++) {
994 if (*s == ';')
995 i++;
996 }
997 for (e = s; *e && *e != ';'; e++) ;
998
999 ordered_range_insert(code, s, e - s);
1000
1001 /*
1002 * Check for a decomposition.
1003 */
1004 s = ++e;
1005 if (*s != ';') {
1006 compat = *s == '<';
1007 if (compat) {
1008 /*
1009 * Skip compatibility formatting tag.
1010 */
1011 while (*s++ != '>');
1012 }
1013 /*
1014 * Collect the codes of the decomposition.
1015 */
1016 for (dectmp_size = 0; *s != ';'; ) {
1017 /*
1018 * Skip all leading non-hex digits.
1019 */
1020 while (!ishdigit(*s))
1021 s++;
1022
1023 for (dectmp[dectmp_size] = 0; ishdigit(*s); s++) {
1024 dectmp[dectmp_size] <<= 4;
1025 if (*s >= '0' && *s <= '9')
1026 dectmp[dectmp_size] += *s - '0';
1027 else if (*s >= 'A' && *s <= 'F')
1028 dectmp[dectmp_size] += (*s - 'A') + 10;
1029 else if (*s >= 'a' && *s <= 'f')
1030 dectmp[dectmp_size] += (*s - 'a') + 10;
1031 }
1032 dectmp_size++;
1033 }
1034
1035 /*
1036 * If there are any codes in the temporary decomposition array,
1037 * then add the character with its decomposition.
1038 */
1039 if (dectmp_size > 0) {
1040 if (!compat) {
1041 add_decomp(code, 0);
1042 }
1043 add_decomp(code, 1);
1044 }
1045 }
1046
1047 /*
1048 * Skip to the number field.
1049 */
1050 for (i = 0; i < 3 && *s; s++) {
1051 if (*s == ';')
1052 i++;
1053 }
1054
1055 /*
1056 * Scan the number in.
1057 */
1058 number[0] = number[1] = 0;
1059 for (e = s, neg = wnum = 0; *e && *e != ';'; e++) {
1060 if (*e == '-') {
1061 neg = 1;
1062 continue;
1063 }
1064
1065 if (*e == '/') {
1066 /*
1067 * Move the the denominator of the fraction.
1068 */
1069 if (neg)
1070 number[wnum] *= -1;
1071 neg = 0;
1072 e++;
1073 wnum++;
1074 }
1075 number[wnum] = (number[wnum] * 10) + (*e - '0');
1076 }
1077
1078 if (e > s) {
1079 /*
1080 * Adjust the denominator in case of integers and add the number.
1081 */
1082 if (wnum == 0)
1083 number[1] = 1;
1084
1085 add_number(code, number[0], number[1]);
1086 }
1087
1088 /*
1089 * Skip to the start of the possible case mappings.
1090 */
1091 for (s = e, i = 0; i < 4 && *s; s++) {
1092 if (*s == ';')
1093 i++;
1094 }
1095
1096 /*
1097 * Collect the case mappings.
1098 */
1099 cases[0] = cases[1] = cases[2] = 0;
1100 for (i = 0; i < 3; i++) {
1101 while (ishdigit(*s)) {
1102 cases[i] <<= 4;
1103 if (*s >= '0' && *s <= '9')
1104 cases[i] += *s - '0';
1105 else if (*s >= 'A' && *s <= 'F')
1106 cases[i] += (*s - 'A') + 10;
1107 else if (*s >= 'a' && *s <= 'f')
1108 cases[i] += (*s - 'a') + 10;
1109 s++;
1110 }
1111 if (*s == ';')
1112 s++;
1113 }
1114 if (cases[0] && cases[1])
1115 /*
1116 * Add the upper and lower mappings for a title case character.
1117 */
1118 add_title(code);
1119 else if (cases[1])
1120 /*
1121 * Add the lower and title case mappings for the upper case
1122 * character.
1123 */
1124 add_upper(code);
1125 else if (cases[0])
1126 /*
1127 * Add the upper and title case mappings for the lower case
1128 * character.
1129 */
1130 add_lower(code);
1131 }
1132 }
1133
1134 static _decomp_t *
1135 find_decomp(ac_uint4 code, short compat)
1136 {
1137 long l, r, m;
1138 _decomp_t *decs;
1139
1140 l = 0;
1141 r = (compat ? kdecomps_used : decomps_used) - 1;
1142 decs = compat ? kdecomps : decomps;
1143 while (l <= r) {
1144 m = (l + r) >> 1;
1145 if (code > decs[m].code)
1146 l = m + 1;
1147 else if (code < decs[m].code)
1148 r = m - 1;
1149 else
1150 return &decs[m];
1151 }
1152 return 0;
1153 }
1154
1155 static void
1156 decomp_it(_decomp_t *d, short compat)
1157 {
1158 ac_uint4 i;
1159 _decomp_t *dp;
1160
1161 for (i = 0; i < d->used; i++) {
1162 if ((dp = find_decomp(d->decomp[i], compat)) != 0)
1163 decomp_it(dp, compat);
1164 else
1165 dectmp[dectmp_size++] = d->decomp[i];
1166 }
1167 }
1168
1169 /*
1170 * Expand all decompositions by recursively decomposing each character
1171 * in the decomposition.
1172 */
1173 static void
1174 expand_decomp(void)
1175 {
1176 ac_uint4 i;
1177
1178 for (i = 0; i < decomps_used; i++) {
1179 dectmp_size = 0;
1180 decomp_it(&decomps[i], 0);
1181 if (dectmp_size > 0)
1182 add_decomp(decomps[i].code, 0);
1183 }
1184
1185 for (i = 0; i < kdecomps_used; i++) {
1186 dectmp_size = 0;
1187 decomp_it(&kdecomps[i], 1);
1188 if (dectmp_size > 0)
1189 add_decomp(kdecomps[i].code, 1);
1190 }
1191 }
1192
1193 static int
1194 cmpcomps(const void *v_comp1, const void *v_comp2)
1195 {
1196 const _comp_t *comp1 = v_comp1, *comp2 = v_comp2;
1197 long diff = comp1->code1 - comp2->code1;
1198
1199 if (!diff)
1200 diff = comp1->code2 - comp2->code2;
1201 return (int) diff;
1202 }
1203
1204 /*
1205 * Load composition exclusion data
1206 */
1207 static void
1208 read_compexdata(FILE *in)
1209 {
1210 ac_uint2 i;
1211 ac_uint4 code;
1212 char line[512], *s;
1213
1214 (void) memset((char *) compexs, 0, sizeof(compexs));
1215
1216 while (fgets(line, sizeof(line), in)) {
1217 if( (s=strchr(line, '\n')) ) *s = '\0';
1218 /*
1219 * Skip blank lines and lines that start with a '#'.
1220 */
1221 if (line[0] == 0 || line[0] == '#')
1222 continue;
1223
1224 /*
1225 * Collect the code. Assume max 6 digits
1226 */
1227
1228 for (s = line, i = code = 0; *s != '#' && i < 6; i++, s++) {
1229 if (isspace((unsigned char)*s)) break;
1230 code <<= 4;
1231 if (*s >= '0' && *s <= '9')
1232 code += *s - '0';
1233 else if (*s >= 'A' && *s <= 'F')
1234 code += (*s - 'A') + 10;
1235 else if (*s >= 'a' && *s <= 'f')
1236 code += (*s - 'a') + 10;
1237 }
1238 COMPEX_SET(code);
1239 }
1240 }
1241
1242 /*
1243 * Creates array of compositions from decomposition array
1244 */
1245 static void
1246 create_comps(void)
1247 {
1248 ac_uint4 i, cu;
1249
1250 comps = (_comp_t *) malloc(comps_used * sizeof(_comp_t));
1251
1252 for (i = cu = 0; i < decomps_used; i++) {
1253 if (decomps[i].used != 2 || COMPEX_TEST(decomps[i].code))
1254 continue;
1255 comps[cu].comp = decomps[i].code;
1256 comps[cu].count = 2;
1257 comps[cu].code1 = decomps[i].decomp[0];
1258 comps[cu].code2 = decomps[i].decomp[1];
1259 cu++;
1260 }
1261 comps_used = cu;
1262 qsort(comps, comps_used, sizeof(_comp_t), cmpcomps);
1263 }
1264
#if HARDCODE_DATA
/*
 * Write `num' case mappings from `tab' to `out' as a comma-separated list
 * of hex triples (key, other1, other2), each triple on a line of its own.
 * When `first' is zero a comma is written before the first triple as well,
 * so the output can continue a list that is already under way.
 */
static void
write_case(FILE *out, _case_t *tab, int num, int first)
{
    int left;

    for (left = num; left > 0; left--, tab++) {
        if (!first)
            fprintf(out, ",");
        first = 0;

        fprintf(out, "\n\t0x%08lx, 0x%08lx, 0x%08lx",
                (unsigned long) tab->key,
                (unsigned long) tab->other1,
                (unsigned long) tab->other2);
    }
}

/* Storage class and qualifier prefix for the generated declarations. */
#define PREF "static const "

#endif
1283
1284 static void
1285 write_cdata(char *opath)
1286 {
1287 FILE *out;
1288 ac_uint4 bytes;
1289 ac_uint4 i, idx, nprops;
1290 #if !(HARDCODE_DATA)
1291 ac_uint2 casecnt[2];
1292 #endif
1293 char path[BUFSIZ];
1294 #if HARDCODE_DATA
1295 int j, k;
1296
1297 /*****************************************************************
1298 *
1299 * Generate the ctype data.
1300 *
1301 *****************************************************************/
1302
1303 /*
1304 * Open the output file.
1305 */
1306 snprintf(path, sizeof path, "%s" LDAP_DIRSEP "uctable.h", opath);
1307 if ((out = fopen(path, "w")) == 0)
1308 return;
1309 #else
1310 /*
1311 * Open the ctype.dat file.
1312 */
1313 snprintf(path, sizeof path, "%s" LDAP_DIRSEP "ctype.dat", opath);
1314 if ((out = fopen(path, "wb")) == 0)
1315 return;
1316 #endif
1317
1318 /*
1319 * Collect the offsets for the properties. The offsets array is
1320 * on a 4-byte boundary to keep things efficient for architectures
1321 * that need such a thing.
1322 */
1323 for (i = idx = 0; i < NUMPROPS; i++) {
1324 propcnt[i] = (proptbl[i].used != 0) ? idx : 0xffff;
1325 idx += proptbl[i].used;
1326 }
1327
1328 /*
1329 * Add the sentinel index which is used by the binary search as the upper
1330 * bound for a search.
1331 */
1332 propcnt[i] = idx;
1333
1334 /*
1335 * Record the actual number of property lists. This may be different than
1336 * the number of offsets actually written because of aligning on a 4-byte
1337 * boundary.
1338 */
1339 hdr[1] = NUMPROPS;
1340
1341 /*
1342 * Calculate the byte count needed and pad the property counts array to a
1343 * 4-byte boundary.
1344 */
1345 if ((bytes = sizeof(ac_uint2) * (NUMPROPS + 1)) & 3)
1346 bytes += 4 - (bytes & 3);
1347 nprops = bytes / sizeof(ac_uint2);
1348 bytes += sizeof(ac_uint4) * idx;
1349
1350 #if HARDCODE_DATA
1351 fprintf(out, PREF "ac_uint4 _ucprop_size = %d;\n\n", NUMPROPS);
1352
1353 fprintf(out, PREF "ac_uint2 _ucprop_offsets[] = {");
1354
1355 for (i = 0; i<nprops; i++) {
1356 if (i) fprintf(out, ",");
1357 if (!(i&7)) fprintf(out, "\n\t");
1358 else fprintf(out, " ");
1359 fprintf(out, "0x%04x", propcnt[i]);
1360 }
1361 fprintf(out, "\n};\n\n");
1362
1363 fprintf(out, PREF "ac_uint4 _ucprop_ranges[] = {");
1364
1365 k = 0;
1366 for (i = 0; i < NUMPROPS; i++) {
1367 if (proptbl[i].used > 0) {
1368 for (j=0; j<proptbl[i].used; j++) {
1369 if (k) fprintf(out, ",");
1370 if (!(k&3)) fprintf(out,"\n\t");
1371 else fprintf(out, " ");
1372 k++;
1373 fprintf(out, "0x%08lx", (unsigned long) proptbl[i].ranges[j]);
1374 }
1375 }
1376 }
1377 fprintf(out, "\n};\n\n");
1378 #else
1379 /*
1380 * Write the header.
1381 */
1382 fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
1383
1384 /*
1385 * Write the byte count.
1386 */
1387 fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
1388
1389 /*
1390 * Write the property list counts.
1391 */
1392 fwrite((char *) propcnt, sizeof(ac_uint2), nprops, out);
1393
1394 /*
1395 * Write the property lists.
1396 */
1397 for (i = 0; i < NUMPROPS; i++) {
1398 if (proptbl[i].used > 0)
1399 fwrite((char *) proptbl[i].ranges, sizeof(ac_uint4),
1400 proptbl[i].used, out);
1401 }
1402
1403 fclose(out);
1404 #endif
1405
1406 /*****************************************************************
1407 *
1408 * Generate the case mapping data.
1409 *
1410 *****************************************************************/
1411
1412 #if HARDCODE_DATA
1413 fprintf(out, PREF "ac_uint4 _uccase_size = %ld;\n\n",
1414 (long) (upper_used + lower_used + title_used));
1415
1416 fprintf(out, PREF "ac_uint2 _uccase_len[2] = {%ld, %ld};\n\n",
1417 (long) upper_used, (long) lower_used);
1418 fprintf(out, PREF "ac_uint4 _uccase_map[] = {");
1419
1420 if (upper_used > 0)
1421 /*
1422 * Write the upper case table.
1423 */
1424 write_case(out, upper, upper_used, 1);
1425
1426 if (lower_used > 0)
1427 /*
1428 * Write the lower case table.
1429 */
1430 write_case(out, lower, lower_used, !upper_used);
1431
1432 if (title_used > 0)
1433 /*
1434 * Write the title case table.
1435 */
1436 write_case(out, title, title_used, !(upper_used||lower_used));
1437
1438 if (!(upper_used || lower_used || title_used))
1439 fprintf(out, "\t0");
1440
1441 fprintf(out, "\n};\n\n");
1442 #else
1443 /*
1444 * Open the case.dat file.
1445 */
1446 snprintf(path, sizeof path, "%s" LDAP_DIRSEP "case.dat", opath);
1447 if ((out = fopen(path, "wb")) == 0)
1448 return;
1449
1450 /*
1451 * Write the case mapping tables.
1452 */
1453 hdr[1] = upper_used + lower_used + title_used;
1454 casecnt[0] = upper_used;
1455 casecnt[1] = lower_used;
1456
1457 /*
1458 * Write the header.
1459 */
1460 fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
1461
1462 /*
1463 * Write the upper and lower case table sizes.
1464 */
1465 fwrite((char *) casecnt, sizeof(ac_uint2), 2, out);
1466
1467 if (upper_used > 0)
1468 /*
1469 * Write the upper case table.
1470 */
1471 fwrite((char *) upper, sizeof(_case_t), upper_used, out);
1472
1473 if (lower_used > 0)
1474 /*
1475 * Write the lower case table.
1476 */
1477 fwrite((char *) lower, sizeof(_case_t), lower_used, out);
1478
1479 if (title_used > 0)
1480 /*
1481 * Write the title case table.
1482 */
1483 fwrite((char *) title, sizeof(_case_t), title_used, out);
1484
1485 fclose(out);
1486 #endif
1487
1488 /*****************************************************************
1489 *
1490 * Generate the composition data.
1491 *
1492 *****************************************************************/
1493
1494 /*
1495 * Create compositions from decomposition data
1496 */
1497 create_comps();
1498
1499 #if HARDCODE_DATA
1500 fprintf(out, PREF "ac_uint4 _uccomp_size = %ld;\n\n",
1501 comps_used * 4L);
1502
1503 fprintf(out, PREF "ac_uint4 _uccomp_data[] = {");
1504
1505 /*
1506 * Now, if comps exist, write them out.
1507 */
1508 if (comps_used > 0) {
1509 for (i=0; i<comps_used; i++) {
1510 if (i) fprintf(out, ",");
1511 fprintf(out, "\n\t0x%08lx, 0x%08lx, 0x%08lx, 0x%08lx",
1512 (unsigned long) comps[i].comp, (unsigned long) comps[i].count,
1513 (unsigned long) comps[i].code1, (unsigned long) comps[i].code2);
1514 }
1515 } else {
1516 fprintf(out, "\t0");
1517 }
1518 fprintf(out, "\n};\n\n");
1519 #else
1520 /*
1521 * Open the comp.dat file.
1522 */
1523 snprintf(path, sizeof path, "%s" LDAP_DIRSEP "comp.dat", opath);
1524 if ((out = fopen(path, "wb")) == 0)
1525 return;
1526
1527 /*
1528 * Write the header.
1529 */
1530 hdr[1] = (ac_uint2) comps_used * 4;
1531 fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
1532
1533 /*
1534 * Write out the byte count to maintain header size.
1535 */
1536 bytes = comps_used * sizeof(_comp_t);
1537 fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
1538
1539 /*
1540 * Now, if comps exist, write them out.
1541 */
1542 if (comps_used > 0)
1543 fwrite((char *) comps, sizeof(_comp_t), comps_used, out);
1544
1545 fclose(out);
1546 #endif
1547
1548 /*****************************************************************
1549 *
1550 * Generate the decomposition data.
1551 *
1552 *****************************************************************/
1553
1554 /*
1555 * Fully expand all decompositions before generating the output file.
1556 */
1557 expand_decomp();
1558
1559 #if HARDCODE_DATA
1560 fprintf(out, PREF "ac_uint4 _ucdcmp_size = %ld;\n\n",
1561 decomps_used * 2L);
1562
1563 fprintf(out, PREF "ac_uint4 _ucdcmp_nodes[] = {");
1564
1565 if (decomps_used) {
1566 /*
1567 * Write the list of decomp nodes.
1568 */
1569 for (i = idx = 0; i < decomps_used; i++) {
1570 fprintf(out, "\n\t0x%08lx, 0x%08lx,",
1571 (unsigned long) decomps[i].code, (unsigned long) idx);
1572 idx += decomps[i].used;
1573 }
1574
1575 /*
1576 * Write the sentinel index as the last decomp node.
1577 */
1578 fprintf(out, "\n\t0x%08lx\n};\n\n", (unsigned long) idx);
1579
1580 fprintf(out, PREF "ac_uint4 _ucdcmp_decomp[] = {");
1581 /*
1582 * Write the decompositions themselves.
1583 */
1584 k = 0;
1585 for (i = 0; i < decomps_used; i++)
1586 for (j=0; j<decomps[i].used; j++) {
1587 if (k) fprintf(out, ",");
1588 if (!(k&3)) fprintf(out,"\n\t");
1589 else fprintf(out, " ");
1590 k++;
1591 fprintf(out, "0x%08lx", (unsigned long) decomps[i].decomp[j]);
1592 }
1593 fprintf(out, "\n};\n\n");
1594 }
1595 #else
1596 /*
1597 * Open the decomp.dat file.
1598 */
1599 snprintf(path, sizeof path, "%s" LDAP_DIRSEP "decomp.dat", opath);
1600 if ((out = fopen(path, "wb")) == 0)
1601 return;
1602
1603 hdr[1] = decomps_used;
1604
1605 /*
1606 * Write the header.
1607 */
1608 fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
1609
1610 /*
1611 * Write a temporary byte count which will be calculated as the
1612 * decompositions are written out.
1613 */
1614 bytes = 0;
1615 fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
1616
1617 if (decomps_used) {
1618 /*
1619 * Write the list of decomp nodes.
1620 */
1621 for (i = idx = 0; i < decomps_used; i++) {
1622 fwrite((char *) &decomps[i].code, sizeof(ac_uint4), 1, out);
1623 fwrite((char *) &idx, sizeof(ac_uint4), 1, out);
1624 idx += decomps[i].used;
1625 }
1626
1627 /*
1628 * Write the sentinel index as the last decomp node.
1629 */
1630 fwrite((char *) &idx, sizeof(ac_uint4), 1, out);
1631
1632 /*
1633 * Write the decompositions themselves.
1634 */
1635 for (i = 0; i < decomps_used; i++)
1636 fwrite((char *) decomps[i].decomp, sizeof(ac_uint4),
1637 decomps[i].used, out);
1638
1639 /*
1640 * Seek back to the beginning and write the byte count.
1641 */
1642 bytes = (sizeof(ac_uint4) * idx) +
1643 (sizeof(ac_uint4) * ((hdr[1] << 1) + 1));
1644 fseek(out, sizeof(ac_uint2) << 1, 0L);
1645 fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
1646
1647 fclose(out);
1648 }
1649 #endif
1650
1651 #ifdef HARDCODE_DATA
1652 fprintf(out, PREF "ac_uint4 _uckdcmp_size = %ld;\n\n",
1653 kdecomps_used * 2L);
1654
1655 fprintf(out, PREF "ac_uint4 _uckdcmp_nodes[] = {");
1656
1657 if (kdecomps_used) {
1658 /*
1659 * Write the list of kdecomp nodes.
1660 */
1661 for (i = idx = 0; i < kdecomps_used; i++) {
1662 fprintf(out, "\n\t0x%08lx, 0x%08lx,",
1663 (unsigned long) kdecomps[i].code, (unsigned long) idx);
1664 idx += kdecomps[i].used;
1665 }
1666
1667 /*
1668 * Write the sentinel index as the last decomp node.
1669 */
1670 fprintf(out, "\n\t0x%08lx\n};\n\n", (unsigned long) idx);
1671
1672 fprintf(out, PREF "ac_uint4 _uckdcmp_decomp[] = {");
1673
1674 /*
1675 * Write the decompositions themselves.
1676 */
1677 k = 0;
1678 for (i = 0; i < kdecomps_used; i++)
1679 for (j=0; j<kdecomps[i].used; j++) {
1680 if (k) fprintf(out, ",");
1681 if (!(k&3)) fprintf(out,"\n\t");
1682 else fprintf(out, " ");
1683 k++;
1684 fprintf(out, "0x%08lx", (unsigned long) kdecomps[i].decomp[j]);
1685 }
1686 fprintf(out, "\n};\n\n");
1687 }
1688 #else
1689 /*
1690 * Open the kdecomp.dat file.
1691 */
1692 snprintf(path, sizeof path, "%s" LDAP_DIRSEP "kdecomp.dat", opath);
1693 if ((out = fopen(path, "wb")) == 0)
1694 return;
1695
1696 hdr[1] = kdecomps_used;
1697
1698 /*
1699 * Write the header.
1700 */
1701 fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
1702
1703 /*
1704 * Write a temporary byte count which will be calculated as the
1705 * decompositions are written out.
1706 */
1707 bytes = 0;
1708 fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
1709
1710 if (kdecomps_used) {
1711 /*
1712 * Write the list of kdecomp nodes.
1713 */
1714 for (i = idx = 0; i < kdecomps_used; i++) {
1715 fwrite((char *) &kdecomps[i].code, sizeof(ac_uint4), 1, out);
1716 fwrite((char *) &idx, sizeof(ac_uint4), 1, out);
1717 idx += kdecomps[i].used;
1718 }
1719
1720 /*
1721 * Write the sentinel index as the last decomp node.
1722 */
1723 fwrite((char *) &idx, sizeof(ac_uint4), 1, out);
1724
1725 /*
1726 * Write the decompositions themselves.
1727 */
1728 for (i = 0; i < kdecomps_used; i++)
1729 fwrite((char *) kdecomps[i].decomp, sizeof(ac_uint4),
1730 kdecomps[i].used, out);
1731
1732 /*
1733 * Seek back to the beginning and write the byte count.
1734 */
1735 bytes = (sizeof(ac_uint4) * idx) +
1736 (sizeof(ac_uint4) * ((hdr[1] << 1) + 1));
1737 fseek(out, sizeof(ac_uint2) << 1, 0L);
1738 fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
1739
1740 fclose(out);
1741 }
1742 #endif
1743
1744 /*****************************************************************
1745 *
1746 * Generate the combining class data.
1747 *
1748 *****************************************************************/
1749 #ifdef HARDCODE_DATA
1750 fprintf(out, PREF "ac_uint4 _uccmcl_size = %ld;\n\n", (long) ccl_used);
1751
1752 fprintf(out, PREF "ac_uint4 _uccmcl_nodes[] = {");
1753
1754 if (ccl_used > 0) {
1755 /*
1756 * Write the combining class ranges out.
1757 */
1758 for (i = 0; i<ccl_used; i++) {
1759 if (i) fprintf(out, ",");
1760 if (!(i&3)) fprintf(out, "\n\t");
1761 else fprintf(out, " ");
1762 fprintf(out, "0x%08lx", (unsigned long) ccl[i]);
1763 }
1764 } else {
1765 fprintf(out, "\t0");
1766 }
1767 fprintf(out, "\n};\n\n");
1768 #else
1769 /*
1770 * Open the cmbcl.dat file.
1771 */
1772 snprintf(path, sizeof path, "%s" LDAP_DIRSEP "cmbcl.dat", opath);
1773 if ((out = fopen(path, "wb")) == 0)
1774 return;
1775
1776 /*
1777 * Set the number of ranges used. Each range has a combining class which
1778 * means each entry is a 3-tuple.
1779 */
1780 hdr[1] = ccl_used / 3;
1781
1782 /*
1783 * Write the header.
1784 */
1785 fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
1786
1787 /*
1788 * Write out the byte count to maintain header size.
1789 */
1790 bytes = ccl_used * sizeof(ac_uint4);
1791 fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
1792
1793 if (ccl_used > 0)
1794 /*
1795 * Write the combining class ranges out.
1796 */
1797 fwrite((char *) ccl, sizeof(ac_uint4), ccl_used, out);
1798
1799 fclose(out);
1800 #endif
1801
1802 /*****************************************************************
1803 *
1804 * Generate the number data.
1805 *
1806 *****************************************************************/
1807
1808 #if HARDCODE_DATA
1809 fprintf(out, PREF "ac_uint4 _ucnum_size = %lu;\n\n",
1810 (unsigned long)ncodes_used<<1);
1811
1812 fprintf(out, PREF "ac_uint4 _ucnum_nodes[] = {");
1813
1814 /*
1815 * Now, if number mappings exist, write them out.
1816 */
1817 if (ncodes_used > 0) {
1818 for (i = 0; i<ncodes_used; i++) {
1819 if (i) fprintf(out, ",");
1820 if (!(i&1)) fprintf(out, "\n\t");
1821 else fprintf(out, " ");
1822 fprintf(out, "0x%08lx, 0x%08lx",
1823 (unsigned long) ncodes[i].code, (unsigned long) ncodes[i].idx);
1824 }
1825 fprintf(out, "\n};\n\n");
1826
1827 fprintf(out, PREF "short _ucnum_vals[] = {");
1828 for (i = 0; i<nums_used; i++) {
1829 if (i) fprintf(out, ",");
1830 if (!(i&3)) fprintf(out, "\n\t");
1831 else fprintf(out, " ");
1832 if (nums[i].numerator < 0) {
1833 fprintf(out, "%6d, 0x%04x",
1834 nums[i].numerator, nums[i].denominator);
1835 } else {
1836 fprintf(out, "0x%04x, 0x%04x",
1837 nums[i].numerator, nums[i].denominator);
1838 }
1839 }
1840 fprintf(out, "\n};\n\n");
1841 }
1842 #else
1843 /*
1844 * Open the num.dat file.
1845 */
1846 snprintf(path, sizeof path, "%s" LDAP_DIRSEP "num.dat", opath);
1847 if ((out = fopen(path, "wb")) == 0)
1848 return;
1849
1850 /*
1851 * The count part of the header will be the total number of codes that
1852 * have numbers.
1853 */
1854 hdr[1] = (ac_uint2) (ncodes_used << 1);
1855 bytes = (ncodes_used * sizeof(_codeidx_t)) + (nums_used * sizeof(_num_t));
1856
1857 /*
1858 * Write the header.
1859 */
1860 fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
1861
1862 /*
1863 * Write out the byte count to maintain header size.
1864 */
1865 fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
1866
1867 /*
1868 * Now, if number mappings exist, write them out.
1869 */
1870 if (ncodes_used > 0) {
1871 fwrite((char *) ncodes, sizeof(_codeidx_t), ncodes_used, out);
1872 fwrite((char *) nums, sizeof(_num_t), nums_used, out);
1873 }
1874 #endif
1875
1876 fclose(out);
1877 }
1878
/*
 * Print the command line synopsis and the option descriptions for the
 * program named `prog` to stderr, then terminate with exit status 1.
 * This function does not return.
 */
static void
usage(char *prog)
{
    /* Synopsis line, followed by an empty line. */
    fprintf(stderr,
            "Usage: %s [-o output-directory|-x composition-exclusions]"
            " datafile1 datafile2 ...\n\n",
            prog);

    /* One entry per supported option. */
    fputs("-o output-directory\n\t\tWrite the output files to a different"
          " directory (default: .).\n",
          stderr);
    fputs("-x composition-exclusion\n\t\tFile of composition codes"
          " that should be excluded.\n",
          stderr);

    exit(1);
}
1893
/*
 * Program entry point.
 *
 * Arguments are processed left to right:
 *   -o DIR   use DIR as the output directory (default ".")
 *   -x FILE  read composition exclusions from FILE via read_compexdata()
 *   other    treated as a data file and read via read_cdata()
 *
 * An unknown option, or -o/-x given as the last argument without its
 * operand, prints the usage text and exits.  A file that cannot be opened
 * is reported on stderr and skipped.  After all arguments have been
 * handled the tables are written with write_cdata().
 *
 * Always returns 0 when it returns.
 */
int
main(int argc, char *argv[])
{
    FILE *in;
    char *prog, *opath;

    prog = lutil_progname( "ucgendat", argc, argv );

    opath = 0;
    in = stdin;

    argc--;
    argv++;

    while (argc > 0) {
        if (argv[0][0] == '-') {
            switch (argv[0][1]) {
              case 'o':
                /* The option needs an operand after it. */
                if (argc < 2)
                    usage(prog);
                argc--;
                argv++;
                opath = argv[0];
                break;
              case 'x':
                /*
                 * Without this check a trailing "-x" would hand the
                 * terminating null pointer of argv to fopen().
                 */
                if (argc < 2)
                    usage(prog);
                argc--;
                argv++;
                if ((in = fopen(argv[0], "r")) == 0)
                    fprintf(stderr,
                            "%s: unable to open composition exclusion file %s\n",
                            prog, argv[0]);
                else {
                    read_compexdata(in);
                    fclose(in);
                    in = 0;
                }
                break;
              default:
                /* usage() does not return. */
                usage(prog);
            }
        } else {
            /* Release a still-open input stream before opening the next. */
            if (in != stdin && in != NULL)
                fclose(in);
            if ((in = fopen(argv[0], "r")) == 0)
                fprintf(stderr, "%s: unable to open ctype file %s\n",
                        prog, argv[0]);
            else {
                read_cdata(in);
                fclose(in);
                in = 0;
            }
        }
        argc--;
        argv++;
    }

    if (opath == 0)
        opath = ".";
    write_cdata(opath);

    return 0;
}
1954