/* Association between Unicode characters and their names.
2 Copyright (C) 2000-2002, 2005-2006 Free Software Foundation, Inc.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
7 any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software Foundation,
16 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
17
18 #ifdef HAVE_CONFIG_H
19 # include <config.h>
20 #endif
21
22 /* Specification. */
23 #include "uniname.h"
24
25 #include <assert.h>
26 #include <stdbool.h>
27 #include <stdio.h>
28 #include <string.h>
29
/* Number of elements of the array A.  A must be a true array, not a pointer.
   The argument is parenthesized so that expressions such as *p or a + 0
   cannot be re-associated by the subscript operator.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
31
32
33 /* Table of Unicode character names, derived from UnicodeData.txt. */
34 #define uint16_t unsigned short
35 #define uint32_t unsigned int
36 #include "uninames.h"
37 /* It contains:
38 static const char unicode_name_words[34594] = ...;
39 #define UNICODE_CHARNAME_NUM_WORDS 5906
40 static const struct { uint16_t extra_offset; uint16_t ind_offset; } unicode_name_by_length[26] = ...;
41 #define UNICODE_CHARNAME_WORD_HANGUL 3624
42 #define UNICODE_CHARNAME_WORD_SYLLABLE 4654
43 #define UNICODE_CHARNAME_WORD_CJK 401
44 #define UNICODE_CHARNAME_WORD_COMPATIBILITY 5755
45 static const uint16_t unicode_names[62620] = ...;
46 static const struct { uint16_t code; uint16_t name; } unicode_name_to_code[15257] = ...;
47 static const struct { uint16_t code; uint16_t name; } unicode_code_to_name[15257] = ...;
48 #define UNICODE_CHARNAME_MAX_LENGTH 83
49 #define UNICODE_CHARNAME_MAX_WORDS 13
50 */
51
52 /* Returns the word with a given index. */
53 static const char *
54 unicode_name_word (unsigned int index, unsigned int *lengthp)
55 {
56 unsigned int i1;
57 unsigned int i2;
58 unsigned int i;
59
60 assert (index < UNICODE_CHARNAME_NUM_WORDS);
61
62 /* Binary search for i with
63 unicode_name_by_length[i].ind_offset <= index
64 and
65 index < unicode_name_by_length[i+1].ind_offset
66 */
67
68 i1 = 0;
69 i2 = SIZEOF (unicode_name_by_length) - 1;
70 while (i2 - i1 > 1)
71 {
72 unsigned int i = (i1 + i2) >> 1;
73 if (unicode_name_by_length[i].ind_offset <= index)
74 i1 = i;
75 else
76 i2 = i;
77 }
78 i = i1;
79 assert (unicode_name_by_length[i].ind_offset <= index
80 && index < unicode_name_by_length[i+1].ind_offset);
81 *lengthp = i;
82 return &unicode_name_words[unicode_name_by_length[i].extra_offset
83 + (index-unicode_name_by_length[i].ind_offset)*i];
84 }
85
86 /* Looks up the index of a word. */
87 static int
88 unicode_name_word_lookup (const char *word, unsigned int length)
89 {
90 if (length > 0 && length < SIZEOF (unicode_name_by_length) - 1)
91 {
92 /* Binary search among the words of given length. */
93 unsigned int extra_offset = unicode_name_by_length[length].extra_offset;
94 unsigned int i0 = unicode_name_by_length[length].ind_offset;
95 unsigned int i1 = i0;
96 unsigned int i2 = unicode_name_by_length[length+1].ind_offset;
97 while (i2 - i1 > 0)
98 {
99 unsigned int i = (i1 + i2) >> 1;
100 const char *p = &unicode_name_words[extra_offset + (i-i0)*length];
101 const char *w = word;
102 unsigned int n = length;
103 for (;;)
104 {
105 if (*p < *w)
106 {
107 if (i1 == i)
108 return -1;
109 /* Note here: i1 < i < i2. */
110 i1 = i;
111 break;
112 }
113 if (*p > *w)
114 {
115 /* Note here: i1 <= i < i2. */
116 i2 = i;
117 break;
118 }
119 p++; w++; n--;
120 if (n == 0)
121 return i;
122 }
123 }
124 }
125 return -1;
126 }
127
/* Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
   sections 3.11 and 4.4.
   Each table maps a jamo index to the short name that is concatenated to
   form the name of a precomposed Hangul syllable:
     jamo_initial_short_name - the 19 leading consonants,
     jamo_medial_short_name  - the 21 vowels,
     jamo_final_short_name   - the 27 trailing consonants, preceded by the
                               empty string for "no trailing consonant".
   The empty string at index 11 of the initial table is the silent leading
   consonant.  */
static const char jamo_initial_short_name[19][3] =
{
  "G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S", "SS", "", "J", "JJ",
  "C", "K", "T", "P", "H"
};
static const char jamo_medial_short_name[21][4] =
{
  "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA", "WAE", "OE", "YO",
  "U", "WEO", "WE", "WI", "YU", "EU", "YI", "I"
};
static const char jamo_final_short_name[28][3] =
{
  /* Index 5 is U+11AC (NIEUN-CIEUC), whose short name is "NJ", not "NI".  */
  "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM", "LB", "LS", "LT",
  "LP", "LH", "M", "B", "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H"
};
145
146 /* Looks up the name of a Unicode character, in uppercase ASCII.
147 Returns the filled buf, or NULL if the character does not have a name. */
148 char *
149 unicode_character_name (unsigned int c, char *buf)
150 {
151 if (c >= 0xAC00 && c <= 0xD7A3)
152 {
153 /* Special case for Hangul syllables. Keeps the tables small. */
154 char *ptr;
155 unsigned int tmp;
156 unsigned int index1;
157 unsigned int index2;
158 unsigned int index3;
159 const char *q;
160
161 /* buf needs to have at least 16 + 7 bytes here. */
162 memcpy (buf, "HANGUL SYLLABLE ", 16);
163 ptr = buf + 16;
164
165 tmp = c - 0xAC00;
166 index3 = tmp % 28; tmp = tmp / 28;
167 index2 = tmp % 21; tmp = tmp / 21;
168 index1 = tmp;
169
170 q = jamo_initial_short_name[index1];
171 while (*q != '\0')
172 *ptr++ = *q++;
173 q = jamo_medial_short_name[index2];
174 while (*q != '\0')
175 *ptr++ = *q++;
176 q = jamo_final_short_name[index3];
177 while (*q != '\0')
178 *ptr++ = *q++;
179 *ptr = '\0';
180 return buf;
181 }
182 else if ((c >= 0xF900 && c <= 0xFA2D) || (c >= 0xFA30 && c <= 0xFA6A)
183 || (c >= 0xFA70 && c <= 0xFAD9) || (c >= 0x2F800 && c <= 0x2FA1D))
184 {
185 /* Special case for CJK compatibility ideographs. Keeps the tables
186 small. */
187 char *ptr;
188 int i;
189
190 /* buf needs to have at least 28 + 5 bytes here. */
191 memcpy (buf, "CJK COMPATIBILITY IDEOGRAPH-", 28);
192 ptr = buf + 28;
193
194 for (i = (c < 0x10000 ? 12 : 16); i >= 0; i -= 4)
195 {
196 unsigned int x = (c >> i) & 0xf;
197 *ptr++ = (x < 10 ? '0' : 'A' - 10) + x;
198 }
199 *ptr = '\0';
200 return buf;
201 }
202 else
203 {
204 const uint16_t *words;
205
206 /* Transform the code so that it fits in 16 bits. */
207 switch (c >> 12)
208 {
209 case 0x00: case 0x01: case 0x02: case 0x03: case 0x04:
210 break;
211 case 0x0A:
212 c -= 0x05000;
213 break;
214 case 0x0F:
215 c -= 0x09000;
216 break;
217 case 0x10:
218 c -= 0x09000;
219 break;
220 case 0x1D:
221 c -= 0x15000;
222 break;
223 case 0x2F:
224 c -= 0x26000;
225 break;
226 case 0xE0:
227 c -= 0xD6000;
228 break;
229 default:
230 return NULL;
231 }
232
233 {
234 /* Binary search in unicode_code_to_name. */
235 unsigned int i1 = 0;
236 unsigned int i2 = SIZEOF (unicode_code_to_name);
237 for (;;)
238 {
239 unsigned int i = (i1 + i2) >> 1;
240 if (unicode_code_to_name[i].code == c)
241 {
242 words = &unicode_names[unicode_code_to_name[i].name];
243 break;
244 }
245 else if (unicode_code_to_name[i].code < c)
246 {
247 if (i1 == i)
248 {
249 words = NULL;
250 break;
251 }
252 /* Note here: i1 < i < i2. */
253 i1 = i;
254 }
255 else if (unicode_code_to_name[i].code > c)
256 {
257 if (i2 == i)
258 {
259 words = NULL;
260 break;
261 }
262 /* Note here: i1 <= i < i2. */
263 i2 = i;
264 }
265 }
266 }
267 if (words != NULL)
268 {
269 /* Found it in unicode_code_to_name. Now concatenate the words. */
270 /* buf needs to have at least UNICODE_CHARNAME_MAX_LENGTH bytes. */
271 char *ptr = buf;
272 for (;;)
273 {
274 unsigned int wordlen;
275 const char *word = unicode_name_word (*words>>1, &wordlen);
276 do
277 *ptr++ = *word++;
278 while (--wordlen > 0);
279 if ((*words & 1) == 0)
280 break;
281 *ptr++ = ' ';
282 words++;
283 }
284 *ptr = '\0';
285 return buf;
286 }
287 return NULL;
288 }
289 }
290
291 /* Looks up the Unicode character with a given name, in upper- or lowercase
292 ASCII. Returns the character if found, or UNINAME_INVALID if not found. */
293 unsigned int
294 unicode_name_character (const char *name)
295 {
296 unsigned int len = strlen (name);
297 if (len > 1 && len <= UNICODE_CHARNAME_MAX_LENGTH)
298 {
299 /* Test for "word1 word2 ..." syntax. */
300 char buf[UNICODE_CHARNAME_MAX_LENGTH];
301 char *ptr = buf;
302 for (;;)
303 {
304 char c = *name++;
305 if (!(c >= ' ' && c <= '~'))
306 break;
307 *ptr++ = (c >= 'a' && c <= 'z' ? c - 'a' + 'A' : c);
308 if (--len == 0)
309 goto filled_buf;
310 }
311 if (false)
312 filled_buf:
313 {
314 /* Convert the constituents to uint16_t words. */
315 uint16_t words[UNICODE_CHARNAME_MAX_WORDS];
316 uint16_t *wordptr = words;
317 {
318 const char *p1 = buf;
319 for (;;)
320 {
321 {
322 int word;
323 const char *p2 = p1;
324 while (p2 < ptr && *p2 != ' ')
325 p2++;
326 word = unicode_name_word_lookup (p1, p2 - p1);
327 if (word < 0)
328 break;
329 if (wordptr == &words[UNICODE_CHARNAME_MAX_WORDS])
330 break;
331 *wordptr++ = word;
332 if (p2 == ptr)
333 goto filled_words;
334 p1 = p2 + 1;
335 }
336 /* Special case for Hangul syllables. Keeps the tables small. */
337 if (wordptr == &words[2]
338 && words[0] == UNICODE_CHARNAME_WORD_HANGUL
339 && words[1] == UNICODE_CHARNAME_WORD_SYLLABLE)
340 {
341 /* Split the last word [p1..ptr) into three parts:
342 1) [BCDGHJKMNPRST]
343 2) [AEIOUWY]
344 3) [BCDGHIJKLMNPST]
345 */
346 const char *p2;
347 const char *p3;
348 const char *p4;
349
350 p2 = p1;
351 while (p2 < ptr
352 && (*p2 == 'B' || *p2 == 'C' || *p2 == 'D'
353 || *p2 == 'G' || *p2 == 'H' || *p2 == 'J'
354 || *p2 == 'K' || *p2 == 'M' || *p2 == 'N'
355 || *p2 == 'P' || *p2 == 'R' || *p2 == 'S'
356 || *p2 == 'T'))
357 p2++;
358 p3 = p2;
359 while (p3 < ptr
360 && (*p3 == 'A' || *p3 == 'E' || *p3 == 'I'
361 || *p3 == 'O' || *p3 == 'U' || *p3 == 'W'
362 || *p3 == 'Y'))
363 p3++;
364 p4 = p3;
365 while (p4 < ptr
366 && (*p4 == 'B' || *p4 == 'C' || *p4 == 'D'
367 || *p4 == 'G' || *p4 == 'H' || *p4 == 'I'
368 || *p4 == 'J' || *p4 == 'K' || *p4 == 'L'
369 || *p4 == 'M' || *p4 == 'N' || *p4 == 'P'
370 || *p4 == 'S' || *p4 == 'T'))
371 p4++;
372 if (p4 == ptr)
373 {
374 unsigned int n1 = p2 - p1;
375 unsigned int n2 = p3 - p2;
376 unsigned int n3 = p4 - p3;
377
378 if (n1 <= 2 && (n2 >= 1 && n2 <= 3) && n3 <= 2)
379 {
380 unsigned int index1;
381
382 for (index1 = 0; index1 < 19; index1++)
383 if (memcmp(jamo_initial_short_name[index1], p1, n1) == 0
384 && jamo_initial_short_name[index1][n1] == '\0')
385 {
386 unsigned int index2;
387
388 for (index2 = 0; index2 < 21; index2++)
389 if (memcmp(jamo_medial_short_name[index2], p2, n2) == 0
390 && jamo_medial_short_name[index2][n2] == '\0')
391 {
392 unsigned int index3;
393
394 for (index3 = 0; index3 < 28; index3++)
395 if (memcmp(jamo_final_short_name[index3], p3, n3) == 0
396 && jamo_final_short_name[index3][n3] == '\0')
397 {
398 return 0xAC00 + (index1 * 21 + index2) * 28 + index3;
399 }
400 break;
401 }
402 break;
403 }
404 }
405 }
406 }
407 /* Special case for CJK compatibility ideographs. Keeps the
408 tables small. */
409 if (wordptr == &words[2]
410 && words[0] == UNICODE_CHARNAME_WORD_CJK
411 && words[1] == UNICODE_CHARNAME_WORD_COMPATIBILITY
412 && p1 + 14 <= ptr
413 && p1 + 15 >= ptr
414 && memcmp (p1, "IDEOGRAPH-", 10) == 0)
415 {
416 const char *p2 = p1 + 10;
417
418 if (*p2 != '0')
419 {
420 unsigned int c = 0;
421
422 for (;;)
423 {
424 if (*p2 >= '0' && *p2 <= '9')
425 c += (*p2 - '0');
426 else if (*p2 >= 'A' && *p2 <= 'F')
427 c += (*p2 - 'A' + 10);
428 else
429 break;
430 p2++;
431 if (p2 == ptr)
432 {
433 if ((c >= 0xF900 && c <= 0xFA2D)
434 || (c >= 0xFA30 && c <= 0xFA6A)
435 || (c >= 0xFA70 && c <= 0xFAD9)
436 || (c >= 0x2F800 && c <= 0x2FA1D))
437 return c;
438 else
439 break;
440 }
441 c = c << 4;
442 }
443 }
444 }
445 }
446 }
447 if (false)
448 filled_words:
449 {
450 /* Multiply by 2, to simplify later comparisons. */
451 unsigned int words_length = wordptr - words;
452 {
453 int i = words_length - 1;
454 words[i] = 2 * words[i];
455 for (; --i >= 0; )
456 words[i] = 2 * words[i] + 1;
457 }
458 /* Binary search in unicode_name_to_code. */
459 {
460 unsigned int i1 = 0;
461 unsigned int i2 = SIZEOF (unicode_name_to_code);
462 for (;;)
463 {
464 unsigned int i = (i1 + i2) >> 1;
465 const uint16_t *w = words;
466 const uint16_t *p = &unicode_names[unicode_name_to_code[i].name];
467 unsigned int n = words_length;
468 for (;;)
469 {
470 if (*p < *w)
471 {
472 if (i1 == i)
473 goto name_not_found;
474 /* Note here: i1 < i < i2. */
475 i1 = i;
476 break;
477 }
478 else if (*p > *w)
479 {
480 if (i2 == i)
481 goto name_not_found;
482 /* Note here: i1 <= i < i2. */
483 i2 = i;
484 break;
485 }
486 p++; w++; n--;
487 if (n == 0)
488 {
489 unsigned int c = unicode_name_to_code[i].code;
490
491 /* Undo the transformation to 16-bit space. */
492 static const unsigned int offset[11] =
493 {
494 0x00000, 0x00000, 0x00000, 0x00000, 0x00000,
495 0x05000, 0x09000, 0x09000, 0x15000, 0x26000,
496 0xD6000
497 };
498 return c + offset[c >> 12];
499 }
500 }
501 }
502 }
503 name_not_found: ;
504 }
505 }
506 }
507 return UNINAME_INVALID;
508 }
509