uniname.c revision 1.1 1 1.1 christos /* Association between Unicode characters and their names.
2 1.1 christos Copyright (C) 2000-2002, 2005-2006 Free Software Foundation, Inc.
3 1.1 christos
4 1.1 christos This program is free software; you can redistribute it and/or modify
5 1.1 christos it under the terms of the GNU General Public License as published by
6 1.1 christos the Free Software Foundation; either version 2, or (at your option)
7 1.1 christos any later version.
8 1.1 christos
9 1.1 christos This program is distributed in the hope that it will be useful,
10 1.1 christos but WITHOUT ANY WARRANTY; without even the implied warranty of
11 1.1 christos MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 1.1 christos GNU General Public License for more details.
13 1.1 christos
14 1.1 christos You should have received a copy of the GNU General Public License
15 1.1 christos along with this program; if not, write to the Free Software Foundation,
16 1.1 christos Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
17 1.1 christos
18 1.1 christos #ifdef HAVE_CONFIG_H
19 1.1 christos # include <config.h>
20 1.1 christos #endif
21 1.1 christos
22 1.1 christos /* Specification. */
23 1.1 christos #include "uniname.h"
24 1.1 christos
25 1.1 christos #include <assert.h>
26 1.1 christos #include <stdbool.h>
27 1.1 christos #include <stdio.h>
28 1.1 christos #include <string.h>
29 1.1 christos
/* Number of elements of the array A.  A must be a true array, not a
   pointer.  The argument is parenthesized so that any array-valued
   expression can be passed safely.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
31 1.1 christos
32 1.1 christos
33 1.1 christos /* Table of Unicode character names, derived from UnicodeData.txt. */
34 1.1 christos #define uint16_t unsigned short
35 1.1 christos #define uint32_t unsigned int
36 1.1 christos #include "uninames.h"
37 1.1 christos /* It contains:
38 1.1 christos static const char unicode_name_words[34594] = ...;
39 1.1 christos #define UNICODE_CHARNAME_NUM_WORDS 5906
40 1.1 christos static const struct { uint16_t extra_offset; uint16_t ind_offset; } unicode_name_by_length[26] = ...;
41 1.1 christos #define UNICODE_CHARNAME_WORD_HANGUL 3624
42 1.1 christos #define UNICODE_CHARNAME_WORD_SYLLABLE 4654
43 1.1 christos #define UNICODE_CHARNAME_WORD_CJK 401
44 1.1 christos #define UNICODE_CHARNAME_WORD_COMPATIBILITY 5755
45 1.1 christos static const uint16_t unicode_names[62620] = ...;
46 1.1 christos static const struct { uint16_t code; uint16_t name; } unicode_name_to_code[15257] = ...;
47 1.1 christos static const struct { uint16_t code; uint16_t name; } unicode_code_to_name[15257] = ...;
48 1.1 christos #define UNICODE_CHARNAME_MAX_LENGTH 83
49 1.1 christos #define UNICODE_CHARNAME_MAX_WORDS 13
50 1.1 christos */
51 1.1 christos
52 1.1 christos /* Returns the word with a given index. */
53 1.1 christos static const char *
54 1.1 christos unicode_name_word (unsigned int index, unsigned int *lengthp)
55 1.1 christos {
56 1.1 christos unsigned int i1;
57 1.1 christos unsigned int i2;
58 1.1 christos unsigned int i;
59 1.1 christos
60 1.1 christos assert (index < UNICODE_CHARNAME_NUM_WORDS);
61 1.1 christos
62 1.1 christos /* Binary search for i with
63 1.1 christos unicode_name_by_length[i].ind_offset <= index
64 1.1 christos and
65 1.1 christos index < unicode_name_by_length[i+1].ind_offset
66 1.1 christos */
67 1.1 christos
68 1.1 christos i1 = 0;
69 1.1 christos i2 = SIZEOF (unicode_name_by_length) - 1;
70 1.1 christos while (i2 - i1 > 1)
71 1.1 christos {
72 1.1 christos unsigned int i = (i1 + i2) >> 1;
73 1.1 christos if (unicode_name_by_length[i].ind_offset <= index)
74 1.1 christos i1 = i;
75 1.1 christos else
76 1.1 christos i2 = i;
77 1.1 christos }
78 1.1 christos i = i1;
79 1.1 christos assert (unicode_name_by_length[i].ind_offset <= index
80 1.1 christos && index < unicode_name_by_length[i+1].ind_offset);
81 1.1 christos *lengthp = i;
82 1.1 christos return &unicode_name_words[unicode_name_by_length[i].extra_offset
83 1.1 christos + (index-unicode_name_by_length[i].ind_offset)*i];
84 1.1 christos }
85 1.1 christos
86 1.1 christos /* Looks up the index of a word. */
87 1.1 christos static int
88 1.1 christos unicode_name_word_lookup (const char *word, unsigned int length)
89 1.1 christos {
90 1.1 christos if (length > 0 && length < SIZEOF (unicode_name_by_length) - 1)
91 1.1 christos {
92 1.1 christos /* Binary search among the words of given length. */
93 1.1 christos unsigned int extra_offset = unicode_name_by_length[length].extra_offset;
94 1.1 christos unsigned int i0 = unicode_name_by_length[length].ind_offset;
95 1.1 christos unsigned int i1 = i0;
96 1.1 christos unsigned int i2 = unicode_name_by_length[length+1].ind_offset;
97 1.1 christos while (i2 - i1 > 0)
98 1.1 christos {
99 1.1 christos unsigned int i = (i1 + i2) >> 1;
100 1.1 christos const char *p = &unicode_name_words[extra_offset + (i-i0)*length];
101 1.1 christos const char *w = word;
102 1.1 christos unsigned int n = length;
103 1.1 christos for (;;)
104 1.1 christos {
105 1.1 christos if (*p < *w)
106 1.1 christos {
107 1.1 christos if (i1 == i)
108 1.1 christos return -1;
109 1.1 christos /* Note here: i1 < i < i2. */
110 1.1 christos i1 = i;
111 1.1 christos break;
112 1.1 christos }
113 1.1 christos if (*p > *w)
114 1.1 christos {
115 1.1 christos /* Note here: i1 <= i < i2. */
116 1.1 christos i2 = i;
117 1.1 christos break;
118 1.1 christos }
119 1.1 christos p++; w++; n--;
120 1.1 christos if (n == 0)
121 1.1 christos return i;
122 1.1 christos }
123 1.1 christos }
124 1.1 christos }
125 1.1 christos return -1;
126 1.1 christos }
127 1.1 christos
/* Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
   sections 3.11 and 4.4.  They hold the "short names" of the conjoining
   jamo, in the order of the leading consonants (19), vowels (21) and
   trailing consonants (28) used by the arithmetic decomposition of the
   syllables U+AC00..U+D7A3.  Each entry is NUL-terminated within its
   fixed-size slot.  */
static const char jamo_initial_short_name[19][3] =
{
  "G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S", "SS", "", "J", "JJ",
  "C", "K", "T", "P", "H"
};
static const char jamo_medial_short_name[21][4] =
{
  "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA", "WAE", "OE", "YO",
  "U", "WEO", "WE", "WI", "YU", "EU", "YI", "I"
};
/* Entry 0 is empty: a syllable without trailing consonant.  Entry 5 is
   "NJ" (U+11AC), as in U+AC05 HANGUL SYLLABLE GANJ.  */
static const char jamo_final_short_name[28][3] =
{
  "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM", "LB", "LS", "LT",
  "LP", "LH", "M", "B", "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H"
};
145 1.1 christos
146 1.1 christos /* Looks up the name of a Unicode character, in uppercase ASCII.
147 1.1 christos Returns the filled buf, or NULL if the character does not have a name. */
148 1.1 christos char *
149 1.1 christos unicode_character_name (unsigned int c, char *buf)
150 1.1 christos {
151 1.1 christos if (c >= 0xAC00 && c <= 0xD7A3)
152 1.1 christos {
153 1.1 christos /* Special case for Hangul syllables. Keeps the tables small. */
154 1.1 christos char *ptr;
155 1.1 christos unsigned int tmp;
156 1.1 christos unsigned int index1;
157 1.1 christos unsigned int index2;
158 1.1 christos unsigned int index3;
159 1.1 christos const char *q;
160 1.1 christos
161 1.1 christos /* buf needs to have at least 16 + 7 bytes here. */
162 1.1 christos memcpy (buf, "HANGUL SYLLABLE ", 16);
163 1.1 christos ptr = buf + 16;
164 1.1 christos
165 1.1 christos tmp = c - 0xAC00;
166 1.1 christos index3 = tmp % 28; tmp = tmp / 28;
167 1.1 christos index2 = tmp % 21; tmp = tmp / 21;
168 1.1 christos index1 = tmp;
169 1.1 christos
170 1.1 christos q = jamo_initial_short_name[index1];
171 1.1 christos while (*q != '\0')
172 1.1 christos *ptr++ = *q++;
173 1.1 christos q = jamo_medial_short_name[index2];
174 1.1 christos while (*q != '\0')
175 1.1 christos *ptr++ = *q++;
176 1.1 christos q = jamo_final_short_name[index3];
177 1.1 christos while (*q != '\0')
178 1.1 christos *ptr++ = *q++;
179 1.1 christos *ptr = '\0';
180 1.1 christos return buf;
181 1.1 christos }
182 1.1 christos else if ((c >= 0xF900 && c <= 0xFA2D) || (c >= 0xFA30 && c <= 0xFA6A)
183 1.1 christos || (c >= 0xFA70 && c <= 0xFAD9) || (c >= 0x2F800 && c <= 0x2FA1D))
184 1.1 christos {
185 1.1 christos /* Special case for CJK compatibility ideographs. Keeps the tables
186 1.1 christos small. */
187 1.1 christos char *ptr;
188 1.1 christos int i;
189 1.1 christos
190 1.1 christos /* buf needs to have at least 28 + 5 bytes here. */
191 1.1 christos memcpy (buf, "CJK COMPATIBILITY IDEOGRAPH-", 28);
192 1.1 christos ptr = buf + 28;
193 1.1 christos
194 1.1 christos for (i = (c < 0x10000 ? 12 : 16); i >= 0; i -= 4)
195 1.1 christos {
196 1.1 christos unsigned int x = (c >> i) & 0xf;
197 1.1 christos *ptr++ = (x < 10 ? '0' : 'A' - 10) + x;
198 1.1 christos }
199 1.1 christos *ptr = '\0';
200 1.1 christos return buf;
201 1.1 christos }
202 1.1 christos else
203 1.1 christos {
204 1.1 christos const uint16_t *words;
205 1.1 christos
206 1.1 christos /* Transform the code so that it fits in 16 bits. */
207 1.1 christos switch (c >> 12)
208 1.1 christos {
209 1.1 christos case 0x00: case 0x01: case 0x02: case 0x03: case 0x04:
210 1.1 christos break;
211 1.1 christos case 0x0A:
212 1.1 christos c -= 0x05000;
213 1.1 christos break;
214 1.1 christos case 0x0F:
215 1.1 christos c -= 0x09000;
216 1.1 christos break;
217 1.1 christos case 0x10:
218 1.1 christos c -= 0x09000;
219 1.1 christos break;
220 1.1 christos case 0x1D:
221 1.1 christos c -= 0x15000;
222 1.1 christos break;
223 1.1 christos case 0x2F:
224 1.1 christos c -= 0x26000;
225 1.1 christos break;
226 1.1 christos case 0xE0:
227 1.1 christos c -= 0xD6000;
228 1.1 christos break;
229 1.1 christos default:
230 1.1 christos return NULL;
231 1.1 christos }
232 1.1 christos
233 1.1 christos {
234 1.1 christos /* Binary search in unicode_code_to_name. */
235 1.1 christos unsigned int i1 = 0;
236 1.1 christos unsigned int i2 = SIZEOF (unicode_code_to_name);
237 1.1 christos for (;;)
238 1.1 christos {
239 1.1 christos unsigned int i = (i1 + i2) >> 1;
240 1.1 christos if (unicode_code_to_name[i].code == c)
241 1.1 christos {
242 1.1 christos words = &unicode_names[unicode_code_to_name[i].name];
243 1.1 christos break;
244 1.1 christos }
245 1.1 christos else if (unicode_code_to_name[i].code < c)
246 1.1 christos {
247 1.1 christos if (i1 == i)
248 1.1 christos {
249 1.1 christos words = NULL;
250 1.1 christos break;
251 1.1 christos }
252 1.1 christos /* Note here: i1 < i < i2. */
253 1.1 christos i1 = i;
254 1.1 christos }
255 1.1 christos else if (unicode_code_to_name[i].code > c)
256 1.1 christos {
257 1.1 christos if (i2 == i)
258 1.1 christos {
259 1.1 christos words = NULL;
260 1.1 christos break;
261 1.1 christos }
262 1.1 christos /* Note here: i1 <= i < i2. */
263 1.1 christos i2 = i;
264 1.1 christos }
265 1.1 christos }
266 1.1 christos }
267 1.1 christos if (words != NULL)
268 1.1 christos {
269 1.1 christos /* Found it in unicode_code_to_name. Now concatenate the words. */
270 1.1 christos /* buf needs to have at least UNICODE_CHARNAME_MAX_LENGTH bytes. */
271 1.1 christos char *ptr = buf;
272 1.1 christos for (;;)
273 1.1 christos {
274 1.1 christos unsigned int wordlen;
275 1.1 christos const char *word = unicode_name_word (*words>>1, &wordlen);
276 1.1 christos do
277 1.1 christos *ptr++ = *word++;
278 1.1 christos while (--wordlen > 0);
279 1.1 christos if ((*words & 1) == 0)
280 1.1 christos break;
281 1.1 christos *ptr++ = ' ';
282 1.1 christos words++;
283 1.1 christos }
284 1.1 christos *ptr = '\0';
285 1.1 christos return buf;
286 1.1 christos }
287 1.1 christos return NULL;
288 1.1 christos }
289 1.1 christos }
290 1.1 christos
291 1.1 christos /* Looks up the Unicode character with a given name, in upper- or lowercase
292 1.1 christos ASCII. Returns the character if found, or UNINAME_INVALID if not found. */
293 1.1 christos unsigned int
294 1.1 christos unicode_name_character (const char *name)
295 1.1 christos {
296 1.1 christos unsigned int len = strlen (name);
297 1.1 christos if (len > 1 && len <= UNICODE_CHARNAME_MAX_LENGTH)
298 1.1 christos {
299 1.1 christos /* Test for "word1 word2 ..." syntax. */
300 1.1 christos char buf[UNICODE_CHARNAME_MAX_LENGTH];
301 1.1 christos char *ptr = buf;
302 1.1 christos for (;;)
303 1.1 christos {
304 1.1 christos char c = *name++;
305 1.1 christos if (!(c >= ' ' && c <= '~'))
306 1.1 christos break;
307 1.1 christos *ptr++ = (c >= 'a' && c <= 'z' ? c - 'a' + 'A' : c);
308 1.1 christos if (--len == 0)
309 1.1 christos goto filled_buf;
310 1.1 christos }
311 1.1 christos if (false)
312 1.1 christos filled_buf:
313 1.1 christos {
314 1.1 christos /* Convert the constituents to uint16_t words. */
315 1.1 christos uint16_t words[UNICODE_CHARNAME_MAX_WORDS];
316 1.1 christos uint16_t *wordptr = words;
317 1.1 christos {
318 1.1 christos const char *p1 = buf;
319 1.1 christos for (;;)
320 1.1 christos {
321 1.1 christos {
322 1.1 christos int word;
323 1.1 christos const char *p2 = p1;
324 1.1 christos while (p2 < ptr && *p2 != ' ')
325 1.1 christos p2++;
326 1.1 christos word = unicode_name_word_lookup (p1, p2 - p1);
327 1.1 christos if (word < 0)
328 1.1 christos break;
329 1.1 christos if (wordptr == &words[UNICODE_CHARNAME_MAX_WORDS])
330 1.1 christos break;
331 1.1 christos *wordptr++ = word;
332 1.1 christos if (p2 == ptr)
333 1.1 christos goto filled_words;
334 1.1 christos p1 = p2 + 1;
335 1.1 christos }
336 1.1 christos /* Special case for Hangul syllables. Keeps the tables small. */
337 1.1 christos if (wordptr == &words[2]
338 1.1 christos && words[0] == UNICODE_CHARNAME_WORD_HANGUL
339 1.1 christos && words[1] == UNICODE_CHARNAME_WORD_SYLLABLE)
340 1.1 christos {
341 1.1 christos /* Split the last word [p1..ptr) into three parts:
342 1.1 christos 1) [BCDGHJKMNPRST]
343 1.1 christos 2) [AEIOUWY]
344 1.1 christos 3) [BCDGHIJKLMNPST]
345 1.1 christos */
346 1.1 christos const char *p2;
347 1.1 christos const char *p3;
348 1.1 christos const char *p4;
349 1.1 christos
350 1.1 christos p2 = p1;
351 1.1 christos while (p2 < ptr
352 1.1 christos && (*p2 == 'B' || *p2 == 'C' || *p2 == 'D'
353 1.1 christos || *p2 == 'G' || *p2 == 'H' || *p2 == 'J'
354 1.1 christos || *p2 == 'K' || *p2 == 'M' || *p2 == 'N'
355 1.1 christos || *p2 == 'P' || *p2 == 'R' || *p2 == 'S'
356 1.1 christos || *p2 == 'T'))
357 1.1 christos p2++;
358 1.1 christos p3 = p2;
359 1.1 christos while (p3 < ptr
360 1.1 christos && (*p3 == 'A' || *p3 == 'E' || *p3 == 'I'
361 1.1 christos || *p3 == 'O' || *p3 == 'U' || *p3 == 'W'
362 1.1 christos || *p3 == 'Y'))
363 1.1 christos p3++;
364 1.1 christos p4 = p3;
365 1.1 christos while (p4 < ptr
366 1.1 christos && (*p4 == 'B' || *p4 == 'C' || *p4 == 'D'
367 1.1 christos || *p4 == 'G' || *p4 == 'H' || *p4 == 'I'
368 1.1 christos || *p4 == 'J' || *p4 == 'K' || *p4 == 'L'
369 1.1 christos || *p4 == 'M' || *p4 == 'N' || *p4 == 'P'
370 1.1 christos || *p4 == 'S' || *p4 == 'T'))
371 1.1 christos p4++;
372 1.1 christos if (p4 == ptr)
373 1.1 christos {
374 1.1 christos unsigned int n1 = p2 - p1;
375 1.1 christos unsigned int n2 = p3 - p2;
376 1.1 christos unsigned int n3 = p4 - p3;
377 1.1 christos
378 1.1 christos if (n1 <= 2 && (n2 >= 1 && n2 <= 3) && n3 <= 2)
379 1.1 christos {
380 1.1 christos unsigned int index1;
381 1.1 christos
382 1.1 christos for (index1 = 0; index1 < 19; index1++)
383 1.1 christos if (memcmp(jamo_initial_short_name[index1], p1, n1) == 0
384 1.1 christos && jamo_initial_short_name[index1][n1] == '\0')
385 1.1 christos {
386 1.1 christos unsigned int index2;
387 1.1 christos
388 1.1 christos for (index2 = 0; index2 < 21; index2++)
389 1.1 christos if (memcmp(jamo_medial_short_name[index2], p2, n2) == 0
390 1.1 christos && jamo_medial_short_name[index2][n2] == '\0')
391 1.1 christos {
392 1.1 christos unsigned int index3;
393 1.1 christos
394 1.1 christos for (index3 = 0; index3 < 28; index3++)
395 1.1 christos if (memcmp(jamo_final_short_name[index3], p3, n3) == 0
396 1.1 christos && jamo_final_short_name[index3][n3] == '\0')
397 1.1 christos {
398 1.1 christos return 0xAC00 + (index1 * 21 + index2) * 28 + index3;
399 1.1 christos }
400 1.1 christos break;
401 1.1 christos }
402 1.1 christos break;
403 1.1 christos }
404 1.1 christos }
405 1.1 christos }
406 1.1 christos }
407 1.1 christos /* Special case for CJK compatibility ideographs. Keeps the
408 1.1 christos tables small. */
409 1.1 christos if (wordptr == &words[2]
410 1.1 christos && words[0] == UNICODE_CHARNAME_WORD_CJK
411 1.1 christos && words[1] == UNICODE_CHARNAME_WORD_COMPATIBILITY
412 1.1 christos && p1 + 14 <= ptr
413 1.1 christos && p1 + 15 >= ptr
414 1.1 christos && memcmp (p1, "IDEOGRAPH-", 10) == 0)
415 1.1 christos {
416 1.1 christos const char *p2 = p1 + 10;
417 1.1 christos
418 1.1 christos if (*p2 != '0')
419 1.1 christos {
420 1.1 christos unsigned int c = 0;
421 1.1 christos
422 1.1 christos for (;;)
423 1.1 christos {
424 1.1 christos if (*p2 >= '0' && *p2 <= '9')
425 1.1 christos c += (*p2 - '0');
426 1.1 christos else if (*p2 >= 'A' && *p2 <= 'F')
427 1.1 christos c += (*p2 - 'A' + 10);
428 1.1 christos else
429 1.1 christos break;
430 1.1 christos p2++;
431 1.1 christos if (p2 == ptr)
432 1.1 christos {
433 1.1 christos if ((c >= 0xF900 && c <= 0xFA2D)
434 1.1 christos || (c >= 0xFA30 && c <= 0xFA6A)
435 1.1 christos || (c >= 0xFA70 && c <= 0xFAD9)
436 1.1 christos || (c >= 0x2F800 && c <= 0x2FA1D))
437 1.1 christos return c;
438 1.1 christos else
439 1.1 christos break;
440 1.1 christos }
441 1.1 christos c = c << 4;
442 1.1 christos }
443 1.1 christos }
444 1.1 christos }
445 1.1 christos }
446 1.1 christos }
447 1.1 christos if (false)
448 1.1 christos filled_words:
449 1.1 christos {
450 1.1 christos /* Multiply by 2, to simplify later comparisons. */
451 1.1 christos unsigned int words_length = wordptr - words;
452 1.1 christos {
453 1.1 christos int i = words_length - 1;
454 1.1 christos words[i] = 2 * words[i];
455 1.1 christos for (; --i >= 0; )
456 1.1 christos words[i] = 2 * words[i] + 1;
457 1.1 christos }
458 1.1 christos /* Binary search in unicode_name_to_code. */
459 1.1 christos {
460 1.1 christos unsigned int i1 = 0;
461 1.1 christos unsigned int i2 = SIZEOF (unicode_name_to_code);
462 1.1 christos for (;;)
463 1.1 christos {
464 1.1 christos unsigned int i = (i1 + i2) >> 1;
465 1.1 christos const uint16_t *w = words;
466 1.1 christos const uint16_t *p = &unicode_names[unicode_name_to_code[i].name];
467 1.1 christos unsigned int n = words_length;
468 1.1 christos for (;;)
469 1.1 christos {
470 1.1 christos if (*p < *w)
471 1.1 christos {
472 1.1 christos if (i1 == i)
473 1.1 christos goto name_not_found;
474 1.1 christos /* Note here: i1 < i < i2. */
475 1.1 christos i1 = i;
476 1.1 christos break;
477 1.1 christos }
478 1.1 christos else if (*p > *w)
479 1.1 christos {
480 1.1 christos if (i2 == i)
481 1.1 christos goto name_not_found;
482 1.1 christos /* Note here: i1 <= i < i2. */
483 1.1 christos i2 = i;
484 1.1 christos break;
485 1.1 christos }
486 1.1 christos p++; w++; n--;
487 1.1 christos if (n == 0)
488 1.1 christos {
489 1.1 christos unsigned int c = unicode_name_to_code[i].code;
490 1.1 christos
491 1.1 christos /* Undo the transformation to 16-bit space. */
492 1.1 christos static const unsigned int offset[11] =
493 1.1 christos {
494 1.1 christos 0x00000, 0x00000, 0x00000, 0x00000, 0x00000,
495 1.1 christos 0x05000, 0x09000, 0x09000, 0x15000, 0x26000,
496 1.1 christos 0xD6000
497 1.1 christos };
498 1.1 christos return c + offset[c >> 12];
499 1.1 christos }
500 1.1 christos }
501 1.1 christos }
502 1.1 christos }
503 1.1 christos name_not_found: ;
504 1.1 christos }
505 1.1 christos }
506 1.1 christos }
507 1.1 christos return UNINAME_INVALID;
508 1.1 christos }
509