Home | History | Annotate | Line # | Download | only in libuniname
uniname.c revision 1.1
      1 /* Association between Unicode characters and their names.
      2    Copyright (C) 2000-2002, 2005-2006 Free Software Foundation, Inc.
      3 
      4    This program is free software; you can redistribute it and/or modify
      5    it under the terms of the GNU General Public License as published by
      6    the Free Software Foundation; either version 2, or (at your option)
      7    any later version.
      8 
      9    This program is distributed in the hope that it will be useful,
     10    but WITHOUT ANY WARRANTY; without even the implied warranty of
     11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     12    GNU General Public License for more details.
     13 
     14    You should have received a copy of the GNU General Public License
     15    along with this program; if not, write to the Free Software Foundation,
     16    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
     17 
     18 #ifdef HAVE_CONFIG_H
     19 # include <config.h>
     20 #endif
     21 
     22 /* Specification.  */
     23 #include "uniname.h"
     24 
     25 #include <assert.h>
     26 #include <stdbool.h>
     27 #include <stdio.h>
     28 #include <string.h>
     29 
/* Number of elements of the array A.  Only valid for true arrays, not for
   pointers.  The argument is parenthesized so that any array-valued
   expression can be passed safely.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
     31 
     32 
     33 /* Table of Unicode character names, derived from UnicodeData.txt.  */
     34 #define uint16_t unsigned short
     35 #define uint32_t unsigned int
     36 #include "uninames.h"
     37 /* It contains:
     38   static const char unicode_name_words[34594] = ...;
     39   #define UNICODE_CHARNAME_NUM_WORDS 5906
     40   static const struct { uint16_t extra_offset; uint16_t ind_offset; } unicode_name_by_length[26] = ...;
     41   #define UNICODE_CHARNAME_WORD_HANGUL 3624
     42   #define UNICODE_CHARNAME_WORD_SYLLABLE 4654
     43   #define UNICODE_CHARNAME_WORD_CJK 401
     44   #define UNICODE_CHARNAME_WORD_COMPATIBILITY 5755
     45   static const uint16_t unicode_names[62620] = ...;
     46   static const struct { uint16_t code; uint16_t name; } unicode_name_to_code[15257] = ...;
     47   static const struct { uint16_t code; uint16_t name; } unicode_code_to_name[15257] = ...;
     48   #define UNICODE_CHARNAME_MAX_LENGTH 83
     49   #define UNICODE_CHARNAME_MAX_WORDS 13
     50 */
     51 
     52 /* Returns the word with a given index.  */
     53 static const char *
     54 unicode_name_word (unsigned int index, unsigned int *lengthp)
     55 {
     56   unsigned int i1;
     57   unsigned int i2;
     58   unsigned int i;
     59 
     60   assert (index < UNICODE_CHARNAME_NUM_WORDS);
     61 
     62   /* Binary search for i with
     63        unicode_name_by_length[i].ind_offset <= index
     64      and
     65        index < unicode_name_by_length[i+1].ind_offset
     66    */
     67 
     68   i1 = 0;
     69   i2 = SIZEOF (unicode_name_by_length) - 1;
     70   while (i2 - i1 > 1)
     71     {
     72       unsigned int i = (i1 + i2) >> 1;
     73       if (unicode_name_by_length[i].ind_offset <= index)
     74 	i1 = i;
     75       else
     76 	i2 = i;
     77     }
     78   i = i1;
     79   assert (unicode_name_by_length[i].ind_offset <= index
     80 	  && index < unicode_name_by_length[i+1].ind_offset);
     81   *lengthp = i;
     82   return &unicode_name_words[unicode_name_by_length[i].extra_offset
     83 			     + (index-unicode_name_by_length[i].ind_offset)*i];
     84 }
     85 
     86 /* Looks up the index of a word.  */
     87 static int
     88 unicode_name_word_lookup (const char *word, unsigned int length)
     89 {
     90   if (length > 0 && length < SIZEOF (unicode_name_by_length) - 1)
     91     {
     92       /* Binary search among the words of given length.  */
     93       unsigned int extra_offset = unicode_name_by_length[length].extra_offset;
     94       unsigned int i0 = unicode_name_by_length[length].ind_offset;
     95       unsigned int i1 = i0;
     96       unsigned int i2 = unicode_name_by_length[length+1].ind_offset;
     97       while (i2 - i1 > 0)
     98 	{
     99 	  unsigned int i = (i1 + i2) >> 1;
    100 	  const char *p = &unicode_name_words[extra_offset + (i-i0)*length];
    101 	  const char *w = word;
    102 	  unsigned int n = length;
    103 	  for (;;)
    104 	    {
    105 	      if (*p < *w)
    106 		{
    107 		  if (i1 == i)
    108 		    return -1;
    109 		  /* Note here: i1 < i < i2.  */
    110 		  i1 = i;
    111 		  break;
    112 		}
    113 	      if (*p > *w)
    114 		{
    115 		  /* Note here: i1 <= i < i2.  */
    116 		  i2 = i;
    117 		  break;
    118 		}
    119 	      p++; w++; n--;
    120 	      if (n == 0)
    121 		return i;
    122 	    }
    123 	}
    124     }
    125   return -1;
    126 }
    127 
/* Auxiliary tables for Hangul syllable names: the short names of the
   conjoining jamo, as listed in the Unicode Character Database file Jamo.txt
   (see also the Unicode Standard, section "Hangul Syllable Name
   Generation").  A syllable's name is "HANGUL SYLLABLE " followed by the
   short names of its initial, medial and final jamo.  */

/* Short names of the 19 initial consonants (U+1100..U+1112).  Entry 11
   (IEUNG) has an empty short name.  */
static const char jamo_initial_short_name[19][3] =
{
  "G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S", "SS", "", "J", "JJ",
  "C", "K", "T", "P", "H"
};

/* Short names of the 21 medial vowels (U+1161..U+1175).  */
static const char jamo_medial_short_name[21][4] =
{
  "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA", "WAE", "OE", "YO",
  "U", "WEO", "WE", "WI", "YU", "EU", "YI", "I"
};

/* Short names of the 27 final consonants (U+11A8..U+11C2), preceded by the
   empty entry 0 that stands for "no final consonant".  Entry 5 is
   NIEUN-CIEUC (U+11AC), whose short name is "NJ".  */
static const char jamo_final_short_name[28][3] =
{
  "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM", "LB", "LS", "LT",
  "LP", "LH", "M", "B", "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H"
};
    145 
    146 /* Looks up the name of a Unicode character, in uppercase ASCII.
    147    Returns the filled buf, or NULL if the character does not have a name.  */
    148 char *
    149 unicode_character_name (unsigned int c, char *buf)
    150 {
    151   if (c >= 0xAC00 && c <= 0xD7A3)
    152     {
    153       /* Special case for Hangul syllables. Keeps the tables small.  */
    154       char *ptr;
    155       unsigned int tmp;
    156       unsigned int index1;
    157       unsigned int index2;
    158       unsigned int index3;
    159       const char *q;
    160 
    161       /* buf needs to have at least 16 + 7 bytes here.  */
    162       memcpy (buf, "HANGUL SYLLABLE ", 16);
    163       ptr = buf + 16;
    164 
    165       tmp = c - 0xAC00;
    166       index3 = tmp % 28; tmp = tmp / 28;
    167       index2 = tmp % 21; tmp = tmp / 21;
    168       index1 = tmp;
    169 
    170       q = jamo_initial_short_name[index1];
    171       while (*q != '\0')
    172 	*ptr++ = *q++;
    173       q = jamo_medial_short_name[index2];
    174       while (*q != '\0')
    175 	*ptr++ = *q++;
    176       q = jamo_final_short_name[index3];
    177       while (*q != '\0')
    178 	*ptr++ = *q++;
    179       *ptr = '\0';
    180       return buf;
    181     }
    182   else if ((c >= 0xF900 && c <= 0xFA2D) || (c >= 0xFA30 && c <= 0xFA6A)
    183 	   || (c >= 0xFA70 && c <= 0xFAD9) || (c >= 0x2F800 && c <= 0x2FA1D))
    184     {
    185       /* Special case for CJK compatibility ideographs. Keeps the tables
    186 	 small.  */
    187       char *ptr;
    188       int i;
    189 
    190       /* buf needs to have at least 28 + 5 bytes here.  */
    191       memcpy (buf, "CJK COMPATIBILITY IDEOGRAPH-", 28);
    192       ptr = buf + 28;
    193 
    194       for (i = (c < 0x10000 ? 12 : 16); i >= 0; i -= 4)
    195 	{
    196 	  unsigned int x = (c >> i) & 0xf;
    197 	  *ptr++ = (x < 10 ? '0' : 'A' - 10) + x;
    198 	}
    199       *ptr = '\0';
    200       return buf;
    201     }
    202   else
    203     {
    204       const uint16_t *words;
    205 
    206       /* Transform the code so that it fits in 16 bits.  */
    207       switch (c >> 12)
    208 	{
    209 	case 0x00: case 0x01: case 0x02: case 0x03: case 0x04:
    210 	  break;
    211 	case 0x0A:
    212 	  c -= 0x05000;
    213 	  break;
    214 	case 0x0F:
    215 	  c -= 0x09000;
    216 	  break;
    217 	case 0x10:
    218 	  c -= 0x09000;
    219 	  break;
    220 	case 0x1D:
    221 	  c -= 0x15000;
    222 	  break;
    223 	case 0x2F:
    224 	  c -= 0x26000;
    225 	  break;
    226 	case 0xE0:
    227 	  c -= 0xD6000;
    228 	  break;
    229 	default:
    230 	  return NULL;
    231 	}
    232 
    233       {
    234 	/* Binary search in unicode_code_to_name.  */
    235 	unsigned int i1 = 0;
    236 	unsigned int i2 = SIZEOF (unicode_code_to_name);
    237 	for (;;)
    238 	  {
    239 	    unsigned int i = (i1 + i2) >> 1;
    240 	    if (unicode_code_to_name[i].code == c)
    241 	      {
    242 		words = &unicode_names[unicode_code_to_name[i].name];
    243 		break;
    244 	      }
    245 	    else if (unicode_code_to_name[i].code < c)
    246 	      {
    247 		if (i1 == i)
    248 		  {
    249 		    words = NULL;
    250 		    break;
    251 		  }
    252 		/* Note here: i1 < i < i2.  */
    253 		i1 = i;
    254 	      }
    255 	    else if (unicode_code_to_name[i].code > c)
    256 	      {
    257 		if (i2 == i)
    258 		  {
    259 		    words = NULL;
    260 		    break;
    261 		  }
    262 		/* Note here: i1 <= i < i2.  */
    263 		i2 = i;
    264 	      }
    265 	  }
    266       }
    267       if (words != NULL)
    268 	{
    269 	  /* Found it in unicode_code_to_name. Now concatenate the words.  */
    270 	  /* buf needs to have at least UNICODE_CHARNAME_MAX_LENGTH bytes.  */
    271 	  char *ptr = buf;
    272 	  for (;;)
    273 	    {
    274 	      unsigned int wordlen;
    275 	      const char *word = unicode_name_word (*words>>1, &wordlen);
    276 	      do
    277 		*ptr++ = *word++;
    278 	      while (--wordlen > 0);
    279 	      if ((*words & 1) == 0)
    280 		break;
    281 	      *ptr++ = ' ';
    282 	      words++;
    283 	    }
    284 	  *ptr = '\0';
    285 	  return buf;
    286 	}
    287       return NULL;
    288     }
    289 }
    290 
    291 /* Looks up the Unicode character with a given name, in upper- or lowercase
    292    ASCII.  Returns the character if found, or UNINAME_INVALID if not found.  */
    293 unsigned int
    294 unicode_name_character (const char *name)
    295 {
    296   unsigned int len = strlen (name);
    297   if (len > 1 && len <= UNICODE_CHARNAME_MAX_LENGTH)
    298     {
    299       /* Test for "word1 word2 ..." syntax.  */
    300       char buf[UNICODE_CHARNAME_MAX_LENGTH];
    301       char *ptr = buf;
    302       for (;;)
    303 	{
    304 	  char c = *name++;
    305 	  if (!(c >= ' ' && c <= '~'))
    306 	    break;
    307 	  *ptr++ = (c >= 'a' && c <= 'z' ? c - 'a' + 'A' : c);
    308 	  if (--len == 0)
    309 	    goto filled_buf;
    310 	}
    311       if (false)
    312       filled_buf:
    313 	{
    314 	  /* Convert the constituents to uint16_t words.  */
    315 	  uint16_t words[UNICODE_CHARNAME_MAX_WORDS];
    316 	  uint16_t *wordptr = words;
    317 	  {
    318 	    const char *p1 = buf;
    319 	    for (;;)
    320 	      {
    321 		{
    322 		  int word;
    323 		  const char *p2 = p1;
    324 		  while (p2 < ptr && *p2 != ' ')
    325 		    p2++;
    326 		  word = unicode_name_word_lookup (p1, p2 - p1);
    327 		  if (word < 0)
    328 		    break;
    329 		  if (wordptr == &words[UNICODE_CHARNAME_MAX_WORDS])
    330 		    break;
    331 		  *wordptr++ = word;
    332 		  if (p2 == ptr)
    333 		    goto filled_words;
    334 		  p1 = p2 + 1;
    335 		}
    336 		/* Special case for Hangul syllables. Keeps the tables small. */
    337 		if (wordptr == &words[2]
    338 		    && words[0] == UNICODE_CHARNAME_WORD_HANGUL
    339 		    && words[1] == UNICODE_CHARNAME_WORD_SYLLABLE)
    340 		  {
    341 		    /* Split the last word [p1..ptr) into three parts:
    342 			 1) [BCDGHJKMNPRST]
    343 			 2) [AEIOUWY]
    344 			 3) [BCDGHIJKLMNPST]
    345 		     */
    346 		    const char *p2;
    347 		    const char *p3;
    348 		    const char *p4;
    349 
    350 		    p2 = p1;
    351 		    while (p2 < ptr
    352 			   && (*p2 == 'B' || *p2 == 'C' || *p2 == 'D'
    353 			       || *p2 == 'G' || *p2 == 'H' || *p2 == 'J'
    354 			       || *p2 == 'K' || *p2 == 'M' || *p2 == 'N'
    355 			       || *p2 == 'P' || *p2 == 'R' || *p2 == 'S'
    356 			       || *p2 == 'T'))
    357 		      p2++;
    358 		    p3 = p2;
    359 		    while (p3 < ptr
    360 			   && (*p3 == 'A' || *p3 == 'E' || *p3 == 'I'
    361 			       || *p3 == 'O' || *p3 == 'U' || *p3 == 'W'
    362 			       || *p3 == 'Y'))
    363 		      p3++;
    364 		    p4 = p3;
    365 		    while (p4 < ptr
    366 			   && (*p4 == 'B' || *p4 == 'C' || *p4 == 'D'
    367 			       || *p4 == 'G' || *p4 == 'H' || *p4 == 'I'
    368 			       || *p4 == 'J' || *p4 == 'K' || *p4 == 'L'
    369 			       || *p4 == 'M' || *p4 == 'N' || *p4 == 'P'
    370 			       || *p4 == 'S' || *p4 == 'T'))
    371 		      p4++;
    372 		    if (p4 == ptr)
    373 		      {
    374 			unsigned int n1 = p2 - p1;
    375 			unsigned int n2 = p3 - p2;
    376 			unsigned int n3 = p4 - p3;
    377 
    378 			if (n1 <= 2 && (n2 >= 1 && n2 <= 3) && n3 <= 2)
    379 			  {
    380 			    unsigned int index1;
    381 
    382 			    for (index1 = 0; index1 < 19; index1++)
    383 			      if (memcmp(jamo_initial_short_name[index1], p1, n1) == 0
    384 				  && jamo_initial_short_name[index1][n1] == '\0')
    385 				{
    386 				  unsigned int index2;
    387 
    388 				  for (index2 = 0; index2 < 21; index2++)
    389 				    if (memcmp(jamo_medial_short_name[index2], p2, n2) == 0
    390 					&& jamo_medial_short_name[index2][n2] == '\0')
    391 				      {
    392 					unsigned int index3;
    393 
    394 					for (index3 = 0; index3 < 28; index3++)
    395 					  if (memcmp(jamo_final_short_name[index3], p3, n3) == 0
    396 					      && jamo_final_short_name[index3][n3] == '\0')
    397 					    {
    398 					      return 0xAC00 + (index1 * 21 + index2) * 28 + index3;
    399 					    }
    400 					break;
    401 				      }
    402 				  break;
    403 				}
    404 			  }
    405 		      }
    406 		  }
    407 		/* Special case for CJK compatibility ideographs. Keeps the
    408 		   tables small.  */
    409 		if (wordptr == &words[2]
    410 		    && words[0] == UNICODE_CHARNAME_WORD_CJK
    411 		    && words[1] == UNICODE_CHARNAME_WORD_COMPATIBILITY
    412 		    && p1 + 14 <= ptr
    413 		    && p1 + 15 >= ptr
    414 		    && memcmp (p1, "IDEOGRAPH-", 10) == 0)
    415 		  {
    416 		    const char *p2 = p1 + 10;
    417 
    418 		    if (*p2 != '0')
    419 		      {
    420 			unsigned int c = 0;
    421 
    422 			for (;;)
    423 			  {
    424 			    if (*p2 >= '0' && *p2 <= '9')
    425 			      c += (*p2 - '0');
    426 			    else if (*p2 >= 'A' && *p2 <= 'F')
    427 			      c += (*p2 - 'A' + 10);
    428 			    else
    429 			      break;
    430 			    p2++;
    431 			    if (p2 == ptr)
    432 			      {
    433 				if ((c >= 0xF900 && c <= 0xFA2D)
    434 				    || (c >= 0xFA30 && c <= 0xFA6A)
    435 				    || (c >= 0xFA70 && c <= 0xFAD9)
    436 				    || (c >= 0x2F800 && c <= 0x2FA1D))
    437 				  return c;
    438 				else
    439 				  break;
    440 			      }
    441 			    c = c << 4;
    442 			  }
    443 		      }
    444 		  }
    445 	      }
    446 	  }
    447 	  if (false)
    448 	  filled_words:
    449 	    {
    450 	      /* Multiply by 2, to simplify later comparisons.  */
    451 	      unsigned int words_length = wordptr - words;
    452 	      {
    453 		int i = words_length - 1;
    454 		words[i] = 2 * words[i];
    455 		for (; --i >= 0; )
    456 		  words[i] = 2 * words[i] + 1;
    457 	      }
    458 	      /* Binary search in unicode_name_to_code.  */
    459 	      {
    460 		unsigned int i1 = 0;
    461 		unsigned int i2 = SIZEOF (unicode_name_to_code);
    462 		for (;;)
    463 		  {
    464 		    unsigned int i = (i1 + i2) >> 1;
    465 		    const uint16_t *w = words;
    466 		    const uint16_t *p = &unicode_names[unicode_name_to_code[i].name];
    467 		    unsigned int n = words_length;
    468 		    for (;;)
    469 		      {
    470 			if (*p < *w)
    471 			  {
    472 			    if (i1 == i)
    473 			      goto name_not_found;
    474 			    /* Note here: i1 < i < i2.  */
    475 			    i1 = i;
    476 			    break;
    477 			  }
    478 			else if (*p > *w)
    479 			  {
    480 			    if (i2 == i)
    481 			      goto name_not_found;
    482 			    /* Note here: i1 <= i < i2.  */
    483 			    i2 = i;
    484 			    break;
    485 			  }
    486 			p++; w++; n--;
    487 			if (n == 0)
    488 			  {
    489 			    unsigned int c = unicode_name_to_code[i].code;
    490 
    491 			    /* Undo the transformation to 16-bit space.  */
    492 			    static const unsigned int offset[11] =
    493 			      {
    494 				0x00000, 0x00000, 0x00000, 0x00000, 0x00000,
    495 				0x05000, 0x09000, 0x09000, 0x15000, 0x26000,
    496 				0xD6000
    497 			      };
    498 			    return c + offset[c >> 12];
    499 			  }
    500 		      }
    501 		  }
    502 	      }
    503 	    name_not_found: ;
    504 	    }
    505 	}
    506     }
    507   return UNINAME_INVALID;
    508 }
    509