gettext-tools/libuniname/uniname.c

1.1  christos /* Association between Unicode characters and their names.
1.1  christos    Copyright (C) 2000-2002, 2005-2006 Free Software Foundation, Inc.
1.1  christos
1.1  christos    This program is free software; you can redistribute it and/or modify
1.1  christos    it under the terms of the GNU General Public License as published by
1.1  christos    the Free Software Foundation; either version 2, or (at your option)
1.1  christos    any later version.
1.1  christos
1.1  christos    This program is distributed in the hope that it will be useful,
1.1  christos    but WITHOUT ANY WARRANTY; without even the implied warranty of
1.1  christos    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
1.1  christos    GNU General Public License for more details.
1.1  christos
1.1  christos    You should have received a copy of the GNU General Public License
1.1  christos    along with this program; if not, write to the Free Software Foundation,
1.1  christos    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
1.1  christos
1.1  christos #ifdef HAVE_CONFIG_H
1.1  christos # include <config.h>
1.1  christos #endif
1.1  christos
1.1  christos /* Specification.  */
1.1  christos #include "uniname.h"
1.1  christos
1.1  christos #include <assert.h>
1.1  christos #include <stdbool.h>
1.1  christos #include <stdio.h>
1.1  christos #include <string.h>
1.1  christos
/* Number of elements of the array A.  A must be a real array object, not a
   pointer.  The argument is parenthesized before subscripting so that any
   array-valued expression expands correctly.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
1.1  christos
1.1  christos
1.1  christos /* Table of Unicode character names, derived from UnicodeData.txt.  */
1.1  christos #define uint16_t unsigned short
1.1  christos #define uint32_t unsigned int
1.1  christos #include "uninames.h"
1.1  christos /* It contains:
1.1  christos   static const char unicode_name_words[34594] = ...;
1.1  christos   #define UNICODE_CHARNAME_NUM_WORDS 5906
1.1  christos   static const struct { uint16_t extra_offset; uint16_t ind_offset; } unicode_name_by_length[26] = ...;
1.1  christos   #define UNICODE_CHARNAME_WORD_HANGUL 3624
1.1  christos   #define UNICODE_CHARNAME_WORD_SYLLABLE 4654
1.1  christos   #define UNICODE_CHARNAME_WORD_CJK 401
1.1  christos   #define UNICODE_CHARNAME_WORD_COMPATIBILITY 5755
1.1  christos   static const uint16_t unicode_names[62620] = ...;
1.1  christos   static const struct { uint16_t code; uint16_t name; } unicode_name_to_code[15257] = ...;
1.1  christos   static const struct { uint16_t code; uint16_t name; } unicode_code_to_name[15257] = ...;
1.1  christos   #define UNICODE_CHARNAME_MAX_LENGTH 83
1.1  christos   #define UNICODE_CHARNAME_MAX_WORDS 13
1.1  christos */
1.1  christos
1.1  christos /* Returns the word with a given index.  */
1.1  christos static const char *
1.1  christos unicode_name_word (unsigned int index, unsigned int *lengthp)
1.1  christos {
1.1  christos   unsigned int i1;
1.1  christos   unsigned int i2;
1.1  christos   unsigned int i;
1.1  christos
1.1  christos   assert (index < UNICODE_CHARNAME_NUM_WORDS);
1.1  christos
1.1  christos   /* Binary search for i with
1.1  christos        unicode_name_by_length[i].ind_offset <= index
1.1  christos      and
1.1  christos        index < unicode_name_by_length[i+1].ind_offset
1.1  christos    */
1.1  christos
1.1  christos   i1 = 0;
1.1  christos   i2 = SIZEOF (unicode_name_by_length) - 1;
1.1  christos   while (i2 - i1 > 1)
1.1  christos     {
1.1  christos       unsigned int i = (i1 + i2) >> 1;
1.1  christos       if (unicode_name_by_length[i].ind_offset <= index)
1.1  christos 	i1 = i;
1.1  christos       else
1.1  christos 	i2 = i;
1.1  christos     }
1.1  christos   i = i1;
1.1  christos   assert (unicode_name_by_length[i].ind_offset <= index
1.1  christos 	  && index < unicode_name_by_length[i+1].ind_offset);
1.1  christos   *lengthp = i;
1.1  christos   return &unicode_name_words[unicode_name_by_length[i].extra_offset
1.1  christos 			     + (index-unicode_name_by_length[i].ind_offset)*i];
1.1  christos }
1.1  christos
1.1  christos /* Looks up the index of a word.  */
1.1  christos static int
1.1  christos unicode_name_word_lookup (const char *word, unsigned int length)
1.1  christos {
1.1  christos   if (length > 0 && length < SIZEOF (unicode_name_by_length) - 1)
1.1  christos     {
1.1  christos       /* Binary search among the words of given length.  */
1.1  christos       unsigned int extra_offset = unicode_name_by_length[length].extra_offset;
1.1  christos       unsigned int i0 = unicode_name_by_length[length].ind_offset;
1.1  christos       unsigned int i1 = i0;
1.1  christos       unsigned int i2 = unicode_name_by_length[length+1].ind_offset;
1.1  christos       while (i2 - i1 > 0)
1.1  christos 	{
1.1  christos 	  unsigned int i = (i1 + i2) >> 1;
1.1  christos 	  const char *p = &unicode_name_words[extra_offset + (i-i0)*length];
1.1  christos 	  const char *w = word;
1.1  christos 	  unsigned int n = length;
1.1  christos 	  for (;;)
1.1  christos 	    {
1.1  christos 	      if (*p < *w)
1.1  christos 		{
1.1  christos 		  if (i1 == i)
1.1  christos 		    return -1;
1.1  christos 		  /* Note here: i1 < i < i2.  */
1.1  christos 		  i1 = i;
1.1  christos 		  break;
1.1  christos 		}
1.1  christos 	      if (*p > *w)
1.1  christos 		{
1.1  christos 		  /* Note here: i1 <= i < i2.  */
1.1  christos 		  i2 = i;
1.1  christos 		  break;
1.1  christos 		}
1.1  christos 	      p++; w++; n--;
1.1  christos 	      if (n == 0)
1.1  christos 		return i;
1.1  christos 	    }
1.1  christos 	}
1.1  christos     }
1.1  christos   return -1;
1.1  christos }
1.1  christos
/* Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
   sections 3.11 and 4.4.  Each entry is the short name of one jamo; the
   names of the three jamos of a syllable are concatenated to form the last
   word of the syllable's name.  */

/* Short names of the 19 leading consonants.  Entry 11 is empty.  */
static const char jamo_initial_short_name[19][3] =
{
  /*  0 */ "G",  "GG", "N",  "D",  "DD",
  /*  5 */ "R",  "M",  "B",  "BB", "S",
  /* 10 */ "SS", "",   "J",  "JJ", "C",
  /* 15 */ "K",  "T",  "P",  "H"
};

/* Short names of the 21 vowels.  */
static const char jamo_medial_short_name[21][4] =
{
  /*  0 */ "A",   "AE",  "YA",  "YAE", "EO",
  /*  5 */ "E",   "YEO", "YE",  "O",   "WA",
  /* 10 */ "WAE", "OE",  "YO",  "U",   "WEO",
  /* 15 */ "WE",  "WI",  "YU",  "EU",  "YI",
  /* 20 */ "I"
};
/* Short names of the 28 possible trailing consonants of a Hangul syllable,
   indexed by (code - 0xAC00) % 28.  Entry 0 is empty: the syllable has no
   trailing consonant.  Entry 5 is NIEUN-CIEUC, whose short name in the
   Unicode Jamo data is "NJ" (so U+AC05 is HANGUL SYLLABLE GANJ).  */
static const char jamo_final_short_name[28][3] =
{
  "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM", "LB", "LS", "LT",
  "LP", "LH", "M", "B", "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H"
};
1.1  christos
1.1  christos /* Looks up the name of a Unicode character, in uppercase ASCII.
1.1  christos    Returns the filled buf, or NULL if the character does not have a name.  */
1.1  christos char *
1.1  christos unicode_character_name (unsigned int c, char *buf)
1.1  christos {
1.1  christos   if (c >= 0xAC00 && c <= 0xD7A3)
1.1  christos     {
1.1  christos       /* Special case for Hangul syllables. Keeps the tables small.  */
1.1  christos       char *ptr;
1.1  christos       unsigned int tmp;
1.1  christos       unsigned int index1;
1.1  christos       unsigned int index2;
1.1  christos       unsigned int index3;
1.1  christos       const char *q;
1.1  christos
1.1  christos       /* buf needs to have at least 16 + 7 bytes here.  */
1.1  christos       memcpy (buf, "HANGUL SYLLABLE ", 16);
1.1  christos       ptr = buf + 16;
1.1  christos
1.1  christos       tmp = c - 0xAC00;
1.1  christos       index3 = tmp % 28; tmp = tmp / 28;
1.1  christos       index2 = tmp % 21; tmp = tmp / 21;
1.1  christos       index1 = tmp;
1.1  christos
1.1  christos       q = jamo_initial_short_name[index1];
1.1  christos       while (*q != '\0')
1.1  christos 	*ptr++ = *q++;
1.1  christos       q = jamo_medial_short_name[index2];
1.1  christos       while (*q != '\0')
1.1  christos 	*ptr++ = *q++;
1.1  christos       q = jamo_final_short_name[index3];
1.1  christos       while (*q != '\0')
1.1  christos 	*ptr++ = *q++;
1.1  christos       *ptr = '\0';
1.1  christos       return buf;
1.1  christos     }
1.1  christos   else if ((c >= 0xF900 && c <= 0xFA2D) || (c >= 0xFA30 && c <= 0xFA6A)
1.1  christos 	   || (c >= 0xFA70 && c <= 0xFAD9) || (c >= 0x2F800 && c <= 0x2FA1D))
1.1  christos     {
1.1  christos       /* Special case for CJK compatibility ideographs. Keeps the tables
1.1  christos 	 small.  */
1.1  christos       char *ptr;
1.1  christos       int i;
1.1  christos
1.1  christos       /* buf needs to have at least 28 + 5 bytes here.  */
1.1  christos       memcpy (buf, "CJK COMPATIBILITY IDEOGRAPH-", 28);
1.1  christos       ptr = buf + 28;
1.1  christos
1.1  christos       for (i = (c < 0x10000 ? 12 : 16); i >= 0; i -= 4)
1.1  christos 	{
1.1  christos 	  unsigned int x = (c >> i) & 0xf;
1.1  christos 	  *ptr++ = (x < 10 ? '0' : 'A' - 10) + x;
1.1  christos 	}
1.1  christos       *ptr = '\0';
1.1  christos       return buf;
1.1  christos     }
1.1  christos   else
1.1  christos     {
1.1  christos       const uint16_t *words;
1.1  christos
1.1  christos       /* Transform the code so that it fits in 16 bits.  */
1.1  christos       switch (c >> 12)
1.1  christos 	{
1.1  christos 	case 0x00: case 0x01: case 0x02: case 0x03: case 0x04:
1.1  christos 	  break;
1.1  christos 	case 0x0A:
1.1  christos 	  c -= 0x05000;
1.1  christos 	  break;
1.1  christos 	case 0x0F:
1.1  christos 	  c -= 0x09000;
1.1  christos 	  break;
1.1  christos 	case 0x10:
1.1  christos 	  c -= 0x09000;
1.1  christos 	  break;
1.1  christos 	case 0x1D:
1.1  christos 	  c -= 0x15000;
1.1  christos 	  break;
1.1  christos 	case 0x2F:
1.1  christos 	  c -= 0x26000;
1.1  christos 	  break;
1.1  christos 	case 0xE0:
1.1  christos 	  c -= 0xD6000;
1.1  christos 	  break;
1.1  christos 	default:
1.1  christos 	  return NULL;
1.1  christos 	}
1.1  christos
1.1  christos       {
1.1  christos 	/* Binary search in unicode_code_to_name.  */
1.1  christos 	unsigned int i1 = 0;
1.1  christos 	unsigned int i2 = SIZEOF (unicode_code_to_name);
1.1  christos 	for (;;)
1.1  christos 	  {
1.1  christos 	    unsigned int i = (i1 + i2) >> 1;
1.1  christos 	    if (unicode_code_to_name[i].code == c)
1.1  christos 	      {
1.1  christos 		words = &unicode_names[unicode_code_to_name[i].name];
1.1  christos 		break;
1.1  christos 	      }
1.1  christos 	    else if (unicode_code_to_name[i].code < c)
1.1  christos 	      {
1.1  christos 		if (i1 == i)
1.1  christos 		  {
1.1  christos 		    words = NULL;
1.1  christos 		    break;
1.1  christos 		  }
1.1  christos 		/* Note here: i1 < i < i2.  */
1.1  christos 		i1 = i;
1.1  christos 	      }
1.1  christos 	    else if (unicode_code_to_name[i].code > c)
1.1  christos 	      {
1.1  christos 		if (i2 == i)
1.1  christos 		  {
1.1  christos 		    words = NULL;
1.1  christos 		    break;
1.1  christos 		  }
1.1  christos 		/* Note here: i1 <= i < i2.  */
1.1  christos 		i2 = i;
1.1  christos 	      }
1.1  christos 	  }
1.1  christos       }
1.1  christos       if (words != NULL)
1.1  christos 	{
1.1  christos 	  /* Found it in unicode_code_to_name. Now concatenate the words.  */
1.1  christos 	  /* buf needs to have at least UNICODE_CHARNAME_MAX_LENGTH bytes.  */
1.1  christos 	  char *ptr = buf;
1.1  christos 	  for (;;)
1.1  christos 	    {
1.1  christos 	      unsigned int wordlen;
1.1  christos 	      const char *word = unicode_name_word (*words>>1, &wordlen);
1.1  christos 	      do
1.1  christos 		*ptr++ = *word++;
1.1  christos 	      while (--wordlen > 0);
1.1  christos 	      if ((*words & 1) == 0)
1.1  christos 		break;
1.1  christos 	      *ptr++ = ' ';
1.1  christos 	      words++;
1.1  christos 	    }
1.1  christos 	  *ptr = '\0';
1.1  christos 	  return buf;
1.1  christos 	}
1.1  christos       return NULL;
1.1  christos     }
1.1  christos }
1.1  christos
1.1  christos /* Looks up the Unicode character with a given name, in upper- or lowercase
1.1  christos    ASCII.  Returns the character if found, or UNINAME_INVALID if not found.  */
1.1  christos unsigned int
1.1  christos unicode_name_character (const char *name)
1.1  christos {
1.1  christos   unsigned int len = strlen (name);
1.1  christos   if (len > 1 && len <= UNICODE_CHARNAME_MAX_LENGTH)
1.1  christos     {
1.1  christos       /* Test for "word1 word2 ..." syntax.  */
1.1  christos       char buf[UNICODE_CHARNAME_MAX_LENGTH];
1.1  christos       char *ptr = buf;
1.1  christos       for (;;)
1.1  christos 	{
1.1  christos 	  char c = *name++;
1.1  christos 	  if (!(c >= ' ' && c <= '~'))
1.1  christos 	    break;
1.1  christos 	  *ptr++ = (c >= 'a' && c <= 'z' ? c - 'a' + 'A' : c);
1.1  christos 	  if (--len == 0)
1.1  christos 	    goto filled_buf;
1.1  christos 	}
1.1  christos       if (false)
1.1  christos       filled_buf:
1.1  christos 	{
1.1  christos 	  /* Convert the constituents to uint16_t words.  */
1.1  christos 	  uint16_t words[UNICODE_CHARNAME_MAX_WORDS];
1.1  christos 	  uint16_t *wordptr = words;
1.1  christos 	  {
1.1  christos 	    const char *p1 = buf;
1.1  christos 	    for (;;)
1.1  christos 	      {
1.1  christos 		{
1.1  christos 		  int word;
1.1  christos 		  const char *p2 = p1;
1.1  christos 		  while (p2 < ptr && *p2 != ' ')
1.1  christos 		    p2++;
1.1  christos 		  word = unicode_name_word_lookup (p1, p2 - p1);
1.1  christos 		  if (word < 0)
1.1  christos 		    break;
1.1  christos 		  if (wordptr == &words[UNICODE_CHARNAME_MAX_WORDS])
1.1  christos 		    break;
1.1  christos 		  *wordptr++ = word;
1.1  christos 		  if (p2 == ptr)
1.1  christos 		    goto filled_words;
1.1  christos 		  p1 = p2 + 1;
1.1  christos 		}
1.1  christos 		/* Special case for Hangul syllables. Keeps the tables small. */
1.1  christos 		if (wordptr == &words[2]
1.1  christos 		    && words[0] == UNICODE_CHARNAME_WORD_HANGUL
1.1  christos 		    && words[1] == UNICODE_CHARNAME_WORD_SYLLABLE)
1.1  christos 		  {
1.1  christos 		    /* Split the last word [p1..ptr) into three parts:
1.1  christos 			 1) [BCDGHJKMNPRST]
1.1  christos 			 2) [AEIOUWY]
1.1  christos 			 3) [BCDGHIJKLMNPST]
1.1  christos 		     */
1.1  christos 		    const char *p2;
1.1  christos 		    const char *p3;
1.1  christos 		    const char *p4;
1.1  christos
1.1  christos 		    p2 = p1;
1.1  christos 		    while (p2 < ptr
1.1  christos 			   && (*p2 == 'B' || *p2 == 'C' || *p2 == 'D'
1.1  christos 			       || *p2 == 'G' || *p2 == 'H' || *p2 == 'J'
1.1  christos 			       || *p2 == 'K' || *p2 == 'M' || *p2 == 'N'
1.1  christos 			       || *p2 == 'P' || *p2 == 'R' || *p2 == 'S'
1.1  christos 			       || *p2 == 'T'))
1.1  christos 		      p2++;
1.1  christos 		    p3 = p2;
1.1  christos 		    while (p3 < ptr
1.1  christos 			   && (*p3 == 'A' || *p3 == 'E' || *p3 == 'I'
1.1  christos 			       || *p3 == 'O' || *p3 == 'U' || *p3 == 'W'
1.1  christos 			       || *p3 == 'Y'))
1.1  christos 		      p3++;
1.1  christos 		    p4 = p3;
1.1  christos 		    while (p4 < ptr
1.1  christos 			   && (*p4 == 'B' || *p4 == 'C' || *p4 == 'D'
1.1  christos 			       || *p4 == 'G' || *p4 == 'H' || *p4 == 'I'
1.1  christos 			       || *p4 == 'J' || *p4 == 'K' || *p4 == 'L'
1.1  christos 			       || *p4 == 'M' || *p4 == 'N' || *p4 == 'P'
1.1  christos 			       || *p4 == 'S' || *p4 == 'T'))
1.1  christos 		      p4++;
1.1  christos 		    if (p4 == ptr)
1.1  christos 		      {
1.1  christos 			unsigned int n1 = p2 - p1;
1.1  christos 			unsigned int n2 = p3 - p2;
1.1  christos 			unsigned int n3 = p4 - p3;
1.1  christos
1.1  christos 			if (n1 <= 2 && (n2 >= 1 && n2 <= 3) && n3 <= 2)
1.1  christos 			  {
1.1  christos 			    unsigned int index1;
1.1  christos
1.1  christos 			    for (index1 = 0; index1 < 19; index1++)
1.1  christos 			      if (memcmp(jamo_initial_short_name[index1], p1, n1) == 0
1.1  christos 				  && jamo_initial_short_name[index1][n1] == '\0')
1.1  christos 				{
1.1  christos 				  unsigned int index2;
1.1  christos
1.1  christos 				  for (index2 = 0; index2 < 21; index2++)
1.1  christos 				    if (memcmp(jamo_medial_short_name[index2], p2, n2) == 0
1.1  christos 					&& jamo_medial_short_name[index2][n2] == '\0')
1.1  christos 				      {
1.1  christos 					unsigned int index3;
1.1  christos
1.1  christos 					for (index3 = 0; index3 < 28; index3++)
1.1  christos 					  if (memcmp(jamo_final_short_name[index3], p3, n3) == 0
1.1  christos 					      && jamo_final_short_name[index3][n3] == '\0')
1.1  christos 					    {
1.1  christos 					      return 0xAC00 + (index1 * 21 + index2) * 28 + index3;
1.1  christos 					    }
1.1  christos 					break;
1.1  christos 				      }
1.1  christos 				  break;
1.1  christos 				}
1.1  christos 			  }
1.1  christos 		      }
1.1  christos 		  }
1.1  christos 		/* Special case for CJK compatibility ideographs. Keeps the
1.1  christos 		   tables small.  */
1.1  christos 		if (wordptr == &words[2]
1.1  christos 		    && words[0] == UNICODE_CHARNAME_WORD_CJK
1.1  christos 		    && words[1] == UNICODE_CHARNAME_WORD_COMPATIBILITY
1.1  christos 		    && p1 + 14 <= ptr
1.1  christos 		    && p1 + 15 >= ptr
1.1  christos 		    && memcmp (p1, "IDEOGRAPH-", 10) == 0)
1.1  christos 		  {
1.1  christos 		    const char *p2 = p1 + 10;
1.1  christos
1.1  christos 		    if (*p2 != '0')
1.1  christos 		      {
1.1  christos 			unsigned int c = 0;
1.1  christos
1.1  christos 			for (;;)
1.1  christos 			  {
1.1  christos 			    if (*p2 >= '0' && *p2 <= '9')
1.1  christos 			      c += (*p2 - '0');
1.1  christos 			    else if (*p2 >= 'A' && *p2 <= 'F')
1.1  christos 			      c += (*p2 - 'A' + 10);
1.1  christos 			    else
1.1  christos 			      break;
1.1  christos 			    p2++;
1.1  christos 			    if (p2 == ptr)
1.1  christos 			      {
1.1  christos 				if ((c >= 0xF900 && c <= 0xFA2D)
1.1  christos 				    || (c >= 0xFA30 && c <= 0xFA6A)
1.1  christos 				    || (c >= 0xFA70 && c <= 0xFAD9)
1.1  christos 				    || (c >= 0x2F800 && c <= 0x2FA1D))
1.1  christos 				  return c;
1.1  christos 				else
1.1  christos 				  break;
1.1  christos 			      }
1.1  christos 			    c = c << 4;
1.1  christos 			  }
1.1  christos 		      }
1.1  christos 		  }
1.1  christos 	      }
1.1  christos 	  }
1.1  christos 	  if (false)
1.1  christos 	  filled_words:
1.1  christos 	    {
1.1  christos 	      /* Multiply by 2, to simplify later comparisons.  */
1.1  christos 	      unsigned int words_length = wordptr - words;
1.1  christos 	      {
1.1  christos 		int i = words_length - 1;
1.1  christos 		words[i] = 2 * words[i];
1.1  christos 		for (; --i >= 0; )
1.1  christos 		  words[i] = 2 * words[i] + 1;
1.1  christos 	      }
1.1  christos 	      /* Binary search in unicode_name_to_code.  */
1.1  christos 	      {
1.1  christos 		unsigned int i1 = 0;
1.1  christos 		unsigned int i2 = SIZEOF (unicode_name_to_code);
1.1  christos 		for (;;)
1.1  christos 		  {
1.1  christos 		    unsigned int i = (i1 + i2) >> 1;
1.1  christos 		    const uint16_t *w = words;
1.1  christos 		    const uint16_t *p = &unicode_names[unicode_name_to_code[i].name];
1.1  christos 		    unsigned int n = words_length;
1.1  christos 		    for (;;)
1.1  christos 		      {
1.1  christos 			if (*p < *w)
1.1  christos 			  {
1.1  christos 			    if (i1 == i)
1.1  christos 			      goto name_not_found;
1.1  christos 			    /* Note here: i1 < i < i2.  */
1.1  christos 			    i1 = i;
1.1  christos 			    break;
1.1  christos 			  }
1.1  christos 			else if (*p > *w)
1.1  christos 			  {
1.1  christos 			    if (i2 == i)
1.1  christos 			      goto name_not_found;
1.1  christos 			    /* Note here: i1 <= i < i2.  */
1.1  christos 			    i2 = i;
1.1  christos 			    break;
1.1  christos 			  }
1.1  christos 			p++; w++; n--;
1.1  christos 			if (n == 0)
1.1  christos 			  {
1.1  christos 			    unsigned int c = unicode_name_to_code[i].code;
1.1  christos
1.1  christos 			    /* Undo the transformation to 16-bit space.  */
1.1  christos 			    static const unsigned int offset[11] =
1.1  christos 			      {
1.1  christos 				0x00000, 0x00000, 0x00000, 0x00000, 0x00000,
1.1  christos 				0x05000, 0x09000, 0x09000, 0x15000, 0x26000,
1.1  christos 				0xD6000
1.1  christos 			      };
1.1  christos 			    return c + offset[c >> 12];
1.1  christos 			  }
1.1  christos 		      }
1.1  christos 		  }
1.1  christos 	      }
1.1  christos 	    name_not_found: ;
1.1  christos 	    }
1.1  christos 	}
1.1  christos     }
1.1  christos   return UNINAME_INVALID;
1.1  christos }