Home | History | Annotate | Line # | Download | only in dist
      1 /*	$NetBSD: charset.c,v 1.5 2023/10/06 05:49:49 simonb Exp $	*/
      2 
      3 /*
      4  * Copyright (C) 1984-2023  Mark Nudelman
      5  *
      6  * You may distribute under the terms of either the GNU General Public
      7  * License or the Less License, as specified in the README file.
      8  *
      9  * For more information, see the README file.
     10  */
     11 
     12 
     13 /*
     14  * Functions to define the character set
     15  * and do things specific to the character set.
     16  */
     17 
     18 #include "less.h"
     19 #if HAVE_LOCALE
     20 #include <locale.h>
     21 #include <ctype.h>
     22 #include <langinfo.h>
     23 #endif
     24 
     25 #include "charset.h"
     26 #include "xbuf.h"
     27 
     28 #if MSDOS_COMPILER==WIN32C
     29 #define WIN32_LEAN_AND_MEAN
     30 #include <windows.h>
     31 #endif
     32 
/* Defined elsewhere; compared against BS_CONTROL below when classifying "fmt" characters. */
extern int bs_mode;

/*
 * Nonzero when the selected charset is "utf-8".
 * Written by icharset() through the p_flag pointer of the "utf-8"
 * entry in the charsets table.
 */
public int utf_mode = 0;
     36 
/*
 * Predefined character sets,
 * selected by the LESSCHARSET environment variable.
 *
 * name    Charset name; icharset() matches it with strcmp().
 * p_flag  If non-NULL, an int that icharset() sets nonzero when this
 *         charset is selected (only "utf-8" has one: utf_mode).
 * desc    Description string in the format parsed by ichardef().
 *
 * The table ends with an entry whose name is NULL.
 */
struct charset {
	char *name;
	int *p_flag;
	char *desc;
} charsets[] = {
		{ "ascii",              NULL,       "8bcccbcc18b95.b" },
		{ "utf-8",              &utf_mode,  "8bcccbcc18b95.b126.bb" },
		{ "iso8859",            NULL,       "8bcccbcc18b95.33b." },
		{ "latin3",             NULL,       "8bcccbcc18b95.33b5.b8.b15.b4.b12.b18.b12.b." },
		{ "arabic",             NULL,       "8bcccbcc18b95.33b.3b.7b2.13b.3b.b26.5b19.b" },
		{ "greek",              NULL,       "8bcccbcc18b95.33b4.2b4.b3.b35.b44.b" },
		{ "greek2005",          NULL,       "8bcccbcc18b95.33b14.b35.b44.b" },
		{ "hebrew",             NULL,       "8bcccbcc18b95.33b.b29.32b28.2b2.b" },
		{ "koi8-r",             NULL,       "8bcccbcc18b95.b." },
		{ "KOI8-T",             NULL,       "8bcccbcc18b95.b8.b6.b8.b.b.5b7.3b4.b4.b3.b.b.3b." },
		{ "georgianps",         NULL,       "8bcccbcc18b95.3b11.4b12.2b." },
		{ "tcvn",               NULL,       "b..b...bcccbccbbb7.8b95.b48.5b." },
		{ "TIS-620",            NULL,       "8bcccbcc18b95.b.4b.11b7.8b." },
		{ "next",               NULL,       "8bcccbcc18b95.bb125.bb" },
		{ "dos",                NULL,       "8bcccbcc12bc5b95.b." },
		{ "windows-1251",       NULL,       "8bcccbcc12bc5b95.b24.b." },
		{ "windows-1252",       NULL,       "8bcccbcc12bc5b95.b.b11.b.2b12.b." },
		{ "windows-1255",       NULL,       "8bcccbcc12bc5b95.b.b8.b.5b9.b.4b." },
		{ "ebcdic",             NULL,       "5bc6bcc7bcc41b.9b7.9b5.b..8b6.10b6.b9.7b9.8b8.17b3.3b9.7b9.8b8.6b10.b.b.b." },
		{ "IBM-1047",           NULL,       "4cbcbc3b9cbccbccbb4c6bcc5b3cbbc4bc4bccbc191.b" },
		{ NULL, NULL, NULL }
};
     68 
/*
 * Support "locale charmap"/nl_langinfo(CODESET) values, as well as others.
 *
 * name   Alternate spelling accepted by icharset() (compared with strcmp(),
 *        so matching is case-sensitive).
 * oname  Name of the entry in the charsets table that the alias maps to.
 *
 * The table ends with an entry whose name is NULL.
 */
struct cs_alias {
	char *name;
	char *oname;
} cs_aliases[] = {
	{ "UTF-8",              "utf-8" },
	{ "utf8",               "utf-8" },
	{ "UTF8",               "utf-8" },
	{ "ANSI_X3.4-1968",     "ascii" },
	{ "US-ASCII",           "ascii" },
	{ "latin1",             "iso8859" },
	{ "ISO-8859-1",         "iso8859" },
	{ "latin9",             "iso8859" },
	{ "ISO-8859-15",        "iso8859" },
	{ "latin2",             "iso8859" },
	{ "ISO-8859-2",         "iso8859" },
	{ "ISO-8859-3",         "latin3" },
	{ "latin4",             "iso8859" },
	{ "ISO-8859-4",         "iso8859" },
	{ "cyrillic",           "iso8859" },
	{ "ISO-8859-5",         "iso8859" },
	{ "ISO-8859-6",         "arabic" },
	{ "ISO-8859-7",         "greek" },
	{ "IBM9005",            "greek2005" },
	{ "ISO-8859-8",         "hebrew" },
	{ "latin5",             "iso8859" },
	{ "ISO-8859-9",         "iso8859" },
	{ "latin6",             "iso8859" },
	{ "ISO-8859-10",        "iso8859" },
	{ "latin7",             "iso8859" },
	{ "ISO-8859-13",        "iso8859" },
	{ "latin8",             "iso8859" },
	{ "ISO-8859-14",        "iso8859" },
	{ "latin10",            "iso8859" },
	{ "ISO-8859-16",        "iso8859" },
	{ "IBM437",             "dos" },
	{ "EBCDIC-US",          "ebcdic" },
	{ "IBM1047",            "IBM-1047" },
	{ "KOI8-R",             "koi8-r" },
	{ "KOI8-U",             "koi8-r" },
	{ "GEORGIAN-PS",        "georgianps" },
	{ "TCVN5712-1",         "tcvn" },
	{ "NEXTSTEP",           "next" },
	{ "windows",            "windows-1252" }, /* backward compatibility */
	{ "CP1251",             "windows-1251" },
	{ "CP1252",             "windows-1252" },
	{ "CP1255",             "windows-1255" },
	{ NULL, NULL }
};
    120 
/* Bit flags stored in each chardef[] entry (see ichardef and ilocale). */
#define IS_BINARY_CHAR  01
#define IS_CONTROL_CHAR 02

/* Classification flags for each of the 256 byte values of the active charset. */
static char chardef[256];
/* Formats chosen by setfmt() from LESSBINFMT and LESSUTFBINFMT (see init_charset). */
static char *binfmt = NULL;
static char *utfbinfmt = NULL;
/* Attribute that setfmt() overwrites when a format starts with a "*x" prefix. */
public int binattr = AT_STANDOUT|AT_COLOR_BIN;

/* Buffers that accumulate the ranges parsed from LESSUTFCHARDEF by ichardef_utf(). */
static struct xbuffer user_wide_array;
static struct xbuffer user_ubin_array;
static struct xbuffer user_compose_array;
static struct xbuffer user_prt_array;
/* Range tables pointing into the buffers above; set up at the end of ichardef_utf(). */
static struct wchar_range_table user_wide_table;
static struct wchar_range_table user_ubin_table;
static struct wchar_range_table user_compose_table;
static struct wchar_range_table user_prt_table;
    137 
    138 /*
    139  * Set a wchar_range_table to the table in an xbuffer.
    140  */
    141 static void wchar_range_table_set(struct wchar_range_table *tbl, struct xbuffer *arr)
    142 {
    143 	tbl->table = (struct wchar_range *) arr->data;
    144 	tbl->count = arr->end / sizeof(struct wchar_range);
    145 }
    146 
/*
 * Step past an optional "U" / "u" prefix, and a '+' directly after it,
 * in front of a hex codepoint.  Returns a pointer to the first
 * character after the prefix, or s itself if there is no prefix.
 */
static char * skip_uprefix(char *s)
{
	if (*s != 'U' && *s != 'u')
		return s;
	s++;
	if (*s == '+')
		s++;
	return s;
}
    156 
    157 /*
    158  * Parse a dash-separated range of hex values.
    159  */
    160 static void wchar_range_get(char **ss, struct wchar_range *range)
    161 {
    162 	char *s = skip_uprefix(*ss);
    163 	range->first = lstrtoul(s, &s, 16);
    164 	if (s[0] == '-')
    165 	{
    166 		s = skip_uprefix(&s[1]);
    167 		range->last = lstrtoul(s, &s, 16);
    168 	} else
    169 	{
    170 		range->last = range->first;
    171 	}
    172 	*ss = s;
    173 }
    174 
    175 /*
    176  * Parse the LESSUTFCHARDEF variable.
    177  */
    178 static void ichardef_utf(char *s)
    179 {
    180 	xbuf_init(&user_wide_array);
    181 	xbuf_init(&user_ubin_array);
    182 	xbuf_init(&user_compose_array);
    183 	xbuf_init(&user_prt_array);
    184 
    185 	if (s != NULL)
    186 	{
    187 		while (s[0] != '\0')
    188 		{
    189 			struct wchar_range range;
    190 			wchar_range_get(&s, &range);
    191 			if (range.last == 0)
    192 			{
    193 				error("invalid hex number(s) in LESSUTFCHARDEF", NULL_PARG);
    194 				quit(QUIT_ERROR);
    195 			}
    196 			if (*s++ != ':')
    197 			{
    198 				error("missing colon in LESSUTFCHARDEF", NULL_PARG);
    199 				quit(QUIT_ERROR);
    200 			}
    201 			switch (*s++)
    202 			{
    203 			case 'b':
    204 				xbuf_add_data(&user_ubin_array, (unsigned char *) &range, sizeof(range));
    205 				break;
    206 			case 'c':
    207 				xbuf_add_data(&user_compose_array, (unsigned char *) &range, sizeof(range));
    208 				break;
    209 			case 'w':
    210 				xbuf_add_data(&user_wide_array, (unsigned char *) &range, sizeof(range));
    211 				xbuf_add_data(&user_prt_array, (unsigned char *) &range, sizeof(range));
    212 				break;
    213 			case 'p': case '.':
    214 				xbuf_add_data(&user_prt_array, (unsigned char *) &range, sizeof(range));
    215 				break;
    216 			case '\0':
    217 				s--;
    218 				break;
    219 			default:
    220 				/* Ignore unknown character attribute. */
    221 				break;
    222 			}
    223 			if (s[0] == ',') ++s;
    224 		}
    225 	}
    226 	wchar_range_table_set(&user_wide_table, &user_wide_array);
    227 	wchar_range_table_set(&user_ubin_table, &user_ubin_array);
    228 	wchar_range_table_set(&user_compose_table, &user_compose_array);
    229 	wchar_range_table_set(&user_prt_table, &user_prt_array);
    230 }
    231 
    232 /*
    233  * Define a charset, given a description string.
    234  * The string consists of 256 letters,
    235  * one for each character in the charset.
    236  * If the string is shorter than 256 letters, missing letters
    237  * are taken to be identical to the last one.
    238  * A decimal number followed by a letter is taken to be a
    239  * repetition of the letter.
    240  *
    241  * Each letter is one of:
    242  *      . normal character
    243  *      b binary character
    244  *      c control character
    245  */
    246 static void ichardef(char *s)
    247 {
    248 	char *cp;
    249 	int n;
    250 	char v;
    251 
    252 	n = 0;
    253 	v = 0;
    254 	cp = chardef;
    255 	while (*s != '\0')
    256 	{
    257 		switch (*s++)
    258 		{
    259 		case '.':
    260 			v = 0;
    261 			break;
    262 		case 'c':
    263 			v = IS_CONTROL_CHAR;
    264 			break;
    265 		case 'b':
    266 			v = IS_BINARY_CHAR|IS_CONTROL_CHAR;
    267 			break;
    268 
    269 		case '0': case '1': case '2': case '3': case '4':
    270 		case '5': case '6': case '7': case '8': case '9':
    271 			if (ckd_mul(&n, n, 10) || ckd_add(&n, n, s[-1] - '0'))
    272 				goto invalid_chardef;
    273 			continue;
    274 
    275 		default:
    276 		invalid_chardef:
    277 			error("invalid chardef", NULL_PARG);
    278 			quit(QUIT_ERROR);
    279 			/*NOTREACHED*/
    280 		}
    281 
    282 		do
    283 		{
    284 			if (cp >= chardef + sizeof(chardef))
    285 			{
    286 				error("chardef longer than 256", NULL_PARG);
    287 				quit(QUIT_ERROR);
    288 				/*NOTREACHED*/
    289 			}
    290 			*cp++ = v;
    291 		} while (--n > 0);
    292 		n = 0;
    293 	}
    294 
    295 	while (cp < chardef + sizeof(chardef))
    296 		*cp++ = v;
    297 }
    298 
    299 /*
    300  * Define a charset, given a charset name.
    301  * The valid charset names are listed in the "charsets" array.
    302  */
    303 static int icharset(char *name, int no_error)
    304 {
    305 	struct charset *p;
    306 	struct cs_alias *a;
    307 
    308 	if (name == NULL || *name == '\0')
    309 		return (0);
    310 
    311 	/* First see if the name is an alias. */
    312 	for (a = cs_aliases;  a->name != NULL;  a++)
    313 	{
    314 		if (strcmp(name, a->name) == 0)
    315 		{
    316 			name = a->oname;
    317 			break;
    318 		}
    319 	}
    320 
    321 	for (p = charsets;  p->name != NULL;  p++)
    322 	{
    323 		if (strcmp(name, p->name) == 0)
    324 		{
    325 			ichardef(p->desc);
    326 			if (p->p_flag != NULL)
    327 			{
    328 #if MSDOS_COMPILER==WIN32C
    329 				*(p->p_flag) = 1 + (GetConsoleOutputCP() != CP_UTF8);
    330 #else
    331 				*(p->p_flag) = 1;
    332 #endif
    333 			}
    334 			return (1);
    335 		}
    336 	}
    337 
    338 	if (!no_error) {
    339 		error("invalid charset name", NULL_PARG);
    340 		quit(QUIT_ERROR);
    341 	}
    342 	return (0);
    343 }
    344 
#if HAVE_LOCALE
/*
 * Fill chardef[] using the <ctype.h> classification functions:
 * codes for which isprint() is true are normal, codes for which
 * iscntrl() is true are control characters, and everything else
 * is binary.
 */
static void ilocale(void)
{
	int c = 0;

	while (c < (int) sizeof(chardef))
	{
		char attr;

		if (isprint(c))
			attr = 0;
		else if (iscntrl(c))
			attr = IS_CONTROL_CHAR;
		else
			attr = IS_BINARY_CHAR|IS_CONTROL_CHAR;
		chardef[c++] = attr;
	}
}
#endif
    364 
    365 /*
    366  * Define the printing format for control (or binary utf) chars.
    367  */
    368 public void setfmt(char *s, char **fmtvarptr, int *attrptr, char *default_fmt, int for_printf)
    369 {
    370 	if (s && utf_mode)
    371 	{
    372 		/* It would be too hard to account for width otherwise.  */
    373 		char constant *t = s;
    374 		while (*t)
    375 		{
    376 			if (*t < ' ' || *t > '~')
    377 			{
    378 				s = default_fmt;
    379 				goto attr;
    380 			}
    381 			t++;
    382 		}
    383 	}
    384 
    385 	if (s == NULL || *s == '\0')
    386 		s = default_fmt;
    387 	else if (for_printf &&
    388 	    ((*s == '*' && (s[1] == '\0' || s[2] == '\0' || strchr(s + 2, 'n'))) ||
    389 	     (*s != '*' && strchr(s, 'n'))))
    390 		/* %n is evil */
    391 		s = default_fmt;
    392 
    393 	/*
    394 	 * Select the attributes if it starts with "*".
    395 	 */
    396  attr:
    397 	if (*s == '*' && s[1] != '\0')
    398 	{
    399 		switch (s[1])
    400 		{
    401 		case 'd':  *attrptr = AT_BOLD;      break;
    402 		case 'k':  *attrptr = AT_BLINK;     break;
    403 		case 's':  *attrptr = AT_STANDOUT;  break;
    404 		case 'u':  *attrptr = AT_UNDERLINE; break;
    405 		default:   *attrptr = AT_NORMAL;    break;
    406 		}
    407 		s += 2;
    408 	}
    409 	*fmtvarptr = s;
    410 }
    411 
    412 /*
    413  *
    414  */
    415 static void set_charset(void)
    416 {
    417 	char *s;
    418 
    419 #if MSDOS_COMPILER==WIN32C
    420 	/*
    421 	 * If the Windows console is using UTF-8, we'll use it too.
    422 	 */
    423 	if (GetConsoleOutputCP() == CP_UTF8)
    424 		if (icharset("utf-8", 1))
    425 			return;
    426 #endif
    427 
    428 	ichardef_utf(lgetenv("LESSUTFCHARDEF"));
    429 
    430 	/*
    431 	 * See if environment variable LESSCHARSET is defined.
    432 	 */
    433 	s = lgetenv("LESSCHARSET");
    434 	if (icharset(s, 0))
    435 		return;
    436 
    437 	/*
    438 	 * LESSCHARSET is not defined: try LESSCHARDEF.
    439 	 */
    440 	s = lgetenv("LESSCHARDEF");
    441 	if (!isnullenv(s))
    442 	{
    443 		ichardef(s);
    444 		return;
    445 	}
    446 
    447 #if HAVE_LOCALE
    448 #ifdef CODESET
    449 	/*
    450 	 * Try using the codeset name as the charset name.
    451 	 */
    452 	s = nl_langinfo(CODESET);
    453 	if (icharset(s, 1))
    454 		return;
    455 #endif
    456 #endif
    457 
    458 #if HAVE_STRSTR
    459 	/*
    460 	 * Check whether LC_ALL, LC_CTYPE or LANG look like UTF-8 is used.
    461 	 */
    462 	if ((s = lgetenv("LC_ALL")) != NULL ||
    463 	    (s = lgetenv("LC_CTYPE")) != NULL ||
    464 	    (s = lgetenv("LANG")) != NULL)
    465 	{
    466 		if (   strstr(s, "UTF-8") != NULL || strstr(s, "utf-8") != NULL
    467 		    || strstr(s, "UTF8")  != NULL || strstr(s, "utf8")  != NULL)
    468 			if (icharset("utf-8", 1))
    469 				return;
    470 	}
    471 #endif
    472 
    473 #if HAVE_LOCALE
    474 	/*
    475 	 * Get character definitions from locale functions,
    476 	 * rather than from predefined charset entry.
    477 	 */
    478 	ilocale();
    479 #else
    480 #if MSDOS_COMPILER
    481 	/*
    482 	 * Default to "dos".
    483 	 */
    484 	(void) icharset("dos", 1);
    485 #else
    486 	/*
    487 	 * Default to "latin1".
    488 	 */
    489 	(void) icharset("latin1", 1);
    490 #endif
    491 #endif
    492 }
    493 
    494 /*
    495  * Initialize charset data structures.
    496  */
    497 public void init_charset(void)
    498 {
    499 	char *s;
    500 
    501 #if HAVE_LOCALE
    502 	setlocale(LC_ALL, "");
    503 #endif
    504 
    505 	set_charset();
    506 
    507 	s = lgetenv("LESSBINFMT");
    508 	setfmt(s, &binfmt, &binattr, "*s<%02X>", TRUE);
    509 
    510 	s = lgetenv("LESSUTFBINFMT");
    511 	setfmt(s, &utfbinfmt, &binattr, "<U+%04lX>", TRUE);
    512 }
    513 
    514 /*
    515  * Is a given character a "binary" character?
    516  */
    517 public int binary_char(LWCHAR c)
    518 {
    519 	if (utf_mode)
    520 		return (is_ubin_char(c));
    521 	c &= 0377;
    522 	return (chardef[c] & IS_BINARY_CHAR);
    523 }
    524 
    525 /*
    526  * Is a given character a "control" character?
    527  */
    528 public int control_char(LWCHAR c)
    529 {
    530 	c &= 0377;
    531 	return (chardef[c] & IS_CONTROL_CHAR);
    532 }
    533 
    534 /*
    535  * Return the printable form of a character.
    536  * For example, in the "ascii" charset '\3' is printed as "^C".
    537  */
    538 public char * prchar(LWCHAR c)
    539 {
    540 	/* {{ This buffer can be overrun if LESSBINFMT is a long string. }} */
    541 	static char buf[MAX_PRCHAR_LEN+1];
    542 
    543 	c &= 0377;
    544 	if ((c < 128 || !utf_mode) && !control_char(c))
    545 		SNPRINTF1(buf, sizeof(buf), "%c", (int) c);
    546 	else if (c == ESC)
    547 		strcpy(buf, "ESC");
    548 #if IS_EBCDIC_HOST
    549 	else if (!binary_char(c) && c < 64)
    550 		SNPRINTF1(buf, sizeof(buf), "^%c",
    551 		/*
    552 		 * This array roughly inverts CONTROL() #defined in less.h,
    553 		 * and should be kept in sync with CONTROL() and IBM-1047.
    554 		 */
    555 		"@ABC.I.?...KLMNO"
    556 		"PQRS.JH.XY.."
    557 		"\\]^_"
    558 		"......W[.....EFG"
    559 		"..V....D....TU.Z"[c]);
    560 #else
    561 	else if (c < 128 && !control_char(c ^ 0100))
    562 		SNPRINTF1(buf, sizeof(buf), "^%c", (int) (c ^ 0100));
    563 #endif
    564 	else
    565 		SNPRINTF1(buf, sizeof(buf), binfmt, c);
    566 	return (buf);
    567 }
    568 
    569 /*
    570  * Return the printable form of a UTF-8 character.
    571  */
    572 public char * prutfchar(LWCHAR ch)
    573 {
    574 	static char buf[MAX_PRCHAR_LEN+1];
    575 
    576 	if (ch == ESC)
    577 		strcpy(buf, "ESC");
    578 	else if (ch < 128 && control_char(ch))
    579 	{
    580 		if (!control_char(ch ^ 0100))
    581 			SNPRINTF1(buf, sizeof(buf), "^%c", ((char) ch) ^ 0100);
    582 		else
    583 			SNPRINTF1(buf, sizeof(buf), binfmt, (char) ch);
    584 	} else if (is_ubin_char(ch))
    585 	{
    586 		SNPRINTF1(buf, sizeof(buf), utfbinfmt, ch);
    587 	} else
    588 	{
    589 		char *p = buf;
    590 		if (ch >= 0x80000000)
    591 			ch = 0xFFFD; /* REPLACEMENT CHARACTER */
    592 		put_wchar(&p, ch);
    593 		*p = '\0';
    594 	}
    595 	return (buf);
    596 }
    597 
    598 /*
    599  * Get the length of a UTF-8 character in bytes.
    600  */
    601 public int utf_len(int ch)
    602 {
    603 	if ((ch & 0x80) == 0)
    604 		return 1;
    605 	if ((ch & 0xE0) == 0xC0)
    606 		return 2;
    607 	if ((ch & 0xF0) == 0xE0)
    608 		return 3;
    609 	if ((ch & 0xF8) == 0xF0)
    610 		return 4;
    611 	if ((ch & 0xFC) == 0xF8)
    612 		return 5;
    613 	if ((ch & 0xFE) == 0xFC)
    614 		return 6;
    615 	/* Invalid UTF-8 encoding. */
    616 	return 1;
    617 }
    618 
    619 /*
    620  * Does the parameter point to the lead byte of a well-formed UTF-8 character?
    621  */
    622 public int is_utf8_well_formed(char *ss, int slen)
    623 {
    624 	int i;
    625 	int len;
    626 	unsigned char *s = (unsigned char *) ss;
    627 
    628 	if (IS_UTF8_INVALID(s[0]))
    629 		return (0);
    630 
    631 	len = utf_len(s[0]);
    632 	if (len > slen)
    633 		return (0);
    634 	if (len == 1)
    635 		return (1);
    636 	if (len == 2)
    637 	{
    638 		if (s[0] < 0xC2)
    639 		    return (0);
    640 	} else
    641 	{
    642 		unsigned char mask;
    643 		mask = (~((1 << (8-len)) - 1)) & 0xFF;
    644 		if (s[0] == mask && (s[1] & mask) == 0x80)
    645 			return (0);
    646 	}
    647 
    648 	for (i = 1;  i < len;  i++)
    649 		if (!IS_UTF8_TRAIL(s[i]))
    650 			return (0);
    651 	return (1);
    652 }
    653 
    654 /*
    655  * Skip bytes until a UTF-8 lead byte (11xxxxxx) or ASCII byte (0xxxxxxx) is found.
    656  */
    657 public void utf_skip_to_lead(char **pp, char *limit)
    658 {
    659 	do {
    660 		++(*pp);
    661 	} while (*pp < limit && !IS_UTF8_LEAD((*pp)[0] & 0377) && !IS_ASCII_OCTET((*pp)[0]));
    662 }
    663 
    664 
    665 /*
    666  * Get the value of a UTF-8 character.
    667  */
    668 public LWCHAR get_wchar(constant char *p)
    669 {
    670 	switch (utf_len(p[0]))
    671 	{
    672 	case 1:
    673 	default:
    674 		/* 0xxxxxxx */
    675 		return (LWCHAR)
    676 			(p[0] & 0xFF);
    677 	case 2:
    678 		/* 110xxxxx 10xxxxxx */
    679 		return (LWCHAR) (
    680 			((p[0] & 0x1F) << 6) |
    681 			(p[1] & 0x3F));
    682 	case 3:
    683 		/* 1110xxxx 10xxxxxx 10xxxxxx */
    684 		return (LWCHAR) (
    685 			((p[0] & 0x0F) << 12) |
    686 			((p[1] & 0x3F) << 6) |
    687 			(p[2] & 0x3F));
    688 	case 4:
    689 		/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
    690 		return (LWCHAR) (
    691 			((p[0] & 0x07) << 18) |
    692 			((p[1] & 0x3F) << 12) |
    693 			((p[2] & 0x3F) << 6) |
    694 			(p[3] & 0x3F));
    695 	case 5:
    696 		/* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
    697 		return (LWCHAR) (
    698 			((p[0] & 0x03) << 24) |
    699 			((p[1] & 0x3F) << 18) |
    700 			((p[2] & 0x3F) << 12) |
    701 			((p[3] & 0x3F) << 6) |
    702 			(p[4] & 0x3F));
    703 	case 6:
    704 		/* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
    705 		return (LWCHAR) (
    706 			((p[0] & 0x01) << 30) |
    707 			((p[1] & 0x3F) << 24) |
    708 			((p[2] & 0x3F) << 18) |
    709 			((p[3] & 0x3F) << 12) |
    710 			((p[4] & 0x3F) << 6) |
    711 			(p[5] & 0x3F));
    712 	}
    713 }
    714 
    715 /*
    716  * Store a character into a UTF-8 string.
    717  */
    718 public void put_wchar(char **pp, LWCHAR ch)
    719 {
    720 	if (!utf_mode || ch < 0x80)
    721 	{
    722 		/* 0xxxxxxx */
    723 		*(*pp)++ = (char) ch;
    724 	} else if (ch < 0x800)
    725 	{
    726 		/* 110xxxxx 10xxxxxx */
    727 		*(*pp)++ = (char) (0xC0 | ((ch >> 6) & 0x1F));
    728 		*(*pp)++ = (char) (0x80 | (ch & 0x3F));
    729 	} else if (ch < 0x10000)
    730 	{
    731 		/* 1110xxxx 10xxxxxx 10xxxxxx */
    732 		*(*pp)++ = (char) (0xE0 | ((ch >> 12) & 0x0F));
    733 		*(*pp)++ = (char) (0x80 | ((ch >> 6) & 0x3F));
    734 		*(*pp)++ = (char) (0x80 | (ch & 0x3F));
    735 	} else if (ch < 0x200000)
    736 	{
    737 		/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
    738 		*(*pp)++ = (char) (0xF0 | ((ch >> 18) & 0x07));
    739 		*(*pp)++ = (char) (0x80 | ((ch >> 12) & 0x3F));
    740 		*(*pp)++ = (char) (0x80 | ((ch >> 6) & 0x3F));
    741 		*(*pp)++ = (char) (0x80 | (ch & 0x3F));
    742 	} else if (ch < 0x4000000)
    743 	{
    744 		/* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
    745 		*(*pp)++ = (char) (0xF0 | ((ch >> 24) & 0x03));
    746 		*(*pp)++ = (char) (0x80 | ((ch >> 18) & 0x3F));
    747 		*(*pp)++ = (char) (0x80 | ((ch >> 12) & 0x3F));
    748 		*(*pp)++ = (char) (0x80 | ((ch >> 6) & 0x3F));
    749 		*(*pp)++ = (char) (0x80 | (ch & 0x3F));
    750 	} else
    751 	{
    752 		/* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
    753 		*(*pp)++ = (char) (0xF0 | ((ch >> 30) & 0x01));
    754 		*(*pp)++ = (char) (0x80 | ((ch >> 24) & 0x3F));
    755 		*(*pp)++ = (char) (0x80 | ((ch >> 18) & 0x3F));
    756 		*(*pp)++ = (char) (0x80 | ((ch >> 12) & 0x3F));
    757 		*(*pp)++ = (char) (0x80 | ((ch >> 6) & 0x3F));
    758 		*(*pp)++ = (char) (0x80 | (ch & 0x3F));
    759 	}
    760 }
    761 
    762 /*
    763  * Step forward or backward one character in a string.
    764  */
    765 public LWCHAR step_char(char **pp, signed int dir, constant char *limit)
    766 {
    767 	LWCHAR ch;
    768 	int len;
    769 	char *p = *pp;
    770 
    771 	if (!utf_mode)
    772 	{
    773 		/* It's easy if chars are one byte. */
    774 		if (dir > 0)
    775 			ch = (LWCHAR) (unsigned char) ((p < limit) ? *p++ : 0);
    776 		else
    777 			ch = (LWCHAR) (unsigned char) ((p > limit) ? *--p : 0);
    778 	} else if (dir > 0)
    779 	{
    780 		len = utf_len(*p);
    781 		if (p + len > limit)
    782 		{
    783 			ch = 0;
    784 			p = (char *) limit;
    785 		} else
    786 		{
    787 			ch = get_wchar(p);
    788 			p += len;
    789 		}
    790 	} else
    791 	{
    792 		while (p > limit && IS_UTF8_TRAIL(p[-1]))
    793 			p--;
    794 		if (p > limit)
    795 			ch = get_wchar(--p);
    796 		else
    797 			ch = 0;
    798 	}
    799 	*pp = p;
    800 	return ch;
    801 }
    802 
/*
 * Unicode characters data
 * Actual data is in the generated *.uni files.
 */

/*
 * DECLARE_RANGE_TABLE_START(name) opens the initializer of a static
 * array name_array; DECLARE_RANGE_TABLE_END(name) closes it and defines
 * name_table, which holds the array and its element count.  The array
 * entries come from the file included between the two macros.
 */
#define DECLARE_RANGE_TABLE_START(name) \
	static struct wchar_range name##_array[] = {
#define DECLARE_RANGE_TABLE_END(name) \
	}; struct wchar_range_table name##_table = { name##_array, sizeof(name##_array)/sizeof(*name##_array) };

/* compose_table: consulted by is_composing_char(). */
DECLARE_RANGE_TABLE_START(compose)
#include "compose.uni"
DECLARE_RANGE_TABLE_END(compose)

/* ubin_table: consulted by is_ubin_char(). */
DECLARE_RANGE_TABLE_START(ubin)
#include "ubin.uni"
DECLARE_RANGE_TABLE_END(ubin)

/* wide_table: consulted by is_wide_char(). */
DECLARE_RANGE_TABLE_START(wide)
#include "wide.uni"
DECLARE_RANGE_TABLE_END(wide)

/*
 * fmt_table: treated as binary when bs_mode is BS_CONTROL,
 * and as composing otherwise.
 */
DECLARE_RANGE_TABLE_START(fmt)
#include "fmt.uni"
DECLARE_RANGE_TABLE_END(fmt)

/*
 * comb_table is special pairs, not ranges.
 * In each entry, "first" is the preceding character and "last" is the
 * character that combines with it (see is_combining_char).
 */
static struct wchar_range comb_table[] = {
	{0x0644,0x0622}, {0x0644,0x0623}, {0x0644,0x0625}, {0x0644,0x0627},
};
    833 
    834 
    835 static int is_in_table(LWCHAR ch, struct wchar_range_table *table)
    836 {
    837 	int hi;
    838 	int lo;
    839 
    840 	/* Binary search in the table. */
    841 	if (table->table == NULL || table->count == 0 || ch < table->table[0].first)
    842 		return 0;
    843 	lo = 0;
    844 	hi = table->count - 1;
    845 	while (lo <= hi)
    846 	{
    847 		int mid = (lo + hi) / 2;
    848 		if (ch > table->table[mid].last)
    849 			lo = mid + 1;
    850 		else if (ch < table->table[mid].first)
    851 			hi = mid - 1;
    852 		else
    853 			return 1;
    854 	}
    855 	return 0;
    856 }
    857 
    858 /*
    859  * Is a character a UTF-8 composing character?
    860  * If a composing character follows any char, the two combine into one glyph.
    861  */
    862 public int is_composing_char(LWCHAR ch)
    863 {
    864 	if (is_in_table(ch, &user_prt_table)) return 0;
    865 	return is_in_table(ch, &user_compose_table) ||
    866 	       is_in_table(ch, &compose_table) ||
    867 	       (bs_mode != BS_CONTROL && is_in_table(ch, &fmt_table));
    868 }
    869 
    870 /*
    871  * Should this UTF-8 character be treated as binary?
    872  */
    873 public int is_ubin_char(LWCHAR ch)
    874 {
    875 	if (is_in_table(ch, &user_prt_table)) return 0;
    876 	return is_in_table(ch, &user_ubin_table) ||
    877 	       is_in_table(ch, &ubin_table) ||
    878 	       (bs_mode == BS_CONTROL && is_in_table(ch, &fmt_table));
    879 }
    880 
    881 /*
    882  * Is this a double width UTF-8 character?
    883  */
    884 public int is_wide_char(LWCHAR ch)
    885 {
    886 	return is_in_table(ch, &user_wide_table) ||
    887 	       is_in_table(ch, &wide_table);
    888 }
    889 
    890 /*
    891  * Is a character a UTF-8 combining character?
    892  * A combining char acts like an ordinary char, but if it follows
    893  * a specific char (not any char), the two combine into one glyph.
    894  */
    895 public int is_combining_char(LWCHAR ch1, LWCHAR ch2)
    896 {
    897 	/* The table is small; use linear search. */
    898 	int i;
    899 	for (i = 0;  i < sizeof(comb_table)/sizeof(*comb_table);  i++)
    900 	{
    901 		if (ch1 == comb_table[i].first &&
    902 		    ch2 == comb_table[i].last)
    903 			return 1;
    904 	}
    905 	return 0;
    906 }
    907 
    908