Home | History | Annotate | Line # | Download | only in dist
      1 /* $OpenBSD$ */
      2 
      3 /*
      4  * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott (at) gmail.com>
      5  *
      6  * Permission to use, copy, modify, and distribute this software for any
      7  * purpose with or without fee is hereby granted, provided that the above
      8  * copyright notice and this permission notice appear in all copies.
      9  *
     10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
     11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
     12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
     13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
     14  * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
     15  * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
     16  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
     17  */
     18 
     19 #include <sys/types.h>
     20 
     21 #include <ctype.h>
     22 #include <errno.h>
     23 #include <stdlib.h>
     24 #include <string.h>
     25 #include <wchar.h>
     26 
     27 #include "compat.h"
     28 #include "tmux.h"
     29 
     30 struct utf8_width_item {
     31 	wchar_t				wc;
     32 	u_int				width;
     33 	int				allocated;
     34 
     35 	RB_ENTRY(utf8_width_item)	entry;
     36 };
     37 
     38 static int
     39 utf8_width_cache_cmp(struct utf8_width_item *uw1, struct utf8_width_item *uw2)
     40 {
     41 	if (uw1->wc < uw2->wc)
     42 		return (-1);
     43 	if (uw1->wc > uw2->wc)
     44 		return (1);
     45 	return (0);
     46 }
     47 RB_HEAD(utf8_width_cache, utf8_width_item);
     48 RB_GENERATE_STATIC(utf8_width_cache, utf8_width_item, entry,
     49     utf8_width_cache_cmp);
     50 static struct utf8_width_cache utf8_width_cache =
     51     RB_INITIALIZER(utf8_width_cache);
     52 
     53 static struct utf8_width_item utf8_default_width_cache[] = {
     54 	{ .wc = 0x0261D, .width = 2 },
     55 	{ .wc = 0x026F9, .width = 2 },
     56 	{ .wc = 0x0270A, .width = 2 },
     57 	{ .wc = 0x0270B, .width = 2 },
     58 	{ .wc = 0x0270C, .width = 2 },
     59 	{ .wc = 0x0270D, .width = 2 },
     60 	{ .wc = 0x1F1E6, .width = 1 },
     61 	{ .wc = 0x1F1E7, .width = 1 },
     62 	{ .wc = 0x1F1E8, .width = 1 },
     63 	{ .wc = 0x1F1E9, .width = 1 },
     64 	{ .wc = 0x1F1EA, .width = 1 },
     65 	{ .wc = 0x1F1EB, .width = 1 },
     66 	{ .wc = 0x1F1EC, .width = 1 },
     67 	{ .wc = 0x1F1ED, .width = 1 },
     68 	{ .wc = 0x1F1EE, .width = 1 },
     69 	{ .wc = 0x1F1EF, .width = 1 },
     70 	{ .wc = 0x1F1F0, .width = 1 },
     71 	{ .wc = 0x1F1F1, .width = 1 },
     72 	{ .wc = 0x1F1F2, .width = 1 },
     73 	{ .wc = 0x1F1F3, .width = 1 },
     74 	{ .wc = 0x1F1F4, .width = 1 },
     75 	{ .wc = 0x1F1F5, .width = 1 },
     76 	{ .wc = 0x1F1F6, .width = 1 },
     77 	{ .wc = 0x1F1F7, .width = 1 },
     78 	{ .wc = 0x1F1F8, .width = 1 },
     79 	{ .wc = 0x1F1F9, .width = 1 },
     80 	{ .wc = 0x1F1FA, .width = 1 },
     81 	{ .wc = 0x1F1FB, .width = 1 },
     82 	{ .wc = 0x1F1FC, .width = 1 },
     83 	{ .wc = 0x1F1FD, .width = 1 },
     84 	{ .wc = 0x1F1FE, .width = 1 },
     85 	{ .wc = 0x1F1FF, .width = 1 },
     86 	{ .wc = 0x1F385, .width = 2 },
     87 	{ .wc = 0x1F3C2, .width = 2 },
     88 	{ .wc = 0x1F3C3, .width = 2 },
     89 	{ .wc = 0x1F3C4, .width = 2 },
     90 	{ .wc = 0x1F3C7, .width = 2 },
     91 	{ .wc = 0x1F3CA, .width = 2 },
     92 	{ .wc = 0x1F3CB, .width = 2 },
     93 	{ .wc = 0x1F3CC, .width = 2 },
     94 	{ .wc = 0x1F3FB, .width = 2 },
     95 	{ .wc = 0x1F3FC, .width = 2 },
     96 	{ .wc = 0x1F3FD, .width = 2 },
     97 	{ .wc = 0x1F3FE, .width = 2 },
     98 	{ .wc = 0x1F3FF, .width = 2 },
     99 	{ .wc = 0x1F442, .width = 2 },
    100 	{ .wc = 0x1F443, .width = 2 },
    101 	{ .wc = 0x1F446, .width = 2 },
    102 	{ .wc = 0x1F447, .width = 2 },
    103 	{ .wc = 0x1F448, .width = 2 },
    104 	{ .wc = 0x1F449, .width = 2 },
    105 	{ .wc = 0x1F44A, .width = 2 },
    106 	{ .wc = 0x1F44B, .width = 2 },
    107 	{ .wc = 0x1F44C, .width = 2 },
    108 	{ .wc = 0x1F44D, .width = 2 },
    109 	{ .wc = 0x1F44E, .width = 2 },
    110 	{ .wc = 0x1F44F, .width = 2 },
    111 	{ .wc = 0x1F450, .width = 2 },
    112 	{ .wc = 0x1F466, .width = 2 },
    113 	{ .wc = 0x1F467, .width = 2 },
    114 	{ .wc = 0x1F468, .width = 2 },
    115 	{ .wc = 0x1F469, .width = 2 },
    116 	{ .wc = 0x1F46B, .width = 2 },
    117 	{ .wc = 0x1F46C, .width = 2 },
    118 	{ .wc = 0x1F46D, .width = 2 },
    119 	{ .wc = 0x1F46E, .width = 2 },
    120 	{ .wc = 0x1F470, .width = 2 },
    121 	{ .wc = 0x1F471, .width = 2 },
    122 	{ .wc = 0x1F472, .width = 2 },
    123 	{ .wc = 0x1F473, .width = 2 },
    124 	{ .wc = 0x1F474, .width = 2 },
    125 	{ .wc = 0x1F475, .width = 2 },
    126 	{ .wc = 0x1F476, .width = 2 },
    127 	{ .wc = 0x1F477, .width = 2 },
    128 	{ .wc = 0x1F478, .width = 2 },
    129 	{ .wc = 0x1F47C, .width = 2 },
    130 	{ .wc = 0x1F481, .width = 2 },
    131 	{ .wc = 0x1F482, .width = 2 },
    132 	{ .wc = 0x1F483, .width = 2 },
    133 	{ .wc = 0x1F485, .width = 2 },
    134 	{ .wc = 0x1F486, .width = 2 },
    135 	{ .wc = 0x1F487, .width = 2 },
    136 	{ .wc = 0x1F48F, .width = 2 },
    137 	{ .wc = 0x1F491, .width = 2 },
    138 	{ .wc = 0x1F4AA, .width = 2 },
    139 	{ .wc = 0x1F574, .width = 2 },
    140 	{ .wc = 0x1F575, .width = 2 },
    141 	{ .wc = 0x1F57A, .width = 2 },
    142 	{ .wc = 0x1F590, .width = 2 },
    143 	{ .wc = 0x1F595, .width = 2 },
    144 	{ .wc = 0x1F596, .width = 2 },
    145 	{ .wc = 0x1F645, .width = 2 },
    146 	{ .wc = 0x1F646, .width = 2 },
    147 	{ .wc = 0x1F647, .width = 2 },
    148 	{ .wc = 0x1F64B, .width = 2 },
    149 	{ .wc = 0x1F64C, .width = 2 },
    150 	{ .wc = 0x1F64D, .width = 2 },
    151 	{ .wc = 0x1F64E, .width = 2 },
    152 	{ .wc = 0x1F64F, .width = 2 },
    153 	{ .wc = 0x1F6A3, .width = 2 },
    154 	{ .wc = 0x1F6B4, .width = 2 },
    155 	{ .wc = 0x1F6B5, .width = 2 },
    156 	{ .wc = 0x1F6B6, .width = 2 },
    157 	{ .wc = 0x1F6C0, .width = 2 },
    158 	{ .wc = 0x1F6CC, .width = 2 },
    159 	{ .wc = 0x1F90C, .width = 2 },
    160 	{ .wc = 0x1F90F, .width = 2 },
    161 	{ .wc = 0x1F918, .width = 2 },
    162 	{ .wc = 0x1F919, .width = 2 },
    163 	{ .wc = 0x1F91A, .width = 2 },
    164 	{ .wc = 0x1F91B, .width = 2 },
    165 	{ .wc = 0x1F91C, .width = 2 },
    166 	{ .wc = 0x1F91D, .width = 2 },
    167 	{ .wc = 0x1F91E, .width = 2 },
    168 	{ .wc = 0x1F91F, .width = 2 },
    169 	{ .wc = 0x1F926, .width = 2 },
    170 	{ .wc = 0x1F930, .width = 2 },
    171 	{ .wc = 0x1F931, .width = 2 },
    172 	{ .wc = 0x1F932, .width = 2 },
    173 	{ .wc = 0x1F933, .width = 2 },
    174 	{ .wc = 0x1F934, .width = 2 },
    175 	{ .wc = 0x1F935, .width = 2 },
    176 	{ .wc = 0x1F936, .width = 2 },
    177 	{ .wc = 0x1F937, .width = 2 },
    178 	{ .wc = 0x1F938, .width = 2 },
    179 	{ .wc = 0x1F939, .width = 2 },
    180 	{ .wc = 0x1F93D, .width = 2 },
    181 	{ .wc = 0x1F93E, .width = 2 },
    182 	{ .wc = 0x1F977, .width = 2 },
    183 	{ .wc = 0x1F9B5, .width = 2 },
    184 	{ .wc = 0x1F9B6, .width = 2 },
    185 	{ .wc = 0x1F9B8, .width = 2 },
    186 	{ .wc = 0x1F9B9, .width = 2 },
    187 	{ .wc = 0x1F9BB, .width = 2 },
    188 	{ .wc = 0x1F9CD, .width = 2 },
    189 	{ .wc = 0x1F9CE, .width = 2 },
    190 	{ .wc = 0x1F9CF, .width = 2 },
    191 	{ .wc = 0x1F9D1, .width = 2 },
    192 	{ .wc = 0x1F9D2, .width = 2 },
    193 	{ .wc = 0x1F9D3, .width = 2 },
    194 	{ .wc = 0x1F9D4, .width = 2 },
    195 	{ .wc = 0x1F9D5, .width = 2 },
    196 	{ .wc = 0x1F9D6, .width = 2 },
    197 	{ .wc = 0x1F9D7, .width = 2 },
    198 	{ .wc = 0x1F9D8, .width = 2 },
    199 	{ .wc = 0x1F9D9, .width = 2 },
    200 	{ .wc = 0x1F9DA, .width = 2 },
    201 	{ .wc = 0x1F9DB, .width = 2 },
    202 	{ .wc = 0x1F9DC, .width = 2 },
    203 	{ .wc = 0x1F9DD, .width = 2 },
    204 	{ .wc = 0x1FAC3, .width = 2 },
    205 	{ .wc = 0x1FAC4, .width = 2 },
    206 	{ .wc = 0x1FAC5, .width = 2 },
    207 	{ .wc = 0x1FAF0, .width = 2 },
    208 	{ .wc = 0x1FAF1, .width = 2 },
    209 	{ .wc = 0x1FAF2, .width = 2 },
    210 	{ .wc = 0x1FAF3, .width = 2 },
    211 	{ .wc = 0x1FAF4, .width = 2 },
    212 	{ .wc = 0x1FAF5, .width = 2 },
    213 	{ .wc = 0x1FAF6, .width = 2 },
    214 	{ .wc = 0x1FAF7, .width = 2 },
    215 	{ .wc = 0x1FAF8, .width = 2 }
    216 };
    217 
    218 struct utf8_item {
    219 	RB_ENTRY(utf8_item)	index_entry;
    220 	u_int			index;
    221 
    222 	RB_ENTRY(utf8_item)	data_entry;
    223 	char			data[UTF8_SIZE];
    224 	u_char			size;
    225 };
    226 
    227 static int
    228 utf8_data_cmp(struct utf8_item *ui1, struct utf8_item *ui2)
    229 {
    230 	if (ui1->size < ui2->size)
    231 		return (-1);
    232 	if (ui1->size > ui2->size)
    233 		return (1);
    234 	return (memcmp(ui1->data, ui2->data, ui1->size));
    235 }
    236 RB_HEAD(utf8_data_tree, utf8_item);
    237 RB_GENERATE_STATIC(utf8_data_tree, utf8_item, data_entry, utf8_data_cmp);
    238 static struct utf8_data_tree utf8_data_tree = RB_INITIALIZER(utf8_data_tree);
    239 
    240 static int
    241 utf8_index_cmp(struct utf8_item *ui1, struct utf8_item *ui2)
    242 {
    243 	if (ui1->index < ui2->index)
    244 		return (-1);
    245 	if (ui1->index > ui2->index)
    246 		return (1);
    247 	return (0);
    248 }
    249 RB_HEAD(utf8_index_tree, utf8_item);
    250 RB_GENERATE_STATIC(utf8_index_tree, utf8_item, index_entry, utf8_index_cmp);
    251 static struct utf8_index_tree utf8_index_tree = RB_INITIALIZER(utf8_index_tree);
    252 
    253 static int	utf8_no_width;
    254 static u_int	utf8_next_index;
    255 
    256 #define UTF8_GET_SIZE(uc) (((uc) >> 24) & 0x1f)
    257 #define UTF8_GET_WIDTH(uc) (((uc) >> 29) - 1)
    258 
    259 #define UTF8_SET_SIZE(size) (((utf8_char)(size)) << 24)
    260 #define UTF8_SET_WIDTH(width) ((((utf8_char)(width)) + 1) << 29)
    261 
    262 /* Get a UTF-8 item from data. */
    263 static struct utf8_item *
    264 utf8_item_by_data(const u_char *data, size_t size)
    265 {
    266 	struct utf8_item	ui;
    267 
    268 	memcpy(ui.data, data, size);
    269 	ui.size = size;
    270 
    271 	return (RB_FIND(utf8_data_tree, &utf8_data_tree, &ui));
    272 }
    273 
    274 /* Get a UTF-8 item from data. */
    275 static struct utf8_item *
    276 utf8_item_by_index(u_int index)
    277 {
    278 	struct utf8_item	ui;
    279 
    280 	ui.index = index;
    281 
    282 	return (RB_FIND(utf8_index_tree, &utf8_index_tree, &ui));
    283 }
    284 
    285 /* Find a codepoint in the cache. */
    286 static struct utf8_width_item *
    287 utf8_find_in_width_cache(wchar_t wc)
    288 {
    289 	struct utf8_width_item	uw;
    290 
    291 	uw.wc = wc;
    292 	return RB_FIND(utf8_width_cache, &utf8_width_cache, &uw);
    293 }
    294 
    295 /* Parse a single codepoint option. */
    296 static void
    297 utf8_add_to_width_cache(const char *s)
    298 {
    299 	struct utf8_width_item	*uw, *old;
    300 	char			*copy, *cp, *endptr;
    301 	u_int			 width;
    302 	const char		*errstr;
    303 	struct utf8_data	*ud;
    304 	wchar_t			 wc;
    305 	unsigned long long	 n;
    306 
    307 	copy = xstrdup(s);
    308 	if ((cp = strchr(copy, '=')) == NULL) {
    309 		free(copy);
    310 		return;
    311 	}
    312 	*cp++ = '\0';
    313 
    314 	width = strtonum(cp, 0, 2, &errstr);
    315 	if (errstr != NULL) {
    316 		free(copy);
    317 		return;
    318 	}
    319 
    320 	if (strncmp(copy, "U+", 2) == 0) {
    321 		errno = 0;
    322 		n = strtoull(copy + 2, &endptr, 16);
    323 		if (copy[2] == '\0' ||
    324 		    *endptr != '\0' ||
    325 		    n == 0 ||
    326 		    n > WCHAR_MAX ||
    327 		    (errno == ERANGE && n == ULLONG_MAX)) {
    328 			free(copy);
    329 			return;
    330 		}
    331 		wc = n;
    332 	} else {
    333 		utf8_no_width = 1;
    334 		ud = utf8_fromcstr(copy);
    335 		utf8_no_width = 0;
    336 		if (ud[0].size == 0 || ud[1].size != 0) {
    337 			free(ud);
    338 			free(copy);
    339 			return;
    340 		}
    341 #ifdef HAVE_UTF8PROC
    342 		if (utf8proc_mbtowc(&wc, ud[0].data, ud[0].size) <= 0) {
    343 #else
    344 		if (mbtowc(&wc, (char *)ud[0].data, ud[0].size) <= 0) {
    345 #endif
    346 			free(ud);
    347 			free(copy);
    348 			return;
    349 		}
    350 		free(ud);
    351 	}
    352 
    353 	log_debug("Unicode width cache: %08X=%u", (u_int)wc, width);
    354 
    355 	uw = xcalloc(1, sizeof *uw);
    356 	uw->wc = wc;
    357 	uw->width = width;
    358 	uw->allocated = 1;
    359 
    360 	old = RB_INSERT(utf8_width_cache, &utf8_width_cache, uw);
    361 	if (old != NULL) {
    362 		RB_REMOVE(utf8_width_cache, &utf8_width_cache, old);
    363 		if (old->allocated)
    364 			free(old);
    365 		RB_INSERT(utf8_width_cache, &utf8_width_cache, uw);
    366 	}
    367 
    368 	free(copy);
    369 }
    370 
    371 /* Rebuild cache of widths. */
    372 void
    373 utf8_update_width_cache(void)
    374 {
    375 	struct utf8_width_item		*uw, *uw1;
    376 	struct options_entry		*o;
    377 	struct options_array_item	*a;
    378 	u_int				 i;
    379 
    380 	RB_FOREACH_SAFE (uw, utf8_width_cache, &utf8_width_cache, uw1) {
    381 		RB_REMOVE(utf8_width_cache, &utf8_width_cache, uw);
    382 		if (uw->allocated)
    383 			free(uw);
    384 	}
    385 
    386 	for (i = 0; i < nitems(utf8_default_width_cache); i++) {
    387 		RB_INSERT(utf8_width_cache, &utf8_width_cache,
    388 		    &utf8_default_width_cache[i]);
    389 	}
    390 
    391 	o = options_get(global_options, "codepoint-widths");
    392 	a = options_array_first(o);
    393 	while (a != NULL) {
    394 		utf8_add_to_width_cache(options_array_item_value(a)->string);
    395 		a = options_array_next(a);
    396 	}
    397 }
    398 
    399 /* Add a UTF-8 item. */
    400 static int
    401 utf8_put_item(const u_char *data, size_t size, u_int *index)
    402 {
    403 	struct utf8_item	*ui;
    404 
    405 	ui = utf8_item_by_data((const unsigned char *)data, size);
    406 	if (ui != NULL) {
    407 		*index = ui->index;
    408 		log_debug("%s: found %.*s = %u", __func__, (int)size, data,
    409 		    *index);
    410 		return (0);
    411 	}
    412 
    413 	if (utf8_next_index == 0xffffff + 1)
    414 		return (-1);
    415 
    416 	ui = xcalloc(1, sizeof *ui);
    417 	ui->index = utf8_next_index++;
    418 	RB_INSERT(utf8_index_tree, &utf8_index_tree, ui);
    419 
    420 	memcpy(ui->data, data, size);
    421 	ui->size = size;
    422 	RB_INSERT(utf8_data_tree, &utf8_data_tree, ui);
    423 
    424 	*index = ui->index;
    425 	log_debug("%s: added %.*s = %u", __func__, (int)size, data, *index);
    426 	return (0);
    427 }
    428 
    429 /* Get UTF-8 character from data. */
    430 enum utf8_state
    431 utf8_from_data(const struct utf8_data *ud, utf8_char *uc)
    432 {
    433 	u_int	index;
    434 
    435 	if (ud->width > 2)
    436 		fatalx("invalid UTF-8 width: %u", ud->width);
    437 
    438 	if (ud->size > UTF8_SIZE)
    439 		goto fail;
    440 	if (ud->size <= 3) {
    441 		index = (((utf8_char)ud->data[2] << 16)|
    442 			  ((utf8_char)ud->data[1] << 8)|
    443 			  ((utf8_char)ud->data[0]));
    444 	} else if (utf8_put_item(ud->data, ud->size, &index) != 0)
    445 		goto fail;
    446 	*uc = UTF8_SET_SIZE(ud->size)|UTF8_SET_WIDTH(ud->width)|index;
    447 	log_debug("%s: (%d %d %.*s) -> %08x", __func__, ud->width, ud->size,
    448 	    (int)ud->size, ud->data, *uc);
    449 	return (UTF8_DONE);
    450 
    451 fail:
    452 	if (ud->width == 0)
    453 		*uc = UTF8_SET_SIZE(0)|UTF8_SET_WIDTH(0);
    454 	else if (ud->width == 1)
    455 		*uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x20;
    456 	else
    457 		*uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x2020;
    458 	return (UTF8_ERROR);
    459 }
    460 
    461 /* Get UTF-8 data from character. */
    462 void
    463 utf8_to_data(utf8_char uc, struct utf8_data *ud)
    464 {
    465 	struct utf8_item	*ui;
    466 	u_int			 index;
    467 
    468 	memset(ud, 0, sizeof *ud);
    469 	ud->size = ud->have = UTF8_GET_SIZE(uc);
    470 	ud->width = UTF8_GET_WIDTH(uc);
    471 
    472 	if (ud->size <= 3) {
    473 		ud->data[2] = (uc >> 16);
    474 		ud->data[1] = ((uc >> 8) & 0xff);
    475 		ud->data[0] = (uc & 0xff);
    476 	} else {
    477 		index = (uc & 0xffffff);
    478 		if ((ui = utf8_item_by_index(index)) == NULL)
    479 			memset(ud->data, ' ', ud->size);
    480 		else
    481 			memcpy(ud->data, ui->data, ud->size);
    482 	}
    483 
    484 	log_debug("%s: %08x -> (%d %d %.*s)", __func__, uc, ud->width, ud->size,
    485 	    (int)ud->size, ud->data);
    486 }
    487 
    488 /* Get UTF-8 character from a single ASCII character. */
    489 u_int
    490 utf8_build_one(u_char ch)
    491 {
    492 	return (UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|ch);
    493 }
    494 
    495 /* Set a single character. */
    496 void
    497 utf8_set(struct utf8_data *ud, u_char ch)
    498 {
    499 	static const struct utf8_data empty = { { 0 }, 1, 1, 1 };
    500 
    501 	memcpy(ud, &empty, sizeof *ud);
    502 	*ud->data = ch;
    503 }
    504 
    505 /* Copy UTF-8 character. */
    506 void
    507 utf8_copy(struct utf8_data *to, const struct utf8_data *from)
    508 {
    509 	u_int	i;
    510 
    511 	memcpy(to, from, sizeof *to);
    512 
    513 	for (i = to->size; i < sizeof to->data; i++)
    514 		to->data[i] = '\0';
    515 }
    516 
    517 /* Get width of Unicode character. */
    518 static enum utf8_state
    519 utf8_width(struct utf8_data *ud, int *width)
    520 {
    521 	struct utf8_width_item	*uw;
    522 	wchar_t			 wc;
    523 
    524 	if (utf8_towc(ud, &wc) != UTF8_DONE)
    525 		return (UTF8_ERROR);
    526 	uw = utf8_find_in_width_cache(wc);
    527 	if (uw != NULL) {
    528 		*width = uw->width;
    529 		log_debug("cached width for %08X is %d", (u_int)wc, *width);
    530 		return (UTF8_DONE);
    531 	}
    532 #ifdef HAVE_UTF8PROC
    533 	*width = utf8proc_wcwidth(wc);
    534 	log_debug("utf8proc_wcwidth(%05X) returned %d", (u_int)wc, *width);
    535 #else
    536 	*width = wcwidth(wc);
    537 	log_debug("wcwidth(%05X) returned %d", (u_int)wc, *width);
    538 	if (*width < 0) {
    539 		/*
    540 		 * C1 control characters are nonprintable, so they are always
    541 		 * zero width.
    542 		 */
    543 		*width = (wc >= 0x80 && wc <= 0x9f) ? 0 : 1;
    544 	}
    545 #endif
    546 	if (*width >= 0 && *width <= 0xff)
    547 		return (UTF8_DONE);
    548 	return (UTF8_ERROR);
    549 }
    550 
    551 /* Convert UTF-8 character to wide character. */
    552 enum utf8_state
    553 utf8_towc(const struct utf8_data *ud, wchar_t *wc)
    554 {
    555 #ifdef HAVE_UTF8PROC
    556 	switch (utf8proc_mbtowc(wc, ud->data, ud->size)) {
    557 #else
    558 	switch (mbtowc(wc, __UNCONST(ud->data), ud->size)) {
    559 #endif
    560 	case -1:
    561 		log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
    562 		    errno);
    563 		mbtowc(NULL, NULL, MB_CUR_MAX);
    564 		return (UTF8_ERROR);
    565 	case 0:
    566 		return (UTF8_ERROR);
    567 	}
    568 	log_debug("UTF-8 %.*s is %05X", (int)ud->size, ud->data, (u_int)*wc);
    569 	return (UTF8_DONE);
    570 }
    571 
    572 /* Convert wide character to UTF-8 character. */
    573 enum utf8_state
    574 utf8_fromwc(wchar_t wc, struct utf8_data *ud)
    575 {
    576 	int	size, width;
    577 
    578 #ifdef HAVE_UTF8PROC
    579 	size = utf8proc_wctomb(ud->data, wc);
    580 #else
    581 	size = wctomb((char *)ud->data, wc);
    582 #endif
    583 	if (size < 0) {
    584 		log_debug("UTF-8 %d, wctomb() %d", wc, errno);
    585 		wctomb(NULL, 0);
    586 		return (UTF8_ERROR);
    587 	}
    588 	if (size == 0)
    589 		return (UTF8_ERROR);
    590 	ud->size = ud->have = size;
    591 	if (utf8_width(ud, &width) == UTF8_DONE) {
    592 		ud->width = width;
    593 		return (UTF8_DONE);
    594 	}
    595 	return (UTF8_ERROR);
    596 }
    597 
    598 /*
    599  * Open UTF-8 sequence.
    600  *
    601  * 11000010-11011111 C2-DF start of 2-byte sequence
    602  * 11100000-11101111 E0-EF start of 3-byte sequence
    603  * 11110000-11110100 F0-F4 start of 4-byte sequence
    604  */
    605 enum utf8_state
    606 utf8_open(struct utf8_data *ud, u_char ch)
    607 {
    608 	memset(ud, 0, sizeof *ud);
    609 	if (ch >= 0xc2 && ch <= 0xdf)
    610 		ud->size = 2;
    611 	else if (ch >= 0xe0 && ch <= 0xef)
    612 		ud->size = 3;
    613 	else if (ch >= 0xf0 && ch <= 0xf4)
    614 		ud->size = 4;
    615 	else
    616 		return (UTF8_ERROR);
    617 	utf8_append(ud, ch);
    618 	return (UTF8_MORE);
    619 }
    620 
    621 /* Append character to UTF-8, closing if finished. */
    622 enum utf8_state
    623 utf8_append(struct utf8_data *ud, u_char ch)
    624 {
    625 	int	width;
    626 
    627 	if (ud->have >= ud->size)
    628 		fatalx("UTF-8 character overflow");
    629 	if (ud->size > sizeof ud->data)
    630 		fatalx("UTF-8 character size too large");
    631 
    632 	if (ud->have != 0 && (ch & 0xc0) != 0x80)
    633 		ud->width = 0xff;
    634 
    635 	ud->data[ud->have++] = ch;
    636 	if (ud->have != ud->size)
    637 		return (UTF8_MORE);
    638 
    639 	if (!utf8_no_width) {
    640 		if (ud->width == 0xff)
    641 			return (UTF8_ERROR);
    642 		if (utf8_width(ud, &width) != UTF8_DONE)
    643 			return (UTF8_ERROR);
    644 		ud->width = width;
    645 	}
    646 
    647 	return (UTF8_DONE);
    648 }
    649 
    650 /*
    651  * Encode len characters from src into dst, which is guaranteed to have four
    652  * bytes available for each character from src (for \abc or UTF-8) plus space
    653  * for \0.
    654  */
    655 int
    656 utf8_strvis(char *dst, const char *src, size_t len, int flag)
    657 {
    658 	struct utf8_data	 ud;
    659 	const char		*start = dst, *end = src + len;
    660 	enum utf8_state		 more;
    661 	size_t			 i;
    662 
    663 	while (src < end) {
    664 		if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
    665 			while (++src < end && more == UTF8_MORE)
    666 				more = utf8_append(&ud, *src);
    667 			if (more == UTF8_DONE) {
    668 				/* UTF-8 character finished. */
    669 				for (i = 0; i < ud.size; i++)
    670 					*dst++ = ud.data[i];
    671 				continue;
    672 			}
    673 			/* Not a complete, valid UTF-8 character. */
    674 			src -= ud.have;
    675 		}
    676 		if ((flag & VIS_DQ) && src[0] == '$' && src < end - 1) {
    677 			if (isalpha((u_char)src[1]) ||
    678 			    src[1] == '_' ||
    679 			    src[1] == '{')
    680 				*dst++ = '\\';
    681 			*dst++ = '$';
    682 		} else if (src < end - 1)
    683 			dst = vis(dst, src[0], flag, src[1]);
    684 		else if (src < end)
    685 			dst = vis(dst, src[0], flag, '\0');
    686 		src++;
    687 	}
    688 	*dst = '\0';
    689 	return (dst - start);
    690 }
    691 
/*
 * Same as utf8_strvis but allocate the buffer. The result is stored in *dst
 * and its length, not counting the terminating \0, is returned.
 */
int
utf8_stravis(char **dst, const char *src, int flag)
{
	size_t	 srclen = strlen(src);
	char	*out;
	int	 outlen;

	/* Four bytes for each source byte, including its \0. */
	out = xreallocarray(NULL, 4, srclen + 1);
	outlen = utf8_strvis(out, src, srclen, flag);

	/* Trim to what was actually used. */
	*dst = xrealloc(out, outlen + 1);
	return (outlen);
}
    705 
    706 /* Same as utf8_strvis but allocate the buffer. */
    707 int
    708 utf8_stravisx(char **dst, const char *src, size_t srclen, int flag)
    709 {
    710 	char	*buf;
    711 	int	 len;
    712 
    713 	buf = xreallocarray(NULL, 4, srclen + 1);
    714 	len = utf8_strvis(buf, src, srclen, flag);
    715 
    716 	*dst = xrealloc(buf, len + 1);
    717 	return (len);
    718 }
    719 
    720 /* Does this string contain anything that isn't valid UTF-8? */
    721 int
    722 utf8_isvalid(const char *s)
    723 {
    724 	struct utf8_data ud;
    725 	const char	*end;
    726 	enum utf8_state	 more;
    727 
    728 	end = s + strlen(s);
    729 	while (s < end) {
    730 		if ((more = utf8_open(&ud, *s)) == UTF8_MORE) {
    731 			while (++s < end && more == UTF8_MORE)
    732 				more = utf8_append(&ud, *s);
    733 			if (more == UTF8_DONE)
    734 				continue;
    735 			return (0);
    736 		}
    737 		if (*s < 0x20 || *s > 0x7e)
    738 			return (0);
    739 		s++;
    740 	}
    741 	return (1);
    742 }
    743 
    744 /*
    745  * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free
    746  * the returned string. Anything not valid printable ASCII or UTF-8 is
    747  * stripped.
    748  */
    749 char *
    750 utf8_sanitize(const char *src)
    751 {
    752 	char		*dst = NULL;
    753 	size_t		 n = 0;
    754 	enum utf8_state	 more;
    755 	struct utf8_data ud;
    756 	u_int		 i;
    757 
    758 	while (*src != '\0') {
    759 		dst = xreallocarray(dst, n + 1, sizeof *dst);
    760 		if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
    761 			while (*++src != '\0' && more == UTF8_MORE)
    762 				more = utf8_append(&ud, *src);
    763 			if (more == UTF8_DONE) {
    764 				dst = xreallocarray(dst, n + ud.width,
    765 				    sizeof *dst);
    766 				for (i = 0; i < ud.width; i++)
    767 					dst[n++] = '_';
    768 				continue;
    769 			}
    770 			src -= ud.have;
    771 		}
    772 		if (*src > 0x1f && *src < 0x7f)
    773 			dst[n++] = *src;
    774 		else
    775 			dst[n++] = '_';
    776 		src++;
    777 	}
    778 	dst = xreallocarray(dst, n + 1, sizeof *dst);
    779 	dst[n] = '\0';
    780 	return (dst);
    781 }
    782 
    783 /* Get UTF-8 buffer length. */
    784 size_t
    785 utf8_strlen(const struct utf8_data *s)
    786 {
    787 	size_t	i;
    788 
    789 	for (i = 0; s[i].size != 0; i++)
    790 		/* nothing */;
    791 	return (i);
    792 }
    793 
    794 /* Get UTF-8 string width. */
    795 u_int
    796 utf8_strwidth(const struct utf8_data *s, ssize_t n)
    797 {
    798 	ssize_t	i;
    799 	u_int	width = 0;
    800 
    801 	for (i = 0; s[i].size != 0; i++) {
    802 		if (n != -1 && n == i)
    803 			break;
    804 		width += s[i].width;
    805 	}
    806 	return (width);
    807 }
    808 
    809 /*
    810  * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0.
    811  * Caller frees.
    812  */
    813 struct utf8_data *
    814 utf8_fromcstr(const char *src)
    815 {
    816 	struct utf8_data	*dst = NULL;
    817 	size_t			 n = 0;
    818 	enum utf8_state		 more;
    819 
    820 	while (*src != '\0') {
    821 		dst = xreallocarray(dst, n + 1, sizeof *dst);
    822 		if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) {
    823 			while (*++src != '\0' && more == UTF8_MORE)
    824 				more = utf8_append(&dst[n], *src);
    825 			if (more == UTF8_DONE) {
    826 				n++;
    827 				continue;
    828 			}
    829 			src -= dst[n].have;
    830 		}
    831 		utf8_set(&dst[n], *src);
    832 		n++;
    833 		src++;
    834 	}
    835 	dst = xreallocarray(dst, n + 1, sizeof *dst);
    836 	dst[n].size = 0;
    837 	return (dst);
    838 }
    839 
    840 /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */
    841 char *
    842 utf8_tocstr(struct utf8_data *src)
    843 {
    844 	char	*dst = NULL;
    845 	size_t	 n = 0;
    846 
    847 	for(; src->size != 0; src++) {
    848 		dst = xreallocarray(dst, n + src->size, 1);
    849 		memcpy(dst + n, src->data, src->size);
    850 		n += src->size;
    851 	}
    852 	dst = xreallocarray(dst, n + 1, 1);
    853 	dst[n] = '\0';
    854 	return (dst);
    855 }
    856 
    857 /* Get width of UTF-8 string. */
    858 u_int
    859 utf8_cstrwidth(const char *s)
    860 {
    861 	struct utf8_data	tmp;
    862 	u_int			width;
    863 	enum utf8_state		more;
    864 
    865 	width = 0;
    866 	while (*s != '\0') {
    867 		if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) {
    868 			while (*++s != '\0' && more == UTF8_MORE)
    869 				more = utf8_append(&tmp, *s);
    870 			if (more == UTF8_DONE) {
    871 				width += tmp.width;
    872 				continue;
    873 			}
    874 			s -= tmp.have;
    875 		}
    876 		if (*s > 0x1f && *s != 0x7f)
    877 			width++;
    878 		s++;
    879 	}
    880 	return (width);
    881 }
    882 
    883 /* Pad UTF-8 string to width on the left. Caller frees. */
    884 char *
    885 utf8_padcstr(const char *s, u_int width)
    886 {
    887 	size_t	 slen;
    888 	char	*out;
    889 	u_int	 n, i;
    890 
    891 	n = utf8_cstrwidth(s);
    892 	if (n >= width)
    893 		return (xstrdup(s));
    894 
    895 	slen = strlen(s);
    896 	out = xmalloc(slen + 1 + (width - n));
    897 	memcpy(out, s, slen);
    898 	for (i = n; i < width; i++)
    899 		out[slen++] = ' ';
    900 	out[slen] = '\0';
    901 	return (out);
    902 }
    903 
    904 /* Pad UTF-8 string to width on the right. Caller frees. */
    905 char *
    906 utf8_rpadcstr(const char *s, u_int width)
    907 {
    908 	size_t	 slen;
    909 	char	*out;
    910 	u_int	 n, i;
    911 
    912 	n = utf8_cstrwidth(s);
    913 	if (n >= width)
    914 		return (xstrdup(s));
    915 
    916 	slen = strlen(s);
    917 	out = xmalloc(slen + 1 + (width - n));
    918 	for (i = 0; i < width - n; i++)
    919 		out[i] = ' ';
    920 	memcpy(out + i, s, slen);
    921 	out[i + slen] = '\0';
    922 	return (out);
    923 }
    924 
    925 int
    926 utf8_cstrhas(const char *s, const struct utf8_data *ud)
    927 {
    928 	struct utf8_data	*copy, *loop;
    929 	int			 found = 0;
    930 
    931 	copy = utf8_fromcstr(s);
    932 	for (loop = copy; loop->size != 0; loop++) {
    933 		if (loop->size != ud->size)
    934 			continue;
    935 		if (memcmp(loop->data, ud->data, loop->size) == 0) {
    936 			found = 1;
    937 			break;
    938 		}
    939 	}
    940 	free(copy);
    941 
    942 	return (found);
    943 }
    944