ptydata.c revision 04b94745
1/* $XTermId: ptydata.c,v 1.160 2024/05/10 22:54:17 tom Exp $ */
2
3/*
4 * Copyright 1999-2023,2024 by Thomas E. Dickey
5 *
6 *                         All Rights Reserved
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a
9 * copy of this software and associated documentation files (the
10 * "Software"), to deal in the Software without restriction, including
11 * without limitation the rights to use, copy, modify, merge, publish,
12 * distribute, sublicense, and/or sell copies of the Software, and to
13 * permit persons to whom the Software is furnished to do so, subject to
14 * the following conditions:
15 *
16 * The above copyright notice and this permission notice shall be included
17 * in all copies or substantial portions of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
22 * IN NO EVENT SHALL THE ABOVE LISTED COPYRIGHT HOLDER(S) BE LIABLE FOR ANY
23 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 *
27 * Except as contained in this notice, the name(s) of the above copyright
28 * holders shall not be used in advertising or otherwise to promote the
29 * sale, use or other dealings in this Software without prior written
30 * authorization.
31 */
32
33#include <data.h>
34
35#if OPT_WIDE_CHARS
36#include <menu.h>
37#include <wcwidth.h>
38#endif
39
#ifdef TEST_DRIVER
/* In the standalone test driver, TRACE output is always printed via printf. */
#undef TRACE
#define TRACE(p) if (1) printf p
/* TRACE2 calls are still compiled, but never executed. */
#undef TRACE2
#define TRACE2(p) if (0) printf p
/* Stand-in which ignores its arguments and yields a fixed string. */
#define visibleChars(buf, len) "buffer"
#endif
47
/*
 * Test an errno value for "try again".  Some supposedly POSIX systems are
 * broken and report EWOULDBLOCK where EAGAIN is expected, so both codes are
 * accepted when both are defined; otherwise whichever one exists is used.
 *
 * Caution: the argument may be evaluated more than once.
 */
#if defined(EAGAIN) && defined(EWOULDBLOCK)
#define E_TEST(err) ((err) == EAGAIN || (err) == EWOULDBLOCK)
#elif defined(EAGAIN)
#define E_TEST(err) ((err) == EAGAIN)
#else
#define E_TEST(err) ((err) == EWOULDBLOCK)
#endif
62
63#if OPT_WIDE_CHARS
/*
 * Convert the 8-bit codes in data->buffer[] into Unicode in data->utf_data.
 * The number of bytes converted will be nonzero iff there is data.
 *
 * The bytes from data->next up to (but not including) data->last are scanned
 * and at most one character is decoded per call:
 *
 *   data->utf_data  receives the character, or UCS_REPL for malformed input.
 *   data->utf_size  receives the number of bytes which that result accounts
 *                   for; data->next itself is not modified here.
 *
 * If the available bytes run out in the middle of a multibyte sequence, the
 * loop ends without storing a result, data->utf_size stays zero and the
 * function returns false.
 *
 * Returns true iff data->utf_size is nonzero.
 */
Bool
decodeUtf8(TScreen *screen, PtyData *data)
{
    size_t i;
    size_t length = (size_t) (data->last - data->next);
    int utf_count = 0;		/* continuation bytes still expected */
    unsigned utf_char = 0;	/* value accumulated from the sequence so far */

    data->utf_size = 0;
    for (i = 0; i < length; i++) {
	unsigned c = data->next[i];

	/* Combine UTF-8 into Unicode */
	if (c < 0x80) {
	    /* We received an ASCII character */
	    if (utf_count > 0) {
		/*
		 * Report the broken sequence only; this byte is not counted
		 * in utf_size.
		 */
		data->utf_data = UCS_REPL;	/* prev. sequence incomplete */
		data->utf_size = i;
	    } else {
		data->utf_data = (IChar) c;
		data->utf_size = 1;
	    }
	    break;
	} else if (screen->vt100_graphics
		   && (c < 0x100)
		   && (utf_count == 0)
		   && screen->gsets[(int) screen->curgr] != nrc_ASCII) {
	    /*
	     * Outside of a multibyte sequence, with vt100_graphics set and
	     * the character set selected by curgr not nrc_ASCII, the byte is
	     * passed through undecoded.
	     */
	    data->utf_data = (IChar) c;
	    data->utf_size = 1;
	    break;
	} else if (c < 0xc0) {
	    /* We received a continuation byte */
	    if (utf_count < 1) {
		if (screen->c1_printable) {
		    data->utf_data = (IChar) c;
		} else if ((i + 1) < length
			   && data->next[i + 1] > 0x20
			   && data->next[i + 1] < 0x80) {
		    /*
		     * Allow for C1 control string if the next byte is
		     * available for inspection.
		     */
		    data->utf_data = (IChar) c;
		} else {
		    /*
		     * We received a continuation byte before receiving a
		     * sequence state, or a failed attempt to use a C1 control
		     * string.
		     */
		    data->utf_data = (IChar) UCS_REPL;
		}
		data->utf_size = (i + 1);
		break;
	    } else if (screen->utf8_weblike
		       && (utf_count == 3
			   && utf_char == 0x04
			   && c >= 0x90)) {
		/* The encoding would form a code point beyond U+10FFFF. */
		data->utf_size = i;
		data->utf_data = UCS_REPL;
		break;
	    } else if (screen->utf8_weblike
		       && (utf_count == 2
			   && utf_char == 0x0d
			   && c >= 0xa0)) {
		/* The encoding would form a surrogate code point. */
		data->utf_size = i;
		data->utf_data = UCS_REPL;
		break;
	    } else {
		/* Check for overlong UTF-8 sequences for which a shorter
		 * encoding would exist and replace them with UCS_REPL.
		 * An overlong UTF-8 sequence can have any of the following
		 * forms:
		 *   1100000x 10xxxxxx
		 *   11100000 100xxxxx 10xxxxxx
		 *   11110000 1000xxxx 10xxxxxx 10xxxxxx
		 *   11111000 10000xxx 10xxxxxx 10xxxxxx 10xxxxxx
		 *   11111100 100000xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
		 */
		if (!utf_char && !((c & 0x7f) >> (7 - utf_count))) {
		    if (screen->utf8_weblike) {
			/* overlong sequence continued */
			data->utf_data = UCS_REPL;
			data->utf_size = i;
			break;
		    } else {
			utf_char = UCS_REPL;
		    }
		}
		/* append the six payload bits of this continuation byte */
		utf_char <<= 6;
		utf_char |= (c & 0x3f);
		if ((utf_char >= 0xd800 &&
		     utf_char <= 0xdfff) ||
		    (utf_char == 0xfffe) ||
		    (utf_char == HIDDEN_CHAR)) {
		    utf_char = UCS_REPL;
		}
		utf_count--;
		if (utf_count == 0) {
		    /* the sequence is complete: deliver the result */
#if !OPT_WIDER_ICHAR
		    /* characters outside UCS-2 become UCS_REPL */
		    if (utf_char > NARROW_ICHAR) {
			TRACE(("using replacement for %#x\n", utf_char));
			utf_char = UCS_REPL;
		    }
#endif
		    data->utf_data = (IChar) utf_char;
		    data->utf_size = (i + 1);
		    break;
		}
	    }
	} else {
	    /* We received a sequence start byte */
	    if (utf_count > 0) {
		/* previous sequence is incomplete */
		data->utf_data = UCS_REPL;
		data->utf_size = i;
		break;
	    }
	    if (screen->utf8_weblike) {
		/*
		 * In this mode only lead bytes below 0xf5 start a sequence
		 * (at most three continuation bytes); an overlong two-byte
		 * lead is rejected immediately.
		 */
		if (c < 0xe0) {
		    if (!(c & 0x1e)) {
			/* overlong sequence start */
			data->utf_data = UCS_REPL;
			data->utf_size = (i + 1);
			break;
		    }
		    utf_count = 1;
		    utf_char = (c & 0x1f);
		} else if (c < 0xf0) {
		    utf_count = 2;
		    utf_char = (c & 0x0f);
		} else if (c < 0xf5) {
		    utf_count = 3;
		    utf_char = (c & 0x07);
		} else {
		    data->utf_data = UCS_REPL;
		    data->utf_size = (i + 1);
		    break;
		}
	    } else {
		/*
		 * Otherwise lead bytes up to 0xfd are accepted (up to five
		 * continuation bytes); only 0xfe and 0xff are rejected here.
		 */
		if (c < 0xe0) {
		    utf_count = 1;
		    utf_char = (c & 0x1f);
		    if (!(c & 0x1e)) {
			/* overlong sequence */
			utf_char = UCS_REPL;
		    }
		} else if (c < 0xf0) {
		    utf_count = 2;
		    utf_char = (c & 0x0f);
		} else if (c < 0xf8) {
		    utf_count = 3;
		    utf_char = (c & 0x07);
		} else if (c < 0xfc) {
		    utf_count = 4;
		    utf_char = (c & 0x03);
		} else if (c < 0xfe) {
		    utf_count = 5;
		    utf_char = (c & 0x01);
		} else {
		    data->utf_data = UCS_REPL;
		    data->utf_size = (i + 1);
		    break;
		}
	    }
	}
    }
#if OPT_TRACE > 1
    TRACE(("UTF-8 char %04X [%lu..%lu]\n",
	   data->utf_data,
	   (unsigned long) (data->next - data->buffer),
	   (unsigned long) (data->next - data->buffer + data->utf_size - 1)));
#endif

    return (data->utf_size != 0);
}
246#endif
247
248int
249readPtyData(XtermWidget xw, PtySelect * select_mask, PtyData *data)
250{
251    TScreen *screen = TScreenOf(xw);
252    int size = 0;
253
254#ifdef VMS
255    if (*select_mask & pty_mask) {
256	trimPtyData(xw, data);
257	if (read_queue.flink != 0) {
258	    size = tt_read(data->next);
259	    if (size == 0) {
260		Panic("input: read returned zero\n", 0);
261	    }
262	} else {
263	    sys$hiber();
264	}
265    }
266#else /* !VMS */
267    if (FD_ISSET(screen->respond, select_mask)) {
268	int save_err;
269	trimPtyData(xw, data);
270
271	size = (int) read(screen->respond, (char *) data->last, (size_t) FRG_SIZE);
272	save_err = errno;
273#if (defined(i386) && defined(SVR4) && defined(sun)) || defined(__CYGWIN__)
274	/*
275	 * Yes, I know this is a majorly f*ugly hack, however it seems to
276	 * be necessary for Solaris x86.  DWH 11/15/94
277	 * Dunno why though..
278	 * (and now CYGWIN, alanh@xfree86.org 08/15/01
279	 */
280	if (size <= 0) {
281	    if (save_err == EIO || save_err == 0)
282		NormalExit();
283	    else if (!E_TEST(save_err))
284		Panic("input: read returned unexpected error (%d)\n", save_err);
285	    size = 0;
286	}
287#else /* !f*ugly */
288	if (size < 0) {
289	    if (save_err == EIO)
290		NormalExit();
291	    else if (!E_TEST(save_err))
292		Panic("input: read returned unexpected error (%d)\n", save_err);
293	    size = 0;
294	} else if (size == 0) {
295#if defined(__FreeBSD__)
296	    NormalExit();
297#else
298	    Panic("input: read returned zero\n", 0);
299#endif
300	}
301#endif /* f*ugly */
302    }
303#endif /* VMS */
304
305    if (size) {
306#if OPT_TRACE
307	int i;
308
309	TRACE(("read %d bytes from pty\n", size));
310	for (i = 0; i < size; i++) {
311	    if (!(i % 16))
312		TRACE(("%s", i ? "\n    " : "READ"));
313	    TRACE((" %02X", data->last[i]));
314	}
315	TRACE(("\n"));
316#endif
317	data->last += size;
318#ifdef ALLOWLOGGING
319	TScreenOf(term)->logstart = VTbuffer->next;
320#endif
321    }
322
323    return (size);
324}
325
326/*
327 * Return the next value from the input buffer.  Note that morePtyData() is
328 * always called before this function, so we can do the UTF-8 input conversion
329 * in that function and simply return the result here.
330 */
331#if OPT_WIDE_CHARS
332IChar
333nextPtyData(TScreen *screen, PtyData *data)
334{
335    IChar result;
336    if (screen->utf8_inparse) {
337	skipPtyData(data, result);
338    } else {
339	result = *((data)->next++);
340	if (!screen->output_eight_bits) {
341	    result = (IChar) (result & 0x7f);
342	}
343    }
344    TRACE2(("nextPtyData returns %#x\n", result));
345    return result;
346}
347#endif
348
349#if OPT_WIDE_CHARS
350/*
351 * Called when UTF-8 mode has been turned on/off.
352 */
353void
354switchPtyData(TScreen *screen, int flag)
355{
356    if (screen->utf8_mode != flag) {
357	screen->utf8_mode = flag;
358	screen->utf8_inparse = (Boolean) (flag != 0);
359	mk_wcwidth_init(screen->utf8_mode);
360
361	TRACE(("turning UTF-8 mode %s\n", BtoS(flag)));
362	update_font_utf8_mode();
363    }
364}
365#endif
366
367/*
368 * Allocate a buffer.
369 */
370void
371initPtyData(PtyData **result)
372{
373    PtyData *data;
374
375    TRACE2(("initPtyData given minBufSize %d, maxBufSize %d\n",
376	    FRG_SIZE, BUF_SIZE));
377
378    if (FRG_SIZE < 64)
379	FRG_SIZE = 64;
380    if (BUF_SIZE < FRG_SIZE)
381	BUF_SIZE = FRG_SIZE;
382    if (BUF_SIZE % FRG_SIZE)
383	BUF_SIZE = BUF_SIZE + FRG_SIZE - (BUF_SIZE % FRG_SIZE);
384
385    TRACE2(("initPtyData using minBufSize %d, maxBufSize %d\n",
386	    FRG_SIZE, BUF_SIZE));
387
388    data = TypeXtMallocX(PtyData, (BUF_SIZE + FRG_SIZE));
389
390    memset(data, 0, sizeof(*data));
391    data->next = data->buffer;
392    data->last = data->buffer;
393    *result = data;
394}
395
396/*
397 * Initialize a buffer for the caller, using its data in 'next'.
398 */
399#if OPT_WIDE_CHARS
400PtyData *
401fakePtyData(PtyData *result, Char *next, Char *last)
402{
403    PtyData *data = result;
404
405    memset(data, 0, sizeof(*data));
406    data->next = next;
407    data->last = last;
408
409    return data;
410}
411#endif
412
413/*
414 * Remove used data by shifting the buffer down, to make room for more data,
415 * e.g., a continuation-read.
416 */
417void
418trimPtyData(XtermWidget xw, PtyData *data)
419{
420    (void) xw;
421    FlushLog(xw);
422
423    if (data->next != data->buffer) {
424	size_t i;
425	size_t n = (size_t) (data->last - data->next);
426
427	TRACE(("shifting buffer down by %lu\n", (unsigned long) n));
428	for (i = 0; i < n; ++i) {
429	    data->buffer[i] = data->next[i];
430	}
431	data->next = data->buffer;
432	data->last = data->next + n;
433    }
434
435}
436
437/*
438 * Insert new data into the input buffer so the next calls to morePtyData()
439 * and nextPtyData() will return that.
440 */
441void
442fillPtyData(XtermWidget xw, PtyData *data, const char *value, size_t length)
443{
444    size_t size;
445    size_t n;
446
447    /* remove the used portion of the buffer */
448    trimPtyData(xw, data);
449
450    VTbuffer->last += length;
451    size = (size_t) (VTbuffer->last - VTbuffer->next);
452
453    /* shift the unused portion up to make room */
454    for (n = size; n >= length; --n)
455	VTbuffer->next[n] = VTbuffer->next[n - length];
456
457    /* insert the new bytes to interpret */
458    for (n = 0; n < length; n++)
459	VTbuffer->next[n] = CharOf(value[n]);
460}
461
462#if OPT_WIDE_CHARS
463/*
464 * Convert an ISO-8859-1 code 'c' to UTF-8, storing the result in the target
465 * 'lp', and returning a pointer past the converted character.
466 */
467Char *
468convertToUTF8(Char *lp, unsigned c)
469{
470#define CH(n) (Char)((c) >> ((n) * 8))
471    if (c < 0x80) {
472	/*  0*******  */
473	*lp++ = (Char) CH(0);
474    } else if (c < 0x800) {
475	/*  110***** 10******  */
476	*lp++ = (Char) (0xc0 | (CH(0) >> 6) | ((CH(1) & 0x07) << 2));
477	*lp++ = (Char) (0x80 | (CH(0) & 0x3f));
478    } else if (c < 0x00010000) {
479	/*  1110**** 10****** 10******  */
480	*lp++ = (Char) (0xe0 | ((int) (CH(1) & 0xf0) >> 4));
481	*lp++ = (Char) (0x80 | (CH(0) >> 6) | ((CH(1) & 0x0f) << 2));
482	*lp++ = (Char) (0x80 | (CH(0) & 0x3f));
483    } else if (c < 0x00200000) {
484	*lp++ = (Char) (0xf0 | ((int) (CH(2) & 0x1f) >> 2));
485	*lp++ = (Char) (0x80 |
486			((int) (CH(1) & 0xf0) >> 4) |
487			((int) (CH(2) & 0x03) << 4));
488	*lp++ = (Char) (0x80 | (CH(0) >> 6) | ((CH(1) & 0x0f) << 2));
489	*lp++ = (Char) (0x80 | (CH(0) & 0x3f));
490    } else if (c < 0x04000000) {
491	*lp++ = (Char) (0xf8 | (CH(3) & 0x03));
492	*lp++ = (Char) (0x80 | (CH(2) >> 2));
493	*lp++ = (Char) (0x80 |
494			((int) (CH(1) & 0xf0) >> 4) |
495			((int) (CH(2) & 0x03) << 4));
496	*lp++ = (Char) (0x80 | (CH(0) >> 6) | ((CH(1) & 0x0f) << 2));
497	*lp++ = (Char) (0x80 | (CH(0) & 0x3f));
498    } else {
499	*lp++ = (Char) (0xfc | ((int) (CH(3) & 0x40) >> 6));
500	*lp++ = (Char) (0x80 | (CH(3) & 0x3f));
501	*lp++ = (Char) (0x80 | (CH(2) >> 2));
502	*lp++ = (Char) (0x80 | (CH(1) >> 4) | ((CH(2) & 0x03) << 4));
503	*lp++ = (Char) (0x80 | (CH(0) >> 6) | ((CH(1) & 0x0f) << 2));
504	*lp++ = (Char) (0x80 | (CH(0) & 0x3f));
505    }
506    return lp;
507#undef CH
508}
509
510/*
511 * Convert a UTF-8 multibyte character to an Unicode value, returning a pointer
512 * past the converted UTF-8 input.  The first 256 values align with ISO-8859-1,
513 * making it possible to use this to convert to Latin-1.
514 *
515 * If the conversion fails, return null.
516 */
517Char *
518convertFromUTF8(Char *lp, unsigned *cp)
519{
520    int want;
521
522    /*
523     * Find the number of bytes we will need from the source.
524     */
525    if ((*lp & 0x80) == 0) {
526	want = 1;
527    } else if ((*lp & 0xe0) == 0xc0) {
528	want = 2;
529    } else if ((*lp & 0xf0) == 0xe0) {
530	want = 3;
531    } else if ((*lp & 0xf8) == 0xf0) {
532	want = 4;
533    } else if ((*lp & 0xfc) == 0xf8) {
534	want = 5;
535    } else if ((*lp & 0xfe) == 0xfc) {
536	want = 6;
537    } else {
538	want = 0;
539    }
540
541    if (want) {
542	int have = 1;
543
544	while (lp[have] != '\0') {
545	    if ((lp[have] & 0xc0) != 0x80)
546		break;
547	    ++have;
548	}
549	if (want == have) {
550	    unsigned mask = 0;
551	    int j;
552	    int shift = 0;
553
554	    *cp = 0;
555	    switch (want) {
556	    case 1:
557		mask = (*lp);
558		break;
559	    case 2:
560		mask = (*lp & 0x1f);
561		break;
562	    case 3:
563		mask = (*lp & 0x0f);
564		break;
565	    case 4:
566		mask = (*lp & 0x07);
567		break;
568	    case 5:
569		mask = (*lp & 0x03);
570		break;
571	    case 6:
572		mask = (*lp & 0x01);
573		break;
574	    default:
575		mask = 0;
576		break;
577	    }
578
579	    for (j = 1; j < want; j++) {
580		*cp |= (unsigned) ((lp[want - j] & 0x3f) << shift);
581		shift += 6;
582	    }
583	    *cp |= mask << shift;
584	    lp += want;
585	} else {
586	    *cp = BAD_ASCII;
587	    lp = NULL;
588	}
589    } else {
590	*cp = BAD_ASCII;
591	lp = NULL;
592    }
593    return lp;
594}
595
596/*
597 * Returns true if the entire string is valid UTF-8.
598 */
599Boolean
600isValidUTF8(Char *lp)
601{
602    Boolean result = True;
603    while (*lp) {
604	unsigned ch;
605	Char *next = convertFromUTF8(lp, &ch);
606	if (next == NULL || ch == 0) {
607	    result = False;
608	    break;
609	}
610	lp = next;
611    }
612    return result;
613}
614
615/*
616 * Write data back to the PTY
617 */
618void
619writePtyData(int f, IChar *d, size_t len)
620{
621    size_t n = (len << 1);
622
623    if (VTbuffer->write_len <= len) {
624	VTbuffer->write_len = n;
625	VTbuffer->write_buf = realloc(VTbuffer->write_buf, VTbuffer->write_len);
626    }
627
628    for (n = 0; n < len; n++)
629	VTbuffer->write_buf[n] = (Char) d[n];
630
631    TRACE(("writePtyData %lu:%s\n", (unsigned long) n,
632	   visibleChars(VTbuffer->write_buf, n)));
633    v_write(f, VTbuffer->write_buf, n);
634}
635#endif /* OPT_WIDE_CHARS */
636
637#ifdef NO_LEAKS
638void
639noleaks_ptydata(void)
640{
641    if (VTbuffer != 0) {
642#if OPT_WIDE_CHARS
643	free(VTbuffer->write_buf);
644#endif
645	FreeAndNull(VTbuffer);
646    }
647}
648#endif
649
650#ifdef TEST_DRIVER
651
652#include "data.c"
653
/* Test-driver replacement: report the call on stderr and exit successfully. */
void
NormalExit(void)
{
    fputs("NormalExit!\n", stderr);
    exit(EXIT_SUCCESS);
}
660
/*
 * Test-driver replacement: the message and its argument are ignored; a fixed
 * notice goes to stderr and the program exits with a failure status.
 */
void
Panic(const char *s, int a)
{
    (void) a;
    (void) s;
    fputs("Panic!\n", stderr);
    exit(EXIT_FAILURE);
}
669
670#if OPT_WIDE_CHARS
671
672#ifdef ALLOWLOGGING
673void
674FlushLog(XtermWidget xw)
675{
676    (void) xw;
677}
678#endif
679
680void
681v_write(int f, const Char *data, size_t len)
682{
683    (void) f;
684    (void) data;
685    (void) len;
686}
687
/* Test-driver replacement which ignores the requested mode. */
void
mk_wcwidth_init(int mode)
{
    (void) mode;		/* unused */
}
693
/* Test-driver replacement which does nothing. */
void
update_font_utf8_mode(void)
{
    /* no fonts to update in the test driver */
}
697}
698
699static int message_level = 0;
700static int opt_all = 0;
701static int opt_illegal = 0;
702static int opt_convert = 0;
703static int opt_reverse = 0;
704static long total_test = 0;
705static long total_errs = 0;
706
/*
 * Print the command-line summary on stderr and exit with a failure status.
 * This function does not return.
 */
static void
usage(void)
{
    static const char *msg[] =
    {
	/* each parameter is a single value or a low-high range */
	"Usage: test_ptydata [options] [c1[-c1b] [c2[-c2b] [...]]]",
	"",
	"Options:",
	" -a  exercise all legal encode/decode to/from UTF-8",
	" -c  call convertFromUTF8 rather than decodeUTF8",
	" -i  ignore illegal UTF-8 when testing -r option",
	" -q  quieter",
	" -r  reverse/decode from UTF-8 byte-string to/from Unicode",
	" -v  more verbose"
    };
    size_t n;
    for (n = 0; n < sizeof(msg) / sizeof(msg[0]); ++n) {
	fprintf(stderr, "%s\n", msg[n]);
    }
    exit(EXIT_FAILURE);
}
728
729/*
730 * http://www.unicode.org/versions/corrigendum1.html, table 3.1B
731 */
732#define OkRange(n,lo,hi) \
733 	if (value[n] < lo || value[n] > hi) { \
734	    result = False; \
735	    break; \
736	}
737static Bool
738is_legal_utf8(const Char *value)
739{
740    Bool result = True;
741    Char ch;
742    while ((ch = *value) != '\0') {
743	if (ch <= 0x7f) {
744	    ++value;
745	} else if (ch >= 0xc2 && ch <= 0xdf) {
746	    OkRange(1, 0x80, 0xbf);
747	    value += 2;
748	} else if (ch == 0xe0) {
749	    OkRange(1, 0xa0, 0xbf);
750	    OkRange(2, 0x80, 0xbf);
751	    value += 3;
752	} else if (ch >= 0xe1 && ch <= 0xef) {
753	    OkRange(1, 0x80, 0xbf);
754	    OkRange(2, 0x80, 0xbf);
755	    value += 3;
756	} else if (ch == 0xf0) {
757	    OkRange(1, 0x90, 0xbf);
758	    OkRange(2, 0x80, 0xbf);
759	    OkRange(3, 0x80, 0xbf);
760	    value += 4;
761	} else if (ch >= 0xf1 && ch <= 0xf3) {
762	    OkRange(1, 0x80, 0xbf);
763	    OkRange(2, 0x80, 0xbf);
764	    OkRange(3, 0x80, 0xbf);
765	    value += 4;
766	} else if (ch == 0xf4) {
767	    OkRange(1, 0x80, 0x8f);
768	    OkRange(2, 0x80, 0xbf);
769	    OkRange(3, 0x80, 0xbf);
770	    value += 4;
771	} else {
772	    result = False;
773	    break;
774	}
775    }
776    return result;
777}
778
779static void
780test_utf8_convert(void)
781{
782    unsigned c_in, c_out;
783    Char buffer[10];
784    Char *result;
785    unsigned limit = 0x110000;
786    unsigned success = 0;
787    unsigned bucket[256];
788
789    memset(bucket, 0, sizeof(bucket));
790    for (c_in = 0; c_in < limit; ++c_in) {
791	memset(buffer, 0, sizeof(buffer));
792	if ((result = convertToUTF8(buffer, c_in)) == 0) {
793	    TRACE(("conversion of U+%04X to UTF-8 failed\n", c_in));
794	} else {
795	    if ((result = convertFromUTF8(buffer, &c_out)) == 0) {
796		TRACE(("conversion of U+%04X from UTF-8 failed\n", c_in));
797	    } else if (c_in != c_out) {
798		TRACE(("conversion of U+%04X to/from UTF-8 gave U+%04X\n",
799		       c_in, c_out));
800	    } else {
801		while (result-- != buffer) {
802		    bucket[*result]++;
803		}
804		++success;
805	    }
806	}
807    }
808    TRACE(("%u/%u successful\n", success, limit));
809    for (c_in = 0; c_in < 256; ++c_in) {
810	if ((c_in % 8) == 0) {
811	    TRACE((" %02X:", c_in));
812	}
813	TRACE((" %8X", bucket[c_in]));
814	if (((c_in + 1) % 8) == 0) {
815	    TRACE(("\n"));
816	}
817    }
818}
819
/*
 * Decode one number from the beginning of 'source'.
 *
 * A "U+" or "u+" prefix selects hexadecimal and "0b" selects binary;
 * otherwise strtol chooses the base from the text (decimal, octal with a
 * leading "0", or hexadecimal with a leading "0x").
 *
 * source - the text to decode
 * target - receives a pointer to the first character not converted
 *
 * Returns the value, or -1 if no digits were found, or if the value is
 * negative or does not fit in an int.  As a side effect, errno is reset.
 */
static int
decode_one(const char *source, char **target)
{
    int result = -1;
    long check;
    int radix = 0;

    if ((source[0] == 'u' || source[0] == 'U') && source[1] == '+') {
	source += 2;
	radix = 16;
    } else if (source[0] == '0' && source[1] == 'b') {
	source += 2;
	radix = 2;
    }

    errno = 0;			/* so that a range error can be detected */
    check = strtol(source, target, radix);
    if (*target != NULL
	&& *target != source
	&& errno != ERANGE
	&& check >= 0
	/* the largest int, computed without relying on <limits.h> */
	&& check <= (long) (~0U >> 1)) {
	result = (int) check;
    }
    return result;
}
838
/*
 * Decode a value, or a pair of values giving a range, from 'source'.
 * The two values may be separated by any of ':', '-', '.', tab or space.
 *
 * On success '*lo' and '*hi' are set and 1 is returned; if there is no
 * usable second value, '*hi' is the same as '*lo'.  If the first value
 * cannot be decoded, 0 is returned.
 */
static int
decode_range(const char *source, int *lo, int *hi)
{
    char *rest;
    char *ignored;

    *lo = decode_one(source, &rest);
    if (*lo < 0)
	return 0;

    /* skip the separator(s), then look for the upper bound */
    rest += strspn(rest, ":-.\t ");
    *hi = decode_one(rest, &ignored);
    if (*hi < 0)
	*hi = *lo;

    return 1;
}
853}
854
855#define MAX_BYTES 6
856
857static void
858do_range(const char *source)
859{
860    int lo, hi;
861
862    TScreen screen;
863    memset(&screen, 0, sizeof(screen));
864
865    if (decode_range(source, &lo, &hi)) {
866	while (lo <= hi) {
867	    unsigned c_in = (unsigned) lo++;
868	    PtyData *data;
869	    Char *next;
870	    Char buffer[MAX_BYTES + 1];
871
872	    if (opt_reverse) {
873		Bool skip = False;
874		Bool first = True;
875		int j, k;
876		for (j = 0; j < MAX_BYTES; ++j) {
877		    unsigned long bits = ((unsigned long) c_in >> (8 * j));
878		    if ((buffer[j] = (Char) bits) == 0) {
879			skip = (bits != 0);
880			break;
881		    }
882		}
883		if (skip)
884		    continue;
885		initPtyData(&data);
886		for (k = 0; k <= j; ++k) {
887		    data->buffer[k] = buffer[j - k - 1];
888		}
889		if (opt_illegal && !is_legal_utf8(data->buffer)) {
890		    free(data);
891		    continue;
892		}
893		if (message_level > 1) {
894		    printf("TEST ");
895		    for (k = 0; k < j; ++k) {
896			printf("%02X", data->buffer[k]);
897		    }
898		}
899		data->next = data->buffer;
900		data->last = data->buffer + j;
901		while (decodeUtf8(&screen, data)) {
902		    total_test++;
903		    if (is_UCS_SPECIAL(data->utf_data))
904			total_errs++;
905		    data->next += data->utf_size;
906		    if (message_level > 1) {
907			printf("%s%04X", first ? " ->" : ", ", data->utf_data);
908		    }
909		    first = False;
910		}
911		if (!first)
912		    total_test--;
913		if (message_level > 1) {
914		    printf("\n");
915		    fflush(stdout);
916		}
917		free(data);
918	    } else if (opt_convert) {
919		unsigned c_out;
920		Char *result;
921
922		memset(buffer, 0, sizeof(buffer));
923		if ((result = next = convertToUTF8(buffer, c_in)) == 0) {
924		    fprintf(stderr,
925			    "conversion of U+%04X to UTF-8 failed\n", c_in);
926		} else if ((result = convertFromUTF8(buffer, &c_out)) == 0) {
927		    fprintf(stderr,
928			    "conversion of U+%04X from UTF-8 failed\n", c_in);
929		    total_errs++;
930		} else if (c_in != c_out) {
931		    fprintf(stderr,
932			    "conversion of U+%04X to/from UTF-8 gave U+%04X\n",
933			    c_in, c_out);
934		} else if (message_level > 1) {
935		    *next = '\0';
936		    printf("TEST %04X (%lu:%s) ->%04X\n", c_in,
937			   (unsigned long) (next - buffer),
938			   buffer,
939			   c_out);
940		    fflush(stdout);
941		}
942	    } else {
943		initPtyData(&data);
944		next = convertToUTF8(data->buffer, c_in);
945		*next = 0;
946		data->next = data->buffer;
947		data->last = next;
948		decodeUtf8(&screen, data);
949		if (message_level > 1) {
950		    printf("TEST %04X (%lu:%s) ->%04X\n", c_in,
951			   (unsigned long) (next - data->buffer),
952			   data->buffer,
953			   data->utf_data);
954		    fflush(stdout);
955		}
956		if (c_in != data->utf_data) {
957		    fprintf(stderr, "Mismatch: %04X vs %04X\n", c_in, data->utf_data);
958		    total_errs++;
959		}
960		free(data);
961	    }
962	    total_test++;
963	}
964    }
965}
966
967int
968main(int argc, char **argv)
969{
970    int ch;
971
972    setlocale(LC_ALL, "");
973    while ((ch = getopt(argc, argv, "aciqrv")) != -1) {
974	switch (ch) {
975	case 'a':
976	    opt_all = 1;
977	    break;
978	case 'c':
979	    opt_convert = 1;
980	    break;
981	case 'i':
982	    opt_illegal = 1;
983	    break;
984	case 'q':
985	    message_level--;
986	    break;
987	case 'r':
988	    opt_reverse = 1;
989	    break;
990	case 'v':
991	    message_level++;
992	    break;
993	default:
994	    usage();
995	}
996    }
997    if (opt_all) {
998	test_utf8_convert();
999    } else {
1000	if (optind >= argc)
1001	    usage();
1002	while (optind < argc) {
1003	    do_range(argv[optind++]);
1004	}
1005	if (total_test) {
1006	    printf("%ld/%ld mismatches (%.0f%%)\n",
1007		   total_errs,
1008		   total_test,
1009		   (100.0 * (double) total_errs) / (double) total_test);
1010	}
1011    }
1012    return EXIT_SUCCESS;
1013}
1014#else
/* Without wide-character support there is nothing to test. */
int
main(int argc, char **argv)
{
    (void) argv;
    (void) argc;
    fputs("Nothing to be done here...\n", stdout);
    return EXIT_SUCCESS;
}
1022}
1023#endif /* OPT_WIDE_CHARS */
1024#endif
1025