ptydata.c revision 5104ee6e
1/* $XTermId: ptydata.c,v 1.163 2024/12/01 23:48:07 tom Exp $ */
2
3/*
4 * Copyright 1999-2023,2024 by Thomas E. Dickey
5 *
6 *                         All Rights Reserved
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a
9 * copy of this software and associated documentation files (the
10 * "Software"), to deal in the Software without restriction, including
11 * without limitation the rights to use, copy, modify, merge, publish,
12 * distribute, sublicense, and/or sell copies of the Software, and to
13 * permit persons to whom the Software is furnished to do so, subject to
14 * the following conditions:
15 *
16 * The above copyright notice and this permission notice shall be included
17 * in all copies or substantial portions of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
22 * IN NO EVENT SHALL THE ABOVE LISTED COPYRIGHT HOLDER(S) BE LIABLE FOR ANY
23 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 *
27 * Except as contained in this notice, the name(s) of the above copyright
28 * holders shall not be used in advertising or otherwise to promote the
29 * sale, use or other dealings in this Software without prior written
30 * authorization.
31 */
32
33#include <data.h>
34
35#if OPT_WIDE_CHARS
36#include <menu.h>
37#include <wcwidth.h>
38#endif
39
#ifdef TEST_DRIVER
/*
 * The test driver replaces the tracing macros with plain printf calls.  Each
 * expansion is wrapped in do/while(0) so that it behaves as a single
 * statement; a bare "if" would capture the "else" of an enclosing if/else.
 * TRACE2 output is compiled (so its arguments are checked) but never printed.
 */
#undef TRACE
#define TRACE(p) do { if (1) printf p; } while (0)
#undef TRACE2
#define TRACE2(p) do { if (0) printf p; } while (0)
/* the driver has no visibleChars(); substitute a fixed string for traces */
#define visibleChars(buf, len) "buffer"
#endif
47
/*
 * E_TEST(err) is true when 'err' says that the operation would block.
 *
 * Some systems which claim POSIX conformance report EWOULDBLOCK where EAGAIN
 * is expected, so accept whichever of the two codes this platform defines.
 * The macro may evaluate its argument more than once.
 */
#if defined(EAGAIN) && defined(EWOULDBLOCK)
#define E_TEST(err) ((err) == EAGAIN || (err) == EWOULDBLOCK)
#elif defined(EAGAIN)
#define E_TEST(err) ((err) == EAGAIN)
#else
#define E_TEST(err) ((err) == EWOULDBLOCK)
#endif
62
63#if OPT_WIDE_CHARS
64/*
65 * Convert the 8-bit codes in data->buffer[] into Unicode in data->utf_data.
66 * The number of bytes converted will be nonzero iff there is data.
67 */
68Bool
69decodeUtf8(TScreen *screen, PtyData *data)
70{
71    size_t i;
72    size_t length = (size_t) (data->last - data->next);
73    int utf_count = 0;
74    unsigned utf_char = 0;
75
76    data->utf_size = 0;
77    for (i = 0; i < length; i++) {
78	unsigned c = data->next[i];
79
80	/* Combine UTF-8 into Unicode */
81	if (c < 0x80) {
82	    /* We received an ASCII character */
83	    if (utf_count > 0) {
84		data->utf_data = UCS_REPL;	/* prev. sequence incomplete */
85		data->utf_size = i;
86	    } else {
87		data->utf_data = (IChar) c;
88		data->utf_size = 1;
89	    }
90	    break;
91	} else if (screen->vt100_graphics
92		   && (c < 0x100)
93		   && (utf_count == 0)
94		   && screen->gsets[(int) screen->curgr] != nrc_ASCII) {
95	    data->utf_data = (IChar) c;
96	    data->utf_size = 1;
97	    break;
98	} else if (c < 0xc0) {
99	    /* We received a continuation byte */
100	    if (utf_count < 1) {
101		if (screen->c1_printable) {
102		    data->utf_data = (IChar) c;
103		} else if ((i + 1) < length
104			   && data->next[i + 1] > 0x20
105			   && data->next[i + 1] < 0x80) {
106		    /*
107		     * Allow for C1 control string if the next byte is
108		     * available for inspection.
109		     */
110		    data->utf_data = (IChar) c;
111		} else {
112		    /*
113		     * We received a continuation byte before receiving a
114		     * sequence state, or a failed attempt to use a C1 control
115		     * string.
116		     */
117		    data->utf_data = (IChar) UCS_REPL;
118		}
119		data->utf_size = (i + 1);
120		break;
121	    } else if (screen->utf8_weblike
122		       && (utf_count == 3
123			   && utf_char == 0x04
124			   && c >= 0x90)) {
125		/* The encoding would form a code point beyond U+10FFFF. */
126		data->utf_size = i;
127		data->utf_data = UCS_REPL;
128		break;
129	    } else if (screen->utf8_weblike
130		       && (utf_count == 2
131			   && utf_char == 0x0d
132			   && c >= 0xa0)) {
133		/* The encoding would form a surrogate code point. */
134		data->utf_size = i;
135		data->utf_data = UCS_REPL;
136		break;
137	    } else {
138		/* Check for overlong UTF-8 sequences for which a shorter
139		 * encoding would exist and replace them with UCS_REPL.
140		 * An overlong UTF-8 sequence can have any of the following
141		 * forms:
142		 *   1100000x 10xxxxxx
143		 *   11100000 100xxxxx 10xxxxxx
144		 *   11110000 1000xxxx 10xxxxxx 10xxxxxx
145		 *   11111000 10000xxx 10xxxxxx 10xxxxxx 10xxxxxx
146		 *   11111100 100000xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
147		 */
148		if (!utf_char && !((c & 0x7f) >> (7 - utf_count))) {
149		    if (screen->utf8_weblike) {
150			/* overlong sequence continued */
151			data->utf_data = UCS_REPL;
152			data->utf_size = i;
153			break;
154		    } else {
155			utf_char = UCS_REPL;
156		    }
157		}
158		utf_char <<= 6;
159		utf_char |= (c & 0x3f);
160		if ((utf_char >= 0xd800 &&
161		     utf_char <= 0xdfff) ||
162		    (utf_char == 0xfffe) ||
163		    (utf_char == HIDDEN_CHAR)) {
164		    utf_char = UCS_REPL;
165		}
166		utf_count--;
167		if (utf_count == 0) {
168#if !OPT_WIDER_ICHAR
169		    /* characters outside UCS-2 become UCS_REPL */
170		    if (utf_char > NARROW_ICHAR) {
171			TRACE(("using replacement for %#x\n", utf_char));
172			utf_char = UCS_REPL;
173		    }
174#endif
175		    data->utf_data = (IChar) utf_char;
176		    data->utf_size = (i + 1);
177		    break;
178		}
179	    }
180	} else {
181	    /* We received a sequence start byte */
182	    if (utf_count > 0) {
183		/* previous sequence is incomplete */
184		data->utf_data = UCS_REPL;
185		data->utf_size = i;
186		break;
187	    }
188	    if (screen->utf8_weblike) {
189		if (c < 0xe0) {
190		    if (!(c & 0x1e)) {
191			/* overlong sequence start */
192			data->utf_data = UCS_REPL;
193			data->utf_size = (i + 1);
194			break;
195		    }
196		    utf_count = 1;
197		    utf_char = (c & 0x1f);
198		} else if (c < 0xf0) {
199		    utf_count = 2;
200		    utf_char = (c & 0x0f);
201		} else if (c < 0xf5) {
202		    utf_count = 3;
203		    utf_char = (c & 0x07);
204		} else {
205		    data->utf_data = UCS_REPL;
206		    data->utf_size = (i + 1);
207		    break;
208		}
209	    } else {
210		if (c < 0xe0) {
211		    utf_count = 1;
212		    utf_char = (c & 0x1f);
213		    if (!(c & 0x1e)) {
214			/* overlong sequence */
215			utf_char = UCS_REPL;
216		    }
217		} else if (c < 0xf0) {
218		    utf_count = 2;
219		    utf_char = (c & 0x0f);
220		} else if (c < 0xf8) {
221		    utf_count = 3;
222		    utf_char = (c & 0x07);
223		} else if (c < 0xfc) {
224		    utf_count = 4;
225		    utf_char = (c & 0x03);
226		} else if (c < 0xfe) {
227		    utf_count = 5;
228		    utf_char = (c & 0x01);
229		} else {
230		    data->utf_data = UCS_REPL;
231		    data->utf_size = (i + 1);
232		    break;
233		}
234	    }
235	}
236    }
237#if OPT_TRACE > 1
238    TRACE(("UTF-8 char %04X [%lu..%lu]\n",
239	   data->utf_data,
240	   (unsigned long) (data->next - data->buffer),
241	   (unsigned long) (data->next - data->buffer + data->utf_size - 1)));
242#endif
243
244    return (data->utf_size != 0);
245}
246#endif
247
248int
249readPtyData(XtermWidget xw, PtySelect * select_mask, PtyData *data)
250{
251    TScreen *screen = TScreenOf(xw);
252    int size = 0;
253
254    if (FD_ISSET(screen->respond, select_mask)) {
255	int save_err;
256	trimPtyData(xw, data);
257
258	size = (int) read(screen->respond, (char *) data->last, (size_t) FRG_SIZE);
259	save_err = errno;
260#if (defined(i386) && defined(SVR4) && defined(sun)) || defined(__CYGWIN__)
261	/*
262	 * Yes, I know this is a majorly f*ugly hack, however it seems to
263	 * be necessary for Solaris x86.  DWH 11/15/94
264	 * Dunno why though..
265	 * (and now CYGWIN, alanh@xfree86.org 08/15/01
266	 */
267	if (size <= 0) {
268	    if (save_err == EIO || save_err == 0)
269		NormalExit();
270	    else if (!E_TEST(save_err))
271		Panic("input: read returned unexpected error (%d)\n", save_err);
272	    size = 0;
273	}
274#else /* !f*ugly */
275	if (size < 0) {
276	    if (save_err == EIO)
277		NormalExit();
278	    else if (!E_TEST(save_err))
279		Panic("input: read returned unexpected error (%d)\n", save_err);
280	    size = 0;
281	} else if (size == 0) {
282#if defined(__FreeBSD__)
283	    NormalExit();
284#else
285	    Panic("input: read returned zero\n", 0);
286#endif
287	}
288#endif /* f*ugly */
289    }
290
291    if (size) {
292#if OPT_TRACE
293	int i;
294
295	TRACE(("read %d bytes from pty\n", size));
296	for (i = 0; i < size; i++) {
297	    if (!(i % 16))
298		TRACE(("%s", i ? "\n    " : "READ"));
299	    TRACE((" %02X", data->last[i]));
300	}
301	TRACE(("\n"));
302#endif
303	data->last += size;
304#ifdef ALLOWLOGGING
305	TScreenOf(term)->logstart = VTbuffer->next;
306#endif
307    }
308
309    return (size);
310}
311
/*
 * Fetch the next value from the input buffer and step past it.
 *
 * morePtyData() is always called before this function, so the UTF-8 input
 * conversion is done there; in UTF-8 mode this function only has to pick up
 * that result.  Otherwise it takes the next byte, stripping the high bit
 * unless eight-bit output is enabled.
 */
#if OPT_WIDE_CHARS
IChar
nextPtyData(TScreen *screen, PtyData *data)
{
    IChar ch;

    if (!screen->utf8_inparse) {
	ch = *((data)->next++);
	if (!screen->output_eight_bits) {
	    ch = (IChar) (ch & 0x7f);
	}
    } else {
	skipPtyData(data, ch);
    }
    TRACE2(("nextPtyData returns %#x\n", ch));
    return ch;
}
#endif
334
335#if OPT_WIDE_CHARS
336/*
337 * Called when UTF-8 mode has been turned on/off.
338 */
339void
340switchPtyData(TScreen *screen, int flag)
341{
342    if (screen->utf8_mode != flag) {
343	screen->utf8_mode = flag;
344	screen->utf8_inparse = (Boolean) (flag != 0);
345	mk_wcwidth_init(screen->utf8_mode);
346
347	TRACE(("turning UTF-8 mode %s\n", BtoS(flag)));
348	update_font_utf8_mode();
349    }
350}
351#endif
352
353/*
354 * Allocate a buffer.
355 */
356void
357initPtyData(PtyData **result)
358{
359    PtyData *data;
360
361    TRACE2(("initPtyData given minBufSize %d, maxBufSize %d\n",
362	    FRG_SIZE, BUF_SIZE));
363
364    if (FRG_SIZE < 64)
365	FRG_SIZE = 64;
366    if (BUF_SIZE < FRG_SIZE)
367	BUF_SIZE = FRG_SIZE;
368    if (BUF_SIZE % FRG_SIZE)
369	BUF_SIZE = BUF_SIZE + FRG_SIZE - (BUF_SIZE % FRG_SIZE);
370
371    TRACE2(("initPtyData using minBufSize %d, maxBufSize %d\n",
372	    FRG_SIZE, BUF_SIZE));
373
374    data = TypeXtMallocX(PtyData, (BUF_SIZE + FRG_SIZE));
375
376    memset(data, 0, sizeof(*data));
377    data->next = data->buffer;
378    data->last = data->buffer;
379    *result = data;
380}
381
/*
 * Set up the caller's PtyData 'result' to describe the caller's own bytes,
 * from 'next' up to (not including) 'last'.  Everything else in the
 * structure is cleared.  Returns 'result'.
 */
#if OPT_WIDE_CHARS
PtyData *
fakePtyData(PtyData *result, Char *next, Char *last)
{
    memset(result, 0, sizeof(*result));
    result->next = next;
    result->last = last;

    return result;
}
#endif
398
399/*
400 * Remove used data by shifting the buffer down, to make room for more data,
401 * e.g., a continuation-read.
402 */
403void
404trimPtyData(XtermWidget xw, PtyData *data)
405{
406    (void) xw;
407    FlushLog(xw);
408
409    if (data->next != data->buffer) {
410	size_t i;
411	size_t n = (size_t) (data->last - data->next);
412
413	TRACE(("shifting buffer down by %lu\n", (unsigned long) n));
414	for (i = 0; i < n; ++i) {
415	    data->buffer[i] = data->next[i];
416	}
417	data->next = data->buffer;
418	data->last = data->next + n;
419    }
420
421}
422
423/*
424 * Insert new data into the input buffer so the next calls to morePtyData()
425 * and nextPtyData() will return that.
426 */
427void
428fillPtyData(XtermWidget xw, PtyData *data, const char *value, size_t length)
429{
430    size_t size;
431    size_t n;
432
433    /* remove the used portion of the buffer */
434    trimPtyData(xw, data);
435
436    VTbuffer->last += length;
437    size = (size_t) (VTbuffer->last - VTbuffer->next);
438
439    /* shift the unused portion up to make room */
440    for (n = size; n >= length; --n)
441	VTbuffer->next[n] = VTbuffer->next[n - length];
442
443    /* insert the new bytes to interpret */
444    for (n = 0; n < length; n++)
445	VTbuffer->next[n] = CharOf(value[n]);
446}
447
448#if OPT_WIDE_CHARS
449/*
450 * Convert an ISO-8859-1 code 'c' to UTF-8, storing the result in the target
451 * 'lp', and returning a pointer past the converted character.
452 */
453Char *
454convertToUTF8(Char *lp, unsigned c)
455{
456#define CH(n) (Char)((c) >> ((n) * 8))
457    if (c < 0x80) {
458	/*  0*******  */
459	*lp++ = (Char) CH(0);
460    } else if (c < 0x800) {
461	/*  110***** 10******  */
462	*lp++ = (Char) (0xc0 | (CH(0) >> 6) | ((CH(1) & 0x07) << 2));
463	*lp++ = (Char) (0x80 | (CH(0) & 0x3f));
464    } else if (c < 0x00010000) {
465	/*  1110**** 10****** 10******  */
466	*lp++ = (Char) (0xe0 | ((int) (CH(1) & 0xf0) >> 4));
467	*lp++ = (Char) (0x80 | (CH(0) >> 6) | ((CH(1) & 0x0f) << 2));
468	*lp++ = (Char) (0x80 | (CH(0) & 0x3f));
469    } else if (c < 0x00200000) {
470	*lp++ = (Char) (0xf0 | ((int) (CH(2) & 0x1f) >> 2));
471	*lp++ = (Char) (0x80 |
472			((int) (CH(1) & 0xf0) >> 4) |
473			((int) (CH(2) & 0x03) << 4));
474	*lp++ = (Char) (0x80 | (CH(0) >> 6) | ((CH(1) & 0x0f) << 2));
475	*lp++ = (Char) (0x80 | (CH(0) & 0x3f));
476    } else if (c < 0x04000000) {
477	*lp++ = (Char) (0xf8 | (CH(3) & 0x03));
478	*lp++ = (Char) (0x80 | (CH(2) >> 2));
479	*lp++ = (Char) (0x80 |
480			((int) (CH(1) & 0xf0) >> 4) |
481			((int) (CH(2) & 0x03) << 4));
482	*lp++ = (Char) (0x80 | (CH(0) >> 6) | ((CH(1) & 0x0f) << 2));
483	*lp++ = (Char) (0x80 | (CH(0) & 0x3f));
484    } else {
485	*lp++ = (Char) (0xfc | ((int) (CH(3) & 0x40) >> 6));
486	*lp++ = (Char) (0x80 | (CH(3) & 0x3f));
487	*lp++ = (Char) (0x80 | (CH(2) >> 2));
488	*lp++ = (Char) (0x80 | (CH(1) >> 4) | ((CH(2) & 0x03) << 4));
489	*lp++ = (Char) (0x80 | (CH(0) >> 6) | ((CH(1) & 0x0f) << 2));
490	*lp++ = (Char) (0x80 | (CH(0) & 0x3f));
491    }
492    return lp;
493#undef CH
494}
495
496/*
497 * Convert a UTF-8 multibyte character to an Unicode value, returning a pointer
498 * past the converted UTF-8 input.  The first 256 values align with ISO-8859-1,
499 * making it possible to use this to convert to Latin-1.
500 *
501 * If the conversion fails, return null.
502 */
503Char *
504convertFromUTF8(Char *lp, unsigned *cp)
505{
506    int want;
507
508    /*
509     * Find the number of bytes we will need from the source.
510     */
511    if ((*lp & 0x80) == 0) {
512	want = 1;
513    } else if ((*lp & 0xe0) == 0xc0) {
514	want = 2;
515    } else if ((*lp & 0xf0) == 0xe0) {
516	want = 3;
517    } else if ((*lp & 0xf8) == 0xf0) {
518	want = 4;
519    } else if ((*lp & 0xfc) == 0xf8) {
520	want = 5;
521    } else if ((*lp & 0xfe) == 0xfc) {
522	want = 6;
523    } else {
524	want = 0;
525    }
526
527    if (want) {
528	int have = 1;
529
530	while (lp[have] != '\0') {
531	    if ((lp[have] & 0xc0) != 0x80)
532		break;
533	    ++have;
534	}
535	if (want == have) {
536	    unsigned mask = 0;
537	    int j;
538	    int shift = 0;
539
540	    *cp = 0;
541	    switch (want) {
542	    case 1:
543		mask = (*lp);
544		break;
545	    case 2:
546		mask = (*lp & 0x1f);
547		break;
548	    case 3:
549		mask = (*lp & 0x0f);
550		break;
551	    case 4:
552		mask = (*lp & 0x07);
553		break;
554	    case 5:
555		mask = (*lp & 0x03);
556		break;
557	    case 6:
558		mask = (*lp & 0x01);
559		break;
560	    default:
561		mask = 0;
562		break;
563	    }
564
565	    for (j = 1; j < want; j++) {
566		*cp |= (unsigned) ((lp[want - j] & 0x3f) << shift);
567		shift += 6;
568	    }
569	    *cp |= mask << shift;
570	    lp += want;
571	} else {
572	    *cp = BAD_ASCII;
573	    lp = NULL;
574	}
575    } else {
576	*cp = BAD_ASCII;
577	lp = NULL;
578    }
579    return lp;
580}
581
582/*
583 * Returns true if the entire string is valid UTF-8.
584 */
585Boolean
586isValidUTF8(Char *lp)
587{
588    Boolean result = True;
589    while (*lp) {
590	unsigned ch;
591	Char *next = convertFromUTF8(lp, &ch);
592	if (next == NULL || ch == 0) {
593	    result = False;
594	    break;
595	}
596	lp = next;
597    }
598    return result;
599}
600
601/*
602 * Write data back to the PTY
603 */
604void
605writePtyData(int f, IChar *d, size_t len)
606{
607    size_t n = (len << 1);
608
609    if (VTbuffer->write_len <= len) {
610	VTbuffer->write_len = n;
611	VTbuffer->write_buf = realloc(VTbuffer->write_buf, VTbuffer->write_len);
612    }
613
614    for (n = 0; n < len; n++)
615	VTbuffer->write_buf[n] = (Char) d[n];
616
617    TRACE(("writePtyData %lu:%s\n", (unsigned long) n,
618	   visibleChars(VTbuffer->write_buf, n)));
619    v_write(f, VTbuffer->write_buf, n);
620}
621#endif /* OPT_WIDE_CHARS */
622
#ifdef NO_LEAKS
/*
 * Release the memory held by VTbuffer (and its scratch buffer for writes, if
 * wide-characters are configured), for leak-checking.
 */
void
noleaks_ptydata(void)
{
    if (VTbuffer == NULL)
	return;

#if OPT_WIDE_CHARS
    free(VTbuffer->write_buf);
#endif
    FreeAndNull(VTbuffer);
}
#endif
635
636#ifdef TEST_DRIVER
637
638#include "data.c"
639
/*
 * Test-driver replacement for NormalExit(): report the call on the standard
 * error, and exit successfully.
 */
void
NormalExit(void)
{
    fputs("NormalExit!\n", stderr);
    exit(EXIT_SUCCESS);
}
646
/*
 * Test-driver replacement for Panic(): the message and its parameter are
 * ignored; report the call on the standard error, and exit with a failure.
 */
void
Panic(const char *s, int a)
{
    (void) a;
    (void) s;
    fputs("Panic!\n", stderr);
    exit(EXIT_FAILURE);
}
655
656#if OPT_WIDE_CHARS
657
#ifdef ALLOWLOGGING
/*
 * Test-driver stub: there is no log to flush.
 */
void
FlushLog(XtermWidget xw)
{
    (void) xw;			/* unused */
}
#endif
665
666void
667v_write(int f, const Char *data, size_t len)
668{
669    (void) f;
670    (void) data;
671    (void) len;
672}
673
/*
 * Test-driver stub: nothing is initialized here.
 */
void
mk_wcwidth_init(int mode)
{
    (void) mode;		/* unused */
}
679
/*
 * Test-driver stub: there are no fonts to update.
 */
void
update_font_utf8_mode(void)
{
    /* nothing to do */
}
684
/*
 * Command-line options and counters for the test driver.  Objects with
 * static storage start out as zero.
 */
static int message_level;	/* raised by -v, lowered by -q */
static int opt_all;		/* -a */
static int opt_illegal;		/* -i */
static int opt_convert;		/* -c */
static int opt_reverse;		/* -r */
static long total_test;		/* number of tests counted */
static long total_errs;		/* number of those counted as errors */
692
/*
 * Print a summary of the test driver's options on the standard error, and
 * exit with a failure status.
 */
static void
usage(void)
{
    static const char *const msg[] =
    {
	/* every range has the same form: a code, optionally "-" and a second */
	"Usage: test_ptydata [options] [c1[-c1b] [c2[-c2b] [...]]]",
	"",
	"Options:",
	" -a  exercise all legal encode/decode to/from UTF-8",
	" -c  call convertFromUTF8 rather than decodeUtf8",
	" -i  ignore illegal UTF-8 when testing -r option",
	" -q  quieter",
	" -r  reverse/decode from UTF-8 byte-string to/from Unicode",
	" -v  more verbose"
    };
    size_t n;

    for (n = 0; n < sizeof(msg) / sizeof(msg[0]); ++n) {
	fprintf(stderr, "%s\n", msg[n]);
    }
    exit(EXIT_FAILURE);
}
714
715/*
716 * http://www.unicode.org/versions/corrigendum1.html, table 3.1B
717 */
718#define OkRange(n,lo,hi) \
719 	if (value[n] < lo || value[n] > hi) { \
720	    result = False; \
721	    break; \
722	}
723static Bool
724is_legal_utf8(const Char *value)
725{
726    Bool result = True;
727    Char ch;
728    while ((ch = *value) != '\0') {
729	if (ch <= 0x7f) {
730	    ++value;
731	} else if (ch >= 0xc2 && ch <= 0xdf) {
732	    OkRange(1, 0x80, 0xbf);
733	    value += 2;
734	} else if (ch == 0xe0) {
735	    OkRange(1, 0xa0, 0xbf);
736	    OkRange(2, 0x80, 0xbf);
737	    value += 3;
738	} else if (ch >= 0xe1 && ch <= 0xef) {
739	    OkRange(1, 0x80, 0xbf);
740	    OkRange(2, 0x80, 0xbf);
741	    value += 3;
742	} else if (ch == 0xf0) {
743	    OkRange(1, 0x90, 0xbf);
744	    OkRange(2, 0x80, 0xbf);
745	    OkRange(3, 0x80, 0xbf);
746	    value += 4;
747	} else if (ch >= 0xf1 && ch <= 0xf3) {
748	    OkRange(1, 0x80, 0xbf);
749	    OkRange(2, 0x80, 0xbf);
750	    OkRange(3, 0x80, 0xbf);
751	    value += 4;
752	} else if (ch == 0xf4) {
753	    OkRange(1, 0x80, 0x8f);
754	    OkRange(2, 0x80, 0xbf);
755	    OkRange(3, 0x80, 0xbf);
756	    value += 4;
757	} else {
758	    result = False;
759	    break;
760	}
761    }
762    return result;
763}
764
765static void
766test_utf8_convert(void)
767{
768    unsigned c_in, c_out;
769    Char buffer[10];
770    Char *result;
771    unsigned limit = 0x110000;
772    unsigned success = 0;
773    unsigned bucket[256];
774
775    memset(bucket, 0, sizeof(bucket));
776    for (c_in = 0; c_in < limit; ++c_in) {
777	memset(buffer, 0, sizeof(buffer));
778	if ((result = convertToUTF8(buffer, c_in)) == NULL) {
779	    TRACE(("conversion of U+%04X to UTF-8 failed\n", c_in));
780	} else {
781	    if ((result = convertFromUTF8(buffer, &c_out)) == NULL) {
782		TRACE(("conversion of U+%04X from UTF-8 failed\n", c_in));
783	    } else if (c_in != c_out) {
784		TRACE(("conversion of U+%04X to/from UTF-8 gave U+%04X\n",
785		       c_in, c_out));
786	    } else {
787		while (result-- != buffer) {
788		    bucket[*result]++;
789		}
790		++success;
791	    }
792	}
793    }
794    TRACE(("%u/%u successful\n", success, limit));
795    for (c_in = 0; c_in < 256; ++c_in) {
796	if ((c_in % 8) == 0) {
797	    TRACE((" %02X:", c_in));
798	}
799	TRACE((" %8X", bucket[c_in]));
800	if (((c_in + 1) % 8) == 0) {
801	    TRACE(("\n"));
802	}
803    }
804}
805
/*
 * Decode one number from the start of 'source', setting '*target' to the
 * text which follows it.
 *
 * A "U+" (or "u+") prefix selects hexadecimal, and "0b" selects binary.
 * Otherwise strtol chooses the radix from the number's own prefix.
 *
 * Returns the number, or -1 if no digits were found or if the value does not
 * fit in an int.  Negative input is returned as-is; callers treat any
 * negative result as a failure.
 */
static int
decode_one(const char *source, char **target)
{
    int result = -1;
    long check;
    int radix = 0;

    if ((source[0] == 'u' || source[0] == 'U') && source[1] == '+') {
	source += 2;
	radix = 16;
    } else if (source[0] == '0' && source[1] == 'b') {
	source += 2;
	radix = 2;
    }

    errno = 0;
    check = strtol(source, target, radix);
    if (*target != NULL
	&& *target != source
	&& errno != ERANGE
	&& check == (long) (int) check) {
	/* only accept values which survive the narrowing to int */
	result = (int) check;
    }
    return result;
}
824
/*
 * Decode a range of codes from 'source': one number, optionally followed by
 * separators (any of ":-." or whitespace) and a second number.  The first
 * number is stored in '*lo' and the second in '*hi'; without a usable second
 * number, '*hi' is set to '*lo'.
 *
 * Returns 1 on success, or 0 if the first number cannot be decoded, in which
 * case '*hi' is not modified.
 */
static int
decode_range(const char *source, int *lo, int *hi)
{
    char *rest;
    char *ignored;

    *lo = decode_one(source, &rest);
    if (*lo < 0)
	return 0;

    rest += strspn(rest, ":-.\t ");
    *hi = decode_one(rest, &ignored);
    if (*hi < 0)
	*hi = *lo;

    return 1;
}
840
841#define MAX_BYTES 6
842
843static void
844do_range(const char *source)
845{
846    int lo, hi;
847
848    TScreen screen;
849    memset(&screen, 0, sizeof(screen));
850
851    if (decode_range(source, &lo, &hi)) {
852	while (lo <= hi) {
853	    unsigned c_in = (unsigned) lo++;
854	    PtyData *data;
855	    Char *next;
856	    Char buffer[MAX_BYTES + 1];
857
858	    if (opt_reverse) {
859		Bool skip = False;
860		Bool first = True;
861		int j, k;
862		for (j = 0; j < MAX_BYTES; ++j) {
863		    unsigned long bits = ((unsigned long) c_in >> (8 * j));
864		    if ((buffer[j] = (Char) bits) == 0) {
865			skip = (bits != 0);
866			break;
867		    }
868		}
869		if (skip)
870		    continue;
871		initPtyData(&data);
872		for (k = 0; k <= j; ++k) {
873		    data->buffer[k] = buffer[j - k - 1];
874		}
875		if (opt_illegal && !is_legal_utf8(data->buffer)) {
876		    free(data);
877		    continue;
878		}
879		if (message_level > 1) {
880		    printf("TEST ");
881		    for (k = 0; k < j; ++k) {
882			printf("%02X", data->buffer[k]);
883		    }
884		}
885		data->next = data->buffer;
886		data->last = data->buffer + j;
887		while (decodeUtf8(&screen, data)) {
888		    total_test++;
889		    if (is_UCS_SPECIAL(data->utf_data))
890			total_errs++;
891		    data->next += data->utf_size;
892		    if (message_level > 1) {
893			printf("%s%04X", first ? " ->" : ", ", data->utf_data);
894		    }
895		    first = False;
896		}
897		if (!first)
898		    total_test--;
899		if (message_level > 1) {
900		    printf("\n");
901		    fflush(stdout);
902		}
903		free(data);
904	    } else if (opt_convert) {
905		unsigned c_out;
906		Char *result;
907
908		memset(buffer, 0, sizeof(buffer));
909		if ((result = next = convertToUTF8(buffer, c_in)) == NULL) {
910		    fprintf(stderr,
911			    "conversion of U+%04X to UTF-8 failed\n", c_in);
912		} else if ((result = convertFromUTF8(buffer, &c_out)) == NULL) {
913		    fprintf(stderr,
914			    "conversion of U+%04X from UTF-8 failed\n", c_in);
915		    total_errs++;
916		} else if (c_in != c_out) {
917		    fprintf(stderr,
918			    "conversion of U+%04X to/from UTF-8 gave U+%04X\n",
919			    c_in, c_out);
920		} else if (message_level > 1) {
921		    *next = '\0';
922		    printf("TEST %04X (%lu:%s) ->%04X\n", c_in,
923			   (unsigned long) (next - buffer),
924			   buffer,
925			   c_out);
926		    fflush(stdout);
927		}
928	    } else {
929		initPtyData(&data);
930		next = convertToUTF8(data->buffer, c_in);
931		*next = 0;
932		data->next = data->buffer;
933		data->last = next;
934		decodeUtf8(&screen, data);
935		if (message_level > 1) {
936		    printf("TEST %04X (%lu:%s) ->%04X\n", c_in,
937			   (unsigned long) (next - data->buffer),
938			   data->buffer,
939			   data->utf_data);
940		    fflush(stdout);
941		}
942		if (c_in != data->utf_data) {
943		    fprintf(stderr, "Mismatch: %04X vs %04X\n", c_in, data->utf_data);
944		    total_errs++;
945		}
946		free(data);
947	    }
948	    total_test++;
949	}
950    }
951}
952
953int
954main(int argc, char **argv)
955{
956    int ch;
957
958    setlocale(LC_ALL, "");
959    while ((ch = getopt(argc, argv, "aciqrv")) != -1) {
960	switch (ch) {
961	case 'a':
962	    opt_all = 1;
963	    break;
964	case 'c':
965	    opt_convert = 1;
966	    break;
967	case 'i':
968	    opt_illegal = 1;
969	    break;
970	case 'q':
971	    message_level--;
972	    break;
973	case 'r':
974	    opt_reverse = 1;
975	    break;
976	case 'v':
977	    message_level++;
978	    break;
979	default:
980	    usage();
981	}
982    }
983    if (opt_all) {
984	test_utf8_convert();
985    } else {
986	if (optind >= argc)
987	    usage();
988	while (optind < argc) {
989	    do_range(argv[optind++]);
990	}
991	if (total_test) {
992	    printf("%ld/%ld mismatches (%.0f%%)\n",
993		   total_errs,
994		   total_test,
995		   (100.0 * (double) total_errs) / (double) total_test);
996	}
997    }
998    return EXIT_SUCCESS;
999}
1000#else
/*
 * Without wide-characters there is nothing to test; just say so.
 */
int
main(int argc, char **argv)
{
    (void) argv;
    (void) argc;
    puts("Nothing to be done here...");
    return EXIT_SUCCESS;
}
1008}
1009#endif /* OPT_WIDE_CHARS */
1010#endif
1011