ptydata.c revision 04b94745
1/* $XTermId: ptydata.c,v 1.160 2024/05/10 22:54:17 tom Exp $ */ 2 3/* 4 * Copyright 1999-2023,2024 by Thomas E. Dickey 5 * 6 * All Rights Reserved 7 * 8 * Permission is hereby granted, free of charge, to any person obtaining a 9 * copy of this software and associated documentation files (the 10 * "Software"), to deal in the Software without restriction, including 11 * without limitation the rights to use, copy, modify, merge, publish, 12 * distribute, sublicense, and/or sell copies of the Software, and to 13 * permit persons to whom the Software is furnished to do so, subject to 14 * the following conditions: 15 * 16 * The above copyright notice and this permission notice shall be included 17 * in all copies or substantial portions of the Software. 18 * 19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 22 * IN NO EVENT SHALL THE ABOVE LISTED COPYRIGHT HOLDER(S) BE LIABLE FOR ANY 23 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 26 * 27 * Except as contained in this notice, the name(s) of the above copyright 28 * holders shall not be used in advertising or otherwise to promote the 29 * sale, use or other dealings in this Software without prior written 30 * authorization. 31 */ 32 33#include <data.h> 34 35#if OPT_WIDE_CHARS 36#include <menu.h> 37#include <wcwidth.h> 38#endif 39 40#ifdef TEST_DRIVER 41#undef TRACE 42#define TRACE(p) if (1) printf p 43#undef TRACE2 44#define TRACE2(p) if (0) printf p 45#define visibleChars(buf, len) "buffer" 46#endif 47 48/* 49 * Check for both EAGAIN and EWOULDBLOCK, because some supposedly POSIX 50 * systems are broken and return EWOULDBLOCK when they should return EAGAIN. 51 * Note that this macro may evaluate its argument more than once. 52 */ 53#if defined(EAGAIN) && defined(EWOULDBLOCK) 54#define E_TEST(err) ((err) == EAGAIN || (err) == EWOULDBLOCK) 55#else 56#ifdef EAGAIN 57#define E_TEST(err) ((err) == EAGAIN) 58#else 59#define E_TEST(err) ((err) == EWOULDBLOCK) 60#endif 61#endif 62 63#if OPT_WIDE_CHARS 64/* 65 * Convert the 8-bit codes in data->buffer[] into Unicode in data->utf_data. 66 * The number of bytes converted will be nonzero iff there is data. 67 */ 68Bool 69decodeUtf8(TScreen *screen, PtyData *data) 70{ 71 size_t i; 72 size_t length = (size_t) (data->last - data->next); 73 int utf_count = 0; 74 unsigned utf_char = 0; 75 76 data->utf_size = 0; 77 for (i = 0; i < length; i++) { 78 unsigned c = data->next[i]; 79 80 /* Combine UTF-8 into Unicode */ 81 if (c < 0x80) { 82 /* We received an ASCII character */ 83 if (utf_count > 0) { 84 data->utf_data = UCS_REPL; /* prev. sequence incomplete */ 85 data->utf_size = i; 86 } else { 87 data->utf_data = (IChar) c; 88 data->utf_size = 1; 89 } 90 break; 91 } else if (screen->vt100_graphics 92 && (c < 0x100) 93 && (utf_count == 0) 94 && screen->gsets[(int) screen->curgr] != nrc_ASCII) { 95 data->utf_data = (IChar) c; 96 data->utf_size = 1; 97 break; 98 } else if (c < 0xc0) { 99 /* We received a continuation byte */ 100 if (utf_count < 1) { 101 if (screen->c1_printable) { 102 data->utf_data = (IChar) c; 103 } else if ((i + 1) < length 104 && data->next[i + 1] > 0x20 105 && data->next[i + 1] < 0x80) { 106 /* 107 * Allow for C1 control string if the next byte is 108 * available for inspection. 109 */ 110 data->utf_data = (IChar) c; 111 } else { 112 /* 113 * We received a continuation byte before receiving a 114 * sequence state, or a failed attempt to use a C1 control 115 * string. 116 */ 117 data->utf_data = (IChar) UCS_REPL; 118 } 119 data->utf_size = (i + 1); 120 break; 121 } else if (screen->utf8_weblike 122 && (utf_count == 3 123 && utf_char == 0x04 124 && c >= 0x90)) { 125 /* The encoding would form a code point beyond U+10FFFF. */ 126 data->utf_size = i; 127 data->utf_data = UCS_REPL; 128 break; 129 } else if (screen->utf8_weblike 130 && (utf_count == 2 131 && utf_char == 0x0d 132 && c >= 0xa0)) { 133 /* The encoding would form a surrogate code point. */ 134 data->utf_size = i; 135 data->utf_data = UCS_REPL; 136 break; 137 } else { 138 /* Check for overlong UTF-8 sequences for which a shorter 139 * encoding would exist and replace them with UCS_REPL. 140 * An overlong UTF-8 sequence can have any of the following 141 * forms: 142 * 1100000x 10xxxxxx 143 * 11100000 100xxxxx 10xxxxxx 144 * 11110000 1000xxxx 10xxxxxx 10xxxxxx 145 * 11111000 10000xxx 10xxxxxx 10xxxxxx 10xxxxxx 146 * 11111100 100000xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 147 */ 148 if (!utf_char && !((c & 0x7f) >> (7 - utf_count))) { 149 if (screen->utf8_weblike) { 150 /* overlong sequence continued */ 151 data->utf_data = UCS_REPL; 152 data->utf_size = i; 153 break; 154 } else { 155 utf_char = UCS_REPL; 156 } 157 } 158 utf_char <<= 6; 159 utf_char |= (c & 0x3f); 160 if ((utf_char >= 0xd800 && 161 utf_char <= 0xdfff) || 162 (utf_char == 0xfffe) || 163 (utf_char == HIDDEN_CHAR)) { 164 utf_char = UCS_REPL; 165 } 166 utf_count--; 167 if (utf_count == 0) { 168#if !OPT_WIDER_ICHAR 169 /* characters outside UCS-2 become UCS_REPL */ 170 if (utf_char > NARROW_ICHAR) { 171 TRACE(("using replacement for %#x\n", utf_char)); 172 utf_char = UCS_REPL; 173 } 174#endif 175 data->utf_data = (IChar) utf_char; 176 data->utf_size = (i + 1); 177 break; 178 } 179 } 180 } else { 181 /* We received a sequence start byte */ 182 if (utf_count > 0) { 183 /* previous sequence is incomplete */ 184 data->utf_data = UCS_REPL; 185 data->utf_size = i; 186 break; 187 } 188 if (screen->utf8_weblike) { 189 if (c < 0xe0) { 190 if (!(c & 0x1e)) { 191 /* overlong sequence start */ 192 data->utf_data = UCS_REPL; 193 data->utf_size = (i + 1); 194 break; 195 } 196 utf_count = 1; 197 utf_char = (c & 0x1f); 198 } else if (c < 0xf0) { 199 utf_count = 2; 200 utf_char = (c & 0x0f); 201 } else if (c < 0xf5) { 202 utf_count = 3; 203 utf_char = (c & 0x07); 204 } else { 205 data->utf_data = UCS_REPL; 206 data->utf_size = (i + 1); 207 break; 208 } 209 } else { 210 if (c < 0xe0) { 211 utf_count = 1; 212 utf_char = (c & 0x1f); 213 if (!(c & 0x1e)) { 214 /* overlong sequence */ 215 utf_char = UCS_REPL; 216 } 217 } else if (c < 0xf0) { 218 utf_count = 2; 219 utf_char = (c & 0x0f); 220 } else if (c < 0xf8) { 221 utf_count = 3; 222 utf_char = (c & 0x07); 223 } else if (c < 0xfc) { 224 utf_count = 4; 225 utf_char = (c & 0x03); 226 } else if (c < 0xfe) { 227 utf_count = 5; 228 utf_char = (c & 0x01); 229 } else { 230 data->utf_data = UCS_REPL; 231 data->utf_size = (i + 1); 232 break; 233 } 234 } 235 } 236 } 237#if OPT_TRACE > 1 238 TRACE(("UTF-8 char %04X [%lu..%lu]\n", 239 data->utf_data, 240 (unsigned long) (data->next - data->buffer), 241 (unsigned long) (data->next - data->buffer + data->utf_size - 1))); 242#endif 243 244 return (data->utf_size != 0); 245} 246#endif 247 248int 249readPtyData(XtermWidget xw, PtySelect * select_mask, PtyData *data) 250{ 251 TScreen *screen = TScreenOf(xw); 252 int size = 0; 253 254#ifdef VMS 255 if (*select_mask & pty_mask) { 256 trimPtyData(xw, data); 257 if (read_queue.flink != 0) { 258 size = tt_read(data->next); 259 if (size == 0) { 260 Panic("input: read returned zero\n", 0); 261 } 262 } else { 263 sys$hiber(); 264 } 265 } 266#else /* !VMS */ 267 if (FD_ISSET(screen->respond, select_mask)) { 268 int save_err; 269 trimPtyData(xw, data); 270 271 size = (int) read(screen->respond, (char *) data->last, (size_t) FRG_SIZE); 272 save_err = errno; 273#if (defined(i386) && defined(SVR4) && defined(sun)) || defined(__CYGWIN__) 274 /* 275 * Yes, I know this is a majorly f*ugly hack, however it seems to 276 * be necessary for Solaris x86. DWH 11/15/94 277 * Dunno why though.. 278 * (and now CYGWIN, alanh@xfree86.org 08/15/01 279 */ 280 if (size <= 0) { 281 if (save_err == EIO || save_err == 0) 282 NormalExit(); 283 else if (!E_TEST(save_err)) 284 Panic("input: read returned unexpected error (%d)\n", save_err); 285 size = 0; 286 } 287#else /* !f*ugly */ 288 if (size < 0) { 289 if (save_err == EIO) 290 NormalExit(); 291 else if (!E_TEST(save_err)) 292 Panic("input: read returned unexpected error (%d)\n", save_err); 293 size = 0; 294 } else if (size == 0) { 295#if defined(__FreeBSD__) 296 NormalExit(); 297#else 298 Panic("input: read returned zero\n", 0); 299#endif 300 } 301#endif /* f*ugly */ 302 } 303#endif /* VMS */ 304 305 if (size) { 306#if OPT_TRACE 307 int i; 308 309 TRACE(("read %d bytes from pty\n", size)); 310 for (i = 0; i < size; i++) { 311 if (!(i % 16)) 312 TRACE(("%s", i ? "\n " : "READ")); 313 TRACE((" %02X", data->last[i])); 314 } 315 TRACE(("\n")); 316#endif 317 data->last += size; 318#ifdef ALLOWLOGGING 319 TScreenOf(term)->logstart = VTbuffer->next; 320#endif 321 } 322 323 return (size); 324} 325 326/* 327 * Return the next value from the input buffer. Note that morePtyData() is 328 * always called before this function, so we can do the UTF-8 input conversion 329 * in that function and simply return the result here. 330 */ 331#if OPT_WIDE_CHARS 332IChar 333nextPtyData(TScreen *screen, PtyData *data) 334{ 335 IChar result; 336 if (screen->utf8_inparse) { 337 skipPtyData(data, result); 338 } else { 339 result = *((data)->next++); 340 if (!screen->output_eight_bits) { 341 result = (IChar) (result & 0x7f); 342 } 343 } 344 TRACE2(("nextPtyData returns %#x\n", result)); 345 return result; 346} 347#endif 348 349#if OPT_WIDE_CHARS 350/* 351 * Called when UTF-8 mode has been turned on/off. 352 */ 353void 354switchPtyData(TScreen *screen, int flag) 355{ 356 if (screen->utf8_mode != flag) { 357 screen->utf8_mode = flag; 358 screen->utf8_inparse = (Boolean) (flag != 0); 359 mk_wcwidth_init(screen->utf8_mode); 360 361 TRACE(("turning UTF-8 mode %s\n", BtoS(flag))); 362 update_font_utf8_mode(); 363 } 364} 365#endif 366 367/* 368 * Allocate a buffer. 369 */ 370void 371initPtyData(PtyData **result) 372{ 373 PtyData *data; 374 375 TRACE2(("initPtyData given minBufSize %d, maxBufSize %d\n", 376 FRG_SIZE, BUF_SIZE)); 377 378 if (FRG_SIZE < 64) 379 FRG_SIZE = 64; 380 if (BUF_SIZE < FRG_SIZE) 381 BUF_SIZE = FRG_SIZE; 382 if (BUF_SIZE % FRG_SIZE) 383 BUF_SIZE = BUF_SIZE + FRG_SIZE - (BUF_SIZE % FRG_SIZE); 384 385 TRACE2(("initPtyData using minBufSize %d, maxBufSize %d\n", 386 FRG_SIZE, BUF_SIZE)); 387 388 data = TypeXtMallocX(PtyData, (BUF_SIZE + FRG_SIZE)); 389 390 memset(data, 0, sizeof(*data)); 391 data->next = data->buffer; 392 data->last = data->buffer; 393 *result = data; 394} 395 396/* 397 * Initialize a buffer for the caller, using its data in 'next'. 398 */ 399#if OPT_WIDE_CHARS 400PtyData * 401fakePtyData(PtyData *result, Char *next, Char *last) 402{ 403 PtyData *data = result; 404 405 memset(data, 0, sizeof(*data)); 406 data->next = next; 407 data->last = last; 408 409 return data; 410} 411#endif 412 413/* 414 * Remove used data by shifting the buffer down, to make room for more data, 415 * e.g., a continuation-read. 416 */ 417void 418trimPtyData(XtermWidget xw, PtyData *data) 419{ 420 (void) xw; 421 FlushLog(xw); 422 423 if (data->next != data->buffer) { 424 size_t i; 425 size_t n = (size_t) (data->last - data->next); 426 427 TRACE(("shifting buffer down by %lu\n", (unsigned long) n)); 428 for (i = 0; i < n; ++i) { 429 data->buffer[i] = data->next[i]; 430 } 431 data->next = data->buffer; 432 data->last = data->next + n; 433 } 434 435} 436 437/* 438 * Insert new data into the input buffer so the next calls to morePtyData() 439 * and nextPtyData() will return that. 440 */ 441void 442fillPtyData(XtermWidget xw, PtyData *data, const char *value, size_t length) 443{ 444 size_t size; 445 size_t n; 446 447 /* remove the used portion of the buffer */ 448 trimPtyData(xw, data); 449 450 VTbuffer->last += length; 451 size = (size_t) (VTbuffer->last - VTbuffer->next); 452 453 /* shift the unused portion up to make room */ 454 for (n = size; n >= length; --n) 455 VTbuffer->next[n] = VTbuffer->next[n - length]; 456 457 /* insert the new bytes to interpret */ 458 for (n = 0; n < length; n++) 459 VTbuffer->next[n] = CharOf(value[n]); 460} 461 462#if OPT_WIDE_CHARS 463/* 464 * Convert an ISO-8859-1 code 'c' to UTF-8, storing the result in the target 465 * 'lp', and returning a pointer past the converted character. 466 */ 467Char * 468convertToUTF8(Char *lp, unsigned c) 469{ 470#define CH(n) (Char)((c) >> ((n) * 8)) 471 if (c < 0x80) { 472 /* 0******* */ 473 *lp++ = (Char) CH(0); 474 } else if (c < 0x800) { 475 /* 110***** 10****** */ 476 *lp++ = (Char) (0xc0 | (CH(0) >> 6) | ((CH(1) & 0x07) << 2)); 477 *lp++ = (Char) (0x80 | (CH(0) & 0x3f)); 478 } else if (c < 0x00010000) { 479 /* 1110**** 10****** 10****** */ 480 *lp++ = (Char) (0xe0 | ((int) (CH(1) & 0xf0) >> 4)); 481 *lp++ = (Char) (0x80 | (CH(0) >> 6) | ((CH(1) & 0x0f) << 2)); 482 *lp++ = (Char) (0x80 | (CH(0) & 0x3f)); 483 } else if (c < 0x00200000) { 484 *lp++ = (Char) (0xf0 | ((int) (CH(2) & 0x1f) >> 2)); 485 *lp++ = (Char) (0x80 | 486 ((int) (CH(1) & 0xf0) >> 4) | 487 ((int) (CH(2) & 0x03) << 4)); 488 *lp++ = (Char) (0x80 | (CH(0) >> 6) | ((CH(1) & 0x0f) << 2)); 489 *lp++ = (Char) (0x80 | (CH(0) & 0x3f)); 490 } else if (c < 0x04000000) { 491 *lp++ = (Char) (0xf8 | (CH(3) & 0x03)); 492 *lp++ = (Char) (0x80 | (CH(2) >> 2)); 493 *lp++ = (Char) (0x80 | 494 ((int) (CH(1) & 0xf0) >> 4) | 495 ((int) (CH(2) & 0x03) << 4)); 496 *lp++ = (Char) (0x80 | (CH(0) >> 6) | ((CH(1) & 0x0f) << 2)); 497 *lp++ = (Char) (0x80 | (CH(0) & 0x3f)); 498 } else { 499 *lp++ = (Char) (0xfc | ((int) (CH(3) & 0x40) >> 6)); 500 *lp++ = (Char) (0x80 | (CH(3) & 0x3f)); 501 *lp++ = (Char) (0x80 | (CH(2) >> 2)); 502 *lp++ = (Char) (0x80 | (CH(1) >> 4) | ((CH(2) & 0x03) << 4)); 503 *lp++ = (Char) (0x80 | (CH(0) >> 6) | ((CH(1) & 0x0f) << 2)); 504 *lp++ = (Char) (0x80 | (CH(0) & 0x3f)); 505 } 506 return lp; 507#undef CH 508} 509 510/* 511 * Convert a UTF-8 multibyte character to an Unicode value, returning a pointer 512 * past the converted UTF-8 input. The first 256 values align with ISO-8859-1, 513 * making it possible to use this to convert to Latin-1. 514 * 515 * If the conversion fails, return null. 516 */ 517Char * 518convertFromUTF8(Char *lp, unsigned *cp) 519{ 520 int want; 521 522 /* 523 * Find the number of bytes we will need from the source. 524 */ 525 if ((*lp & 0x80) == 0) { 526 want = 1; 527 } else if ((*lp & 0xe0) == 0xc0) { 528 want = 2; 529 } else if ((*lp & 0xf0) == 0xe0) { 530 want = 3; 531 } else if ((*lp & 0xf8) == 0xf0) { 532 want = 4; 533 } else if ((*lp & 0xfc) == 0xf8) { 534 want = 5; 535 } else if ((*lp & 0xfe) == 0xfc) { 536 want = 6; 537 } else { 538 want = 0; 539 } 540 541 if (want) { 542 int have = 1; 543 544 while (lp[have] != '\0') { 545 if ((lp[have] & 0xc0) != 0x80) 546 break; 547 ++have; 548 } 549 if (want == have) { 550 unsigned mask = 0; 551 int j; 552 int shift = 0; 553 554 *cp = 0; 555 switch (want) { 556 case 1: 557 mask = (*lp); 558 break; 559 case 2: 560 mask = (*lp & 0x1f); 561 break; 562 case 3: 563 mask = (*lp & 0x0f); 564 break; 565 case 4: 566 mask = (*lp & 0x07); 567 break; 568 case 5: 569 mask = (*lp & 0x03); 570 break; 571 case 6: 572 mask = (*lp & 0x01); 573 break; 574 default: 575 mask = 0; 576 break; 577 } 578 579 for (j = 1; j < want; j++) { 580 *cp |= (unsigned) ((lp[want - j] & 0x3f) << shift); 581 shift += 6; 582 } 583 *cp |= mask << shift; 584 lp += want; 585 } else { 586 *cp = BAD_ASCII; 587 lp = NULL; 588 } 589 } else { 590 *cp = BAD_ASCII; 591 lp = NULL; 592 } 593 return lp; 594} 595 596/* 597 * Returns true if the entire string is valid UTF-8. 598 */ 599Boolean 600isValidUTF8(Char *lp) 601{ 602 Boolean result = True; 603 while (*lp) { 604 unsigned ch; 605 Char *next = convertFromUTF8(lp, &ch); 606 if (next == NULL || ch == 0) { 607 result = False; 608 break; 609 } 610 lp = next; 611 } 612 return result; 613} 614 615/* 616 * Write data back to the PTY 617 */ 618void 619writePtyData(int f, IChar *d, size_t len) 620{ 621 size_t n = (len << 1); 622 623 if (VTbuffer->write_len <= len) { 624 VTbuffer->write_len = n; 625 VTbuffer->write_buf = realloc(VTbuffer->write_buf, VTbuffer->write_len); 626 } 627 628 for (n = 0; n < len; n++) 629 VTbuffer->write_buf[n] = (Char) d[n]; 630 631 TRACE(("writePtyData %lu:%s\n", (unsigned long) n, 632 visibleChars(VTbuffer->write_buf, n))); 633 v_write(f, VTbuffer->write_buf, n); 634} 635#endif /* OPT_WIDE_CHARS */ 636 637#ifdef NO_LEAKS 638void 639noleaks_ptydata(void) 640{ 641 if (VTbuffer != 0) { 642#if OPT_WIDE_CHARS 643 free(VTbuffer->write_buf); 644#endif 645 FreeAndNull(VTbuffer); 646 } 647} 648#endif 649 650#ifdef TEST_DRIVER 651 652#include "data.c" 653 654void 655NormalExit(void) 656{ 657 fprintf(stderr, "NormalExit!\n"); 658 exit(EXIT_SUCCESS); 659} 660 661void 662Panic(const char *s, int a) 663{ 664 (void) s; 665 (void) a; 666 fprintf(stderr, "Panic!\n"); 667 exit(EXIT_FAILURE); 668} 669 670#if OPT_WIDE_CHARS 671 672#ifdef ALLOWLOGGING 673void 674FlushLog(XtermWidget xw) 675{ 676 (void) xw; 677} 678#endif 679 680void 681v_write(int f, const Char *data, size_t len) 682{ 683 (void) f; 684 (void) data; 685 (void) len; 686} 687 688void 689mk_wcwidth_init(int mode) 690{ 691 (void) mode; 692} 693 694void 695update_font_utf8_mode(void) 696{ 697} 698 699static int message_level = 0; 700static int opt_all = 0; 701static int opt_illegal = 0; 702static int opt_convert = 0; 703static int opt_reverse = 0; 704static long total_test = 0; 705static long total_errs = 0; 706 707static void 708usage(void) 709{ 710 static const char *msg[] = 711 { 712 "Usage: test_ptydata [options] [c1[-c1b] [c2-[c2b] [...]]]", 713 "", 714 "Options:", 715 " -a exercise all legal encode/decode to/from UTF-8", 716 " -c call convertFromUTF8 rather than decodeUTF8", 717 " -i ignore illegal UTF-8 when testing -r option", 718 " -q quieter", 719 " -r reverse/decode from UTF-8 byte-string to/from Unicode", 720 " -v more verbose" 721 }; 722 size_t n; 723 for (n = 0; n < sizeof(msg) / sizeof(msg[0]); ++n) { 724 fprintf(stderr, "%s\n", msg[n]); 725 } 726 exit(EXIT_FAILURE); 727} 728 729/* 730 * http://www.unicode.org/versions/corrigendum1.html, table 3.1B 731 */ 732#define OkRange(n,lo,hi) \ 733 if (value[n] < lo || value[n] > hi) { \ 734 result = False; \ 735 break; \ 736 } 737static Bool 738is_legal_utf8(const Char *value) 739{ 740 Bool result = True; 741 Char ch; 742 while ((ch = *value) != '\0') { 743 if (ch <= 0x7f) { 744 ++value; 745 } else if (ch >= 0xc2 && ch <= 0xdf) { 746 OkRange(1, 0x80, 0xbf); 747 value += 2; 748 } else if (ch == 0xe0) { 749 OkRange(1, 0xa0, 0xbf); 750 OkRange(2, 0x80, 0xbf); 751 value += 3; 752 } else if (ch >= 0xe1 && ch <= 0xef) { 753 OkRange(1, 0x80, 0xbf); 754 OkRange(2, 0x80, 0xbf); 755 value += 3; 756 } else if (ch == 0xf0) { 757 OkRange(1, 0x90, 0xbf); 758 OkRange(2, 0x80, 0xbf); 759 OkRange(3, 0x80, 0xbf); 760 value += 4; 761 } else if (ch >= 0xf1 && ch <= 0xf3) { 762 OkRange(1, 0x80, 0xbf); 763 OkRange(2, 0x80, 0xbf); 764 OkRange(3, 0x80, 0xbf); 765 value += 4; 766 } else if (ch == 0xf4) { 767 OkRange(1, 0x80, 0x8f); 768 OkRange(2, 0x80, 0xbf); 769 OkRange(3, 0x80, 0xbf); 770 value += 4; 771 } else { 772 result = False; 773 break; 774 } 775 } 776 return result; 777} 778 779static void 780test_utf8_convert(void) 781{ 782 unsigned c_in, c_out; 783 Char buffer[10]; 784 Char *result; 785 unsigned limit = 0x110000; 786 unsigned success = 0; 787 unsigned bucket[256]; 788 789 memset(bucket, 0, sizeof(bucket)); 790 for (c_in = 0; c_in < limit; ++c_in) { 791 memset(buffer, 0, sizeof(buffer)); 792 if ((result = convertToUTF8(buffer, c_in)) == 0) { 793 TRACE(("conversion of U+%04X to UTF-8 failed\n", c_in)); 794 } else { 795 if ((result = convertFromUTF8(buffer, &c_out)) == 0) { 796 TRACE(("conversion of U+%04X from UTF-8 failed\n", c_in)); 797 } else if (c_in != c_out) { 798 TRACE(("conversion of U+%04X to/from UTF-8 gave U+%04X\n", 799 c_in, c_out)); 800 } else { 801 while (result-- != buffer) { 802 bucket[*result]++; 803 } 804 ++success; 805 } 806 } 807 } 808 TRACE(("%u/%u successful\n", success, limit)); 809 for (c_in = 0; c_in < 256; ++c_in) { 810 if ((c_in % 8) == 0) { 811 TRACE((" %02X:", c_in)); 812 } 813 TRACE((" %8X", bucket[c_in])); 814 if (((c_in + 1) % 8) == 0) { 815 TRACE(("\n")); 816 } 817 } 818} 819 820static int 821decode_one(const char *source, char **target) 822{ 823 int result = -1; 824 long check; 825 int radix = 0; 826 if ((source[0] == 'u' || source[0] == 'U') && source[1] == '+') { 827 source += 2; 828 radix = 16; 829 } else if (source[0] == '0' && source[1] == 'b') { 830 source += 2; 831 radix = 2; 832 } 833 check = strtol(source, target, radix); 834 if (*target != NULL && *target != source) 835 result = (int) check; 836 return result; 837} 838 839static int 840decode_range(const char *source, int *lo, int *hi) 841{ 842 int result = 0; 843 char *after1; 844 char *after2; 845 if ((*lo = decode_one(source, &after1)) >= 0) { 846 after1 += strspn(after1, ":-.\t "); 847 if ((*hi = decode_one(after1, &after2)) < 0) { 848 *hi = *lo; 849 } 850 result = 1; 851 } 852 return result; 853} 854 855#define MAX_BYTES 6 856 857static void 858do_range(const char *source) 859{ 860 int lo, hi; 861 862 TScreen screen; 863 memset(&screen, 0, sizeof(screen)); 864 865 if (decode_range(source, &lo, &hi)) { 866 while (lo <= hi) { 867 unsigned c_in = (unsigned) lo++; 868 PtyData *data; 869 Char *next; 870 Char buffer[MAX_BYTES + 1]; 871 872 if (opt_reverse) { 873 Bool skip = False; 874 Bool first = True; 875 int j, k; 876 for (j = 0; j < MAX_BYTES; ++j) { 877 unsigned long bits = ((unsigned long) c_in >> (8 * j)); 878 if ((buffer[j] = (Char) bits) == 0) { 879 skip = (bits != 0); 880 break; 881 } 882 } 883 if (skip) 884 continue; 885 initPtyData(&data); 886 for (k = 0; k <= j; ++k) { 887 data->buffer[k] = buffer[j - k - 1]; 888 } 889 if (opt_illegal && !is_legal_utf8(data->buffer)) { 890 free(data); 891 continue; 892 } 893 if (message_level > 1) { 894 printf("TEST "); 895 for (k = 0; k < j; ++k) { 896 printf("%02X", data->buffer[k]); 897 } 898 } 899 data->next = data->buffer; 900 data->last = data->buffer + j; 901 while (decodeUtf8(&screen, data)) { 902 total_test++; 903 if (is_UCS_SPECIAL(data->utf_data)) 904 total_errs++; 905 data->next += data->utf_size; 906 if (message_level > 1) { 907 printf("%s%04X", first ? " ->" : ", ", data->utf_data); 908 } 909 first = False; 910 } 911 if (!first) 912 total_test--; 913 if (message_level > 1) { 914 printf("\n"); 915 fflush(stdout); 916 } 917 free(data); 918 } else if (opt_convert) { 919 unsigned c_out; 920 Char *result; 921 922 memset(buffer, 0, sizeof(buffer)); 923 if ((result = next = convertToUTF8(buffer, c_in)) == 0) { 924 fprintf(stderr, 925 "conversion of U+%04X to UTF-8 failed\n", c_in); 926 } else if ((result = convertFromUTF8(buffer, &c_out)) == 0) { 927 fprintf(stderr, 928 "conversion of U+%04X from UTF-8 failed\n", c_in); 929 total_errs++; 930 } else if (c_in != c_out) { 931 fprintf(stderr, 932 "conversion of U+%04X to/from UTF-8 gave U+%04X\n", 933 c_in, c_out); 934 } else if (message_level > 1) { 935 *next = '\0'; 936 printf("TEST %04X (%lu:%s) ->%04X\n", c_in, 937 (unsigned long) (next - buffer), 938 buffer, 939 c_out); 940 fflush(stdout); 941 } 942 } else { 943 initPtyData(&data); 944 next = convertToUTF8(data->buffer, c_in); 945 *next = 0; 946 data->next = data->buffer; 947 data->last = next; 948 decodeUtf8(&screen, data); 949 if (message_level > 1) { 950 printf("TEST %04X (%lu:%s) ->%04X\n", c_in, 951 (unsigned long) (next - data->buffer), 952 data->buffer, 953 data->utf_data); 954 fflush(stdout); 955 } 956 if (c_in != data->utf_data) { 957 fprintf(stderr, "Mismatch: %04X vs %04X\n", c_in, data->utf_data); 958 total_errs++; 959 } 960 free(data); 961 } 962 total_test++; 963 } 964 } 965} 966 967int 968main(int argc, char **argv) 969{ 970 int ch; 971 972 setlocale(LC_ALL, ""); 973 while ((ch = getopt(argc, argv, "aciqrv")) != -1) { 974 switch (ch) { 975 case 'a': 976 opt_all = 1; 977 break; 978 case 'c': 979 opt_convert = 1; 980 break; 981 case 'i': 982 opt_illegal = 1; 983 break; 984 case 'q': 985 message_level--; 986 break; 987 case 'r': 988 opt_reverse = 1; 989 break; 990 case 'v': 991 message_level++; 992 break; 993 default: 994 usage(); 995 } 996 } 997 if (opt_all) { 998 test_utf8_convert(); 999 } else { 1000 if (optind >= argc) 1001 usage(); 1002 while (optind < argc) { 1003 do_range(argv[optind++]); 1004 } 1005 if (total_test) { 1006 printf("%ld/%ld mismatches (%.0f%%)\n", 1007 total_errs, 1008 total_test, 1009 (100.0 * (double) total_errs) / (double) total_test); 1010 } 1011 } 1012 return EXIT_SUCCESS; 1013} 1014#else 1015int 1016main(int argc, char **argv) 1017{ 1018 (void) argc; 1019 (void) argv; 1020 printf("Nothing to be done here...\n"); 1021 return EXIT_SUCCESS; 1022} 1023#endif /* OPT_WIDE_CHARS */ 1024#endif 1025