1/* 2 * Copyright (c) 2002 by The XFree86 Project, Inc. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice shall be included in 12 * all copies or substantial portions of the Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 * THE XFREE86 PROJECT BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 18 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF 19 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 20 * SOFTWARE. 21 * 22 * Except as contained in this notice, the name of the XFree86 Project shall 23 * not be used in advertising or otherwise to promote the sale, use or other 24 * dealings in this Software without prior written authorization from the 25 * XFree86 Project. 26 * 27 * Author: Paulo César Pereira de Andrade 28 */ 29 30/* $XFree86: xc/programs/xedit/lisp/re/re.c,v 1.8 2002/11/17 07:51:30 paulo Exp $ */ 31 32#include <stdio.h> 33#include "rep.h" 34 35/* 36 * Types 37 */ 38 39/* Information used when generating the final form of the compiled re. 40 */ 41struct _re_inf { 42 rec_alt *alt; 43 unsigned char *cod; 44 long len; 45 long spc; 46 47 /* Start offset of special repetition instruction */ 48 long sr[MAX_DEPTH]; 49 50 /* Jump offset of special repetition instruction */ 51 long sj[MAX_DEPTH]; 52 53 /* Just a flag, to know if this nesting is for a special repetition */ 54 char sp[MAX_DEPTH]; 55 56 int bas; /* Alternatives/repetitions depth */ 57 int par; /* Open parenthesis counter */ 58 int ref; /* Backreference counter */ 59 60 rec_pat *apat; /* Alternatives duplicate patterns 61 * if a special repetition is found, 62 * this is done to somewhat simplify 63 * the bytecode engine and still allow 64 * some complex (and time consuming) 65 * patterns. */ 66 67 int flags; 68 int ecode; 69}; 70 71/* This structure is not associated with re_cod as it's data only matters 72 * to the current match search. 73 */ 74struct _re_eng { 75 unsigned char *bas; /* Base string pointer */ 76 unsigned char *str; /* String to search for pattern */ 77 unsigned char *end; /* Where to stop searching */ 78 unsigned char *cod; /* Pointer in the re_cod structure */ 79 long off; /* Number of used entries in so/eo etc */ 80 81 /* Match offset/nesting information */ 82 long so[MAX_DEPTH]; /* (s)tart of (m)atch */ 83 long eo[MAX_DEPTH]; /* (e)nd of (m)atch */ 84 long sv[MAX_DEPTH]; /* (s)a(v)e match end offset */ 85 long re[MAX_DEPTH]; /* (re)petition count */ 86 long ss[MAX_DEPTH]; /* (s)ave (s)tart of match */ 87 unsigned char *rcod[MAX_DEPTH]; /* restart position in regex code */ 88 unsigned char *rstr[MAX_DEPTH]; /* restart position in string */ 89 90 /* Group/backreference information */ 91 long goff; 92 long gso[9]; 93 long geo[9]; 94}; 95 96/* 97 * Prototypes 98 */ 99static void reinit(void); 100static int rec_check(re_inf*, int); 101static int rec_code(re_inf*, ReCode); 102static int rec_byte(re_inf*, int); 103static int rec_byte_byte(re_inf*, int, int); 104static int rec_code_byte(re_inf*, ReCode, int); 105static int rec_length(re_inf*, int); 106static int rec_code_byte_byte(re_inf*, ReCode, int, int); 107static int rec_build_alt(re_inf*, rec_alt*); 108static int rec_build_pat(re_inf*, rec_pat*); 109static int rec_build_rng(re_inf*, rec_rng*); 110static int rec_build_grp(re_inf*, rec_grp*); 111static int rec_build_stl(re_inf*, rec_stl*); 112static int rec_build_rep(re_inf*, rec_rep*); 113static int rec_inc_spc(re_inf*); 114static int rec_dec_spc(re_inf*); 115static int rec_add_spc(re_inf*, int); 116static int rec_off_spc(re_inf*); 117static int rec_alt_spc(re_inf*, int); 118static int rec_rep_spc(re_inf*, int); 119#ifdef DEBUG 120static void redump(re_cod*); 121#endif 122 123/* 124 * Initialization 125 */ 126unsigned char re__alnum[256]; 127unsigned char re__odigit[256]; 128unsigned char re__ddigit[256]; 129unsigned char re__xdigit[256]; 130unsigned char re__control[256]; 131 132/* 133 * Implementation 134 */ 135int 136recomp(re_cod *preg, const char *pattern, int flags) 137{ 138 int i, ecode; 139 re_inf inf; 140 141 reinit(); 142 143 preg->cod = NULL; 144 inf.alt = irec_comp(pattern, 145 flags & RE_PEND ? preg->re_endp : 146 pattern + strlen(pattern), 147 flags, &ecode); 148 if (ecode != 0) 149 return (ecode); 150 151 inf.cod = NULL; 152 inf.len = inf.spc = 0; 153 inf.bas = 0; 154 inf.par = 0; 155 inf.ref = 0; 156 inf.apat = NULL; 157 inf.flags = flags; 158 inf.ecode = 0; 159 for (i = 0; i < MAX_DEPTH; i++) 160 inf.sp[i] = 0; 161 162 /* First byte is runtime modifier flags */ 163 if (rec_byte(&inf, flags & (RE_NEWLINE | RE_NOSUB)) == 0 && 164 rec_byte(&inf, 0xff) == 0 && 165 rec_build_alt(&inf, inf.alt) == 0 && 166 rec_rep_spc(&inf, 0) == 0 && 167 rec_code(&inf, Re_Done) == 0) { 168 /* Number of possible references, loops will not leave this 169 * value correct, but it is cheap to read it from the second 170 * byte, instead of adding several extra checks in the bytecode. */ 171 if (inf.ref) 172 inf.cod[1] = inf.ref - 1; 173 preg->cod = inf.cod; 174 /* Public structure member */ 175 preg->re_nsub = inf.ref; 176 } 177 178 irec_free_alt(inf.alt); 179 if (inf.ecode) 180 free(inf.cod); 181#ifdef DEBUG 182 else if (flags & RE_DUMP) 183 redump(preg); 184#endif 185 186 return (inf.ecode); 187} 188 189int 190reexec(const re_cod *preg, const char *string, 191 int nmatch, re_mat pmat[], int flags) 192{ 193 unsigned char *ptr, *str, newline, nosub; 194 int len, si, ci, bas, i, j, k, l, m; 195 re_eng eng; 196 197 if (preg == NULL || preg->cod == NULL || nmatch < 0 || 198 ((flags & RE_STARTEND) && 199 (pmat == NULL || pmat[0].rm_eo < pmat[0].rm_so))) 200 return (RE_INVARG); 201 202 eng.str = (unsigned char*)string; 203 if (flags & RE_STARTEND) { 204 eng.end = eng.str + pmat[0].rm_eo; 205 eng.str += pmat[0].rm_so; 206 } 207 else 208 eng.end = eng.str + strlen(string); 209 eng.bas = eng.str; 210 nosub = preg->cod[0] & RE_NOSUB; 211 newline = preg->cod[0] & RE_NEWLINE; 212 eng.cod = preg->cod + 2; 213 214 if (!nosub && preg->cod[1] != 0xff) { 215 for (i = 0; i <= preg->cod[1]; i++) { 216 eng.gso[i] = 0; 217 eng.geo[i] = -1; 218 } 219 } 220 221 /* Setup to search for start of match from the first character */ 222 eng.so[0] = 0; 223 eng.eo[0] = eng.sv[0] = -1; 224 eng.rcod[0] = eng.cod; 225 eng.rstr[0] = eng.str + 1; 226 eng.off = 0; 227 eng.goff = -1; 228 for (ci = si = 1;;) { 229reset: 230 switch (*eng.cod) { 231 /**************************************************** 232 * One byte codes * 233 ****************************************************/ 234 case Re_Any: 235 if (eng.str == eng.end || (newline && eng.str[0] == '\n')) 236 goto fail; 237 goto match; 238 case Re_AnyEatAnyTimes: 239 if (newline) { 240 for (ptr = eng.str; ptr < eng.end; ptr++) { 241 if (*ptr == '\n') 242 break; 243 } 244 si = ptr - eng.str; 245 } 246 else 247 si = eng.end - eng.str; 248 goto match; 249 case Re_AnyEatMaybe: 250 si = eng.end > eng.str; 251 if (newline && si && eng.str[0] == '\n') 252 si = 0; 253 goto match; 254 case Re_AnyEatAtLeast: 255 if (newline) { 256 for (ptr = eng.str; ptr < eng.end; ptr++) { 257 if (*ptr == '\n') 258 break; 259 } 260 si = ptr - eng.str; 261 } 262 else 263 si = eng.end - eng.str; 264 if (si == 0) { 265 si = 1; 266 goto fail; 267 } 268 goto match; 269 case Re_Odigit: 270 if (eng.str >= eng.end) 271 goto fail; 272 if (re__odigit[eng.str[0]]) 273 goto match; 274 goto fail; 275 case Re_OdigitNot: 276 if (eng.str >= eng.end || re__odigit[eng.str[0]]) 277 goto fail; 278 goto match; 279 case Re_Digit: 280 if (eng.str >= eng.end) 281 goto fail; 282 if (re__ddigit[eng.str[0]]) 283 goto match; 284 goto fail; 285 case Re_DigitNot: 286 if (eng.str >= eng.end || re__ddigit[eng.str[0]]) 287 goto fail; 288 goto match; 289 case Re_Xdigit: 290 if (eng.str >= eng.end) 291 goto fail; 292 if (re__xdigit[eng.str[0]]) 293 goto match; 294 goto fail; 295 case Re_XdigitNot: 296 if (eng.str >= eng.end || re__xdigit[eng.str[0]]) 297 goto fail; 298 goto match; 299 case Re_Space: 300 if (eng.str >= eng.end) 301 goto fail; 302 if (eng.str[0] == ' ' || eng.str[0] == '\t') 303 goto match; 304 goto fail; 305 case Re_SpaceNot: 306 if (eng.str >= eng.end) 307 goto fail; 308 if (eng.str[0] != ' ' && eng.str[0] != '\t') 309 goto match; 310 goto fail; 311 case Re_Tab: 312 if (eng.str >= eng.end) 313 goto fail; 314 if (eng.str[0] == '\t') 315 goto match; 316 goto fail; 317 case Re_Newline: 318 if (eng.str >= eng.end) 319 goto fail; 320 if (eng.str[0] == '\n') 321 goto match; 322 goto fail; 323 case Re_Lower: 324 if (eng.str >= eng.end) 325 goto fail; 326 if (eng.str[0] >= 'a' && eng.str[0] <= 'z') 327 goto match; 328 goto fail; 329 case Re_Upper: 330 if (eng.str >= eng.end) 331 goto fail; 332 if (eng.str[0] >= 'A' && eng.str[0] <= 'Z') 333 goto match; 334 goto fail; 335 case Re_Alnum: 336 if (eng.str >= eng.end) 337 goto fail; 338 if (re__alnum[eng.str[0]]) 339 goto match; 340 goto fail; 341 case Re_AlnumNot: 342 if (eng.str >= eng.end) 343 goto fail; 344 if (re__alnum[eng.str[0]]) 345 goto fail; 346 goto match; 347 case Re_Control: 348 if (eng.str >= eng.end) 349 goto fail; 350 if (re__control[eng.str[0]]) 351 goto match; 352 goto fail; 353 case Re_ControlNot: 354 if (eng.str >= eng.end || re__control[eng.str[0]]) 355 goto fail; 356 goto match; 357 358 /**************************************************** 359 * One byte codes, match special emtpy strings * 360 ****************************************************/ 361 case Re_Bol: 362 if (eng.str == eng.bas) { 363 if ((flags & RE_NOTBOL)) { 364 /* String does not start at the beginning of a line */ 365 if (newline) 366 goto fail; 367 goto wont; 368 } 369 si = 0; 370 goto match; 371 } 372 if (newline && eng.str[-1] == '\n') { 373 si = 0; 374 goto match; 375 } 376 goto fail; 377 case Re_Eol: 378 if (eng.str == eng.end) { 379 if (flags & RE_NOTEOL) 380 /* String does not finish at the end of a line */ 381 goto wont; 382 si = 0; 383 goto match; 384 } 385 if (newline && eng.str[0] == '\n') { 386 si = 0; 387 goto match; 388 } 389 goto fail; 390 case Re_Bow: 391 if (eng.str >= eng.end || 392 (eng.str > eng.bas && 393 (re__alnum[eng.str[-1]]))) 394 goto fail; 395 if (re__alnum[eng.str[0]]) { 396 si = 0; 397 goto match; 398 } 399 goto fail; 400 case Re_Eow: 401 if (eng.str == eng.bas || 402 (eng.str < eng.end && 403 re__alnum[eng.str[0]])) 404 goto fail; 405 if (re__alnum[eng.str[-1]]) { 406 si = 0; 407 goto match; 408 } 409 goto fail; 410 411 /**************************************************** 412 * One byte code, one byte argument * 413 ****************************************************/ 414 case Re_Literal: 415 if (eng.str >= eng.end) 416 goto fail; 417 if (eng.str[0] == eng.cod[1]) { 418 ci = 2; 419 goto match; 420 } 421 goto fail; 422 case Re_LiteralNot: 423 if (eng.str >= eng.end) 424 goto fail; 425 if (eng.str[0] != eng.cod[1]) { 426 ci = 2; 427 goto match; 428 } 429 goto fail; 430 case Re_SearchLiteral: 431 for (str = eng.str; str < eng.end; str++) { 432 if (*str == eng.cod[1]) { 433 ci = 2; 434 eng.str = str; 435 goto match; 436 } 437 } 438 /* This bytecode only happens in the toplevel */ 439 eng.so[0] = str - eng.bas; 440 eng.str = str; 441 goto fail; 442 443 /**************************************************** 444 * One byte code, two bytes argument * 445 ****************************************************/ 446 case Re_CaseLiteral: 447 if (eng.str >= eng.end) 448 goto fail; 449 if (eng.str[0] == eng.cod[1] || eng.str[0] == eng.cod[2]) { 450 ci = 3; 451 goto match; 452 } 453 goto fail; 454 case Re_CaseLiteralNot: 455 if (eng.str >= eng.end) 456 goto fail; 457 if (eng.str[0] != eng.cod[1] && eng.str[0] != eng.cod[2]) { 458 ci = 3; 459 goto match; 460 } 461 goto fail; 462 case Re_SearchCaseLiteral: 463 for (str = eng.str; str < eng.end; str++) { 464 if (*str == eng.cod[1] || *str == eng.cod[2]) { 465 ci = 3; 466 eng.str = str; 467 goto match; 468 } 469 } 470 eng.so[0] = str - eng.bas; 471 eng.str = str; 472 goto fail; 473 474 /**************************************************** 475 * One byte codes, two arguments, n bytes * 476 ****************************************************/ 477 case Re_String: 478 len = eng.cod[1]; 479 if (len & 0x80) { 480 i = 3; 481 len = (len & 0x7f) + (eng.cod[2] << 7); 482 } 483 else 484 i = 2; 485 if (eng.end - eng.str < len) 486 goto fail; 487 ptr = eng.cod + i; 488 str = eng.str; 489 for (k = len; k > 0; k--) { 490 if (*ptr++ != *str++) 491 goto fail; 492 } 493 ci = i + len; 494 si = len; 495 goto match; 496 case Re_SearchString: 497 len = eng.cod[1]; 498 if (len & 0x80) { 499 i = 3; 500 len = (len & 0x7f) + (eng.cod[2] << 7); 501 } 502 else 503 i = 2; 504 for (str = eng.str; eng.end - str >= len; str = eng.str++) { 505 for (ptr = eng.cod + i, str = eng.str, k = len; k > 0; k--) 506 if (*ptr++ != *str++) 507 break; 508 if (k == 0) { 509 /* Substring found */ 510 ci = i + len; 511 si = str - eng.str; 512 goto match; 513 } 514 } 515 eng.so[0] = eng.end - eng.bas; 516 eng.str = eng.end; 517 goto fail; 518 519 case Re_CaseString: 520 len = eng.cod[1]; 521 if (len & 0x80) { 522 i = 3; 523 len = (len & 0x7f) + (eng.cod[2] << 7); 524 } 525 else 526 i = 2; 527 528 len >>= 1; 529 /* Check if there are at least len/2 bytes left, string 530 * is represented as two bytes, lower and upper case */ 531 if (eng.end - eng.str < len) 532 goto fail; 533 ptr = eng.cod + i; 534 str = eng.str; 535 for (k = len; k > 0; str++, ptr += 2, k--) { 536 if (*str != ptr[0] && *str != ptr[1]) 537 goto fail; 538 } 539 ci = i + (len << 1); 540 si = len; 541 goto match; 542 case Re_SearchCaseString: 543 len = eng.cod[1]; 544 if (len & 0x80) { 545 i = 3; 546 len = (len & 0x7f) + (eng.cod[2] << 7); 547 } 548 else 549 i = 2; 550 len >>= 1; 551 for (str = eng.str; eng.end - str >= len; str = eng.str++) { 552 for (ptr = eng.cod + i, str = eng.str, k = len; 553 k > 0; k--, ptr += 2, str++) 554 if (ptr[0] != str[0] && ptr[1] != str[0]) 555 break; 556 if (k == 0) { 557 /* Substring found */ 558 ci = i + (len << 1); 559 si = str - eng.str; 560 goto match; 561 } 562 } 563 eng.so[0] = eng.end - eng.bas; 564 eng.str = eng.end; 565 goto fail; 566 567 case Re_StringList: 568 /* Number of strings */ 569 k = eng.cod[1]; 570 571 /* Where to jump after match */ 572 bas = eng.cod[2] | (eng.cod[3] << 8); 573 574 str = eng.str; 575 ptr = eng.cod + k + 4; 576 l = eng.end - eng.str; 577 for (j = 0; j < k; j++) { 578 len = eng.cod[j + 4]; 579 if (len <= l) { 580 for (i = 0; i < len; i++) 581 if (ptr[i] != str[i]) 582 goto next_stl; 583 goto stl_match; 584 } 585next_stl: 586 ptr += len; 587 } 588 goto fail; 589stl_match: 590 ci = bas; 591 si = len; 592 goto match; 593 594 case Re_CaseStringList: 595 /* Number of strings */ 596 k = eng.cod[1]; 597 598 /* Where to jump after match */ 599 bas = eng.cod[2] | (eng.cod[3] << 8); 600 601 str = eng.str; 602 ptr = eng.cod + k + 4; 603 l = eng.end - eng.str; 604 for (j = 0; j < k; j++) { 605 len = eng.cod[j + 4]; 606 if ((len >> 1) <= l) { 607 for (i = m = 0; i < len; m++, i += 2) 608 if (ptr[i] != str[m] && ptr[i + 1] != str[m]) 609 goto next_cstl; 610 goto cstl_match; 611 } 612next_cstl: 613 ptr += len; 614 } 615 goto fail; 616cstl_match: 617 ci = bas; 618 si = len >> 1; 619 goto match; 620 621 622 case Re_LargeStringList: 623 /* Where to jump after match */ 624 bas = eng.cod[1] | (eng.cod[2] << 8); 625 626 str = eng.str; 627 628 /* First entry in index map */ 629 ptr = eng.cod + 3; 630 i = (int)str[0] << 1; 631 j = ptr[i] | (ptr[i + 1] << 8); 632 if (j == 0xffff) 633 /* No entry with this byte */ 634 goto fail; 635 636 /* Bytes left in input */ 637 l = eng.end - eng.str; 638 639 /* First entry matching initial byte */ 640 ptr += 512 + j; 641 642 for (len = ptr[0]; 643 str[0] == ptr[1]; 644 ptr += len + 1, len = ptr[0]) { 645 if (len <= l) { 646 for (i = 1; i < len; i++) { 647 if (ptr[i + 1] != str[i]) 648 goto next_lstl; 649 } 650 ci = bas; 651 si = len; 652 goto match; 653 } 654next_lstl:; 655 } 656 goto fail; 657 658 case Re_LargeCaseStringList: 659 /* Where to jump after match */ 660 bas = eng.cod[1] | (eng.cod[2] << 8); 661 662 str = eng.str; 663 664 /* First entry in index map */ 665 ptr = eng.cod + 3; 666 i = (int)str[0] << 1; 667 j = ptr[i] | (ptr[i + 1] << 8); 668 if (j == 0xffff) 669 /* No entry with this byte */ 670 goto fail; 671 672 /* Bytes left in input */ 673 l = eng.end - eng.str; 674 675 /* First entry matching initial byte */ 676 ptr += 512 + j; 677 678 for (len = ptr[0]; 679 str[0] == ptr[1] || str[0] == ptr[2]; 680 ptr += len + 1, len = ptr[0]) { 681 if ((k = (len >> 1)) <= l) { 682 for (i = 2, j = 1; i < len; i += 2, j++) { 683 if (ptr[i + 1] != str[j] && ptr[i + 2] != str[j]) 684 goto next_lcstl; 685 } 686 ci = bas; 687 si = k; 688 goto match; 689 } 690next_lcstl:; 691 } 692 goto fail; 693 694 695 /**************************************************** 696 * Character range matching * 697 ****************************************************/ 698 case Re_Range: 699 if (eng.str < eng.end && eng.cod[eng.str[0] + 1]) { 700 ci = 257; 701 goto match; 702 } 703 goto fail; 704 case Re_RangeNot: 705 if (eng.str >= eng.end || eng.cod[eng.str[0] + 1]) 706 goto fail; 707 ci = 257; 708 goto match; 709 710 /**************************************************** 711 * Group handling * 712 ****************************************************/ 713 case Re_Open: 714 if (++eng.goff >= 9) 715 return (RE_ASSERT); 716 eng.gso[eng.goff] = eng.str - eng.bas; 717 ++eng.cod; 718 continue; 719 case Re_Close: 720 eng.geo[eng.goff] = eng.str - eng.bas; 721 ++eng.cod; 722 continue; 723 case Re_Update: 724 bas = eng.cod[1]; 725 eng.geo[eng.goff] = eng.str - eng.bas; 726 eng.cod += 2; /* + Update + bas */ 727 continue; 728 729 /**************************************************** 730 * Backreference * 731 ****************************************************/ 732 case Re_Backref: 733 i = eng.cod[1]; 734 j = eng.gso[i]; 735 k = eng.geo[i]; 736 len = k - j; 737 if (k < j || eng.end - eng.str < len) 738 goto fail; 739 ptr = eng.bas + j; 740 str = eng.str; 741 for (l = len; l > 0; l--) { 742 if (*ptr++ != *str++) 743 goto fail; 744 } 745 ci = 2; 746 si = len; 747 goto match; 748 749 /**************************************************** 750 * Alternatives handling * 751 ****************************************************/ 752 case Re_Alt: 753 bas = eng.off; 754 if (++eng.off >= MAX_DEPTH) 755 return (RE_ASSERT); 756 757 /* Get offset of next alternative */ 758 i = eng.cod[1] | (eng.cod[2] << 8); 759 760 /* Setup for next alternative if the current fails */ 761 eng.rcod[eng.off] = eng.cod + i + 1; /* + Alt */ 762 763 /* If fail, test the next alternative in the same string */ 764 eng.rstr[eng.off] = eng.str; 765 766 /* Setup match offsets */ 767 if (eng.so[bas] <= eng.eo[bas]) 768 eng.so[eng.off] = eng.eo[bas]; 769 else 770 eng.so[eng.off] = eng.so[bas]; 771 eng.sv[eng.off] = eng.eo[eng.off] = eng.so[eng.off] - 1; 772 773 /* Save start of possible previous matches */ 774 eng.ss[eng.off] = eng.so[bas]; 775 776 /* Skip code */ 777 eng.cod += 3; 778 779 /* Go try the first alternative */ 780 continue; 781 782 case Re_AltNext: 783 bas = eng.off - 1; 784 /* Check if matched and if it is a better match */ 785 if (eng.eo[eng.off] >= eng.so[eng.off] && 786 eng.sv[eng.off] - eng.so[eng.off] < 787 eng.eo[eng.off] - eng.so[eng.off]) 788 eng.sv[eng.off] = eng.eo[eng.off]; 789 790 /* Get offset of next alternative */ 791 i = eng.cod[1] | (eng.cod[2] << 8); 792 793 /* Setup for next alternative if the current fails */ 794 eng.rcod[eng.off] = eng.cod + i + 1; /* + AltNext */ 795 796 /* Setup match offset */ 797 eng.eo[eng.off] = eng.so[eng.off] - 1; 798 799 /* Reset string for next alternative */ 800 eng.str = eng.rstr[eng.off]; 801 802 /* Skip code */ 803 eng.cod += 3; 804 805 /* Go try the next alternative */ 806 continue; 807 808 case Re_AltDone: 809 bas = eng.off - 1; 810 /* Check if matched and if it is a better match */ 811 if (eng.sv[eng.off] - eng.so[eng.off] < 812 eng.eo[eng.off] - eng.so[eng.off]) 813 eng.sv[eng.off] = eng.eo[eng.off]; 814 815 /* If there is a match */ 816 if (eng.sv[eng.off] >= eng.so[eng.off]) { 817 eng.so[bas] = eng.ss[eng.off]; 818 eng.eo[bas] = eng.sv[eng.off]; 819 eng.str = eng.bas + eng.eo[bas]; 820 821 /* Pop stack and skip code */ 822 --eng.off; 823 ++eng.cod; 824 825 /* Go try next regular expression pattern */ 826 continue; 827 } 828 829 /* Failed, reset string and pop stack */ 830 eng.str = eng.rstr[eng.off]; 831 --eng.off; 832 goto fail; 833 834 835 /**************************************************** 836 * Repetition * 837 ****************************************************/ 838 839 /* Note that the repetition counter is not 840 * updated for <re>*, <re>+ and <re>? 841 * it is easy to updated, but since it is not 842 * really useful, code to do it was removed 843 * to save a few cpu cicles. */ 844 845#define REPETITION_SETUP() \ 846 if (++eng.off >= MAX_DEPTH) \ 847 return (RE_ASSERT); \ 848 \ 849 /* Return here for recovery if match fail */ \ 850 eng.rcod[eng.off] = eng.cod; \ 851 \ 852 /* Setup match offsets */ \ 853 if (eng.so[bas] <= eng.eo[bas]) \ 854 eng.so[eng.off] = eng.eo[bas]; \ 855 else \ 856 eng.so[eng.off] = eng.so[bas]; \ 857 eng.ss[eng.off] = eng.so[bas]; \ 858 eng.sv[eng.off] = eng.eo[eng.off] = eng.so[eng.off] - 1;\ 859 \ 860 /* Skip repetition instruction */ \ 861 eng.cod += 4; 862 863 864 case Re_AnyTimes: 865 bas = eng.cod[1]; 866 if (eng.off == bas) { 867 /* First iteration */ 868 REPETITION_SETUP(); 869 } 870 else { 871 if (eng.eo[eng.off] >= eng.so[eng.off] && 872 eng.eo[eng.off] > eng.sv[eng.off]) { 873 /* Update offset of match */ 874 eng.sv[eng.off] = eng.eo[eng.off]; 875 876 /* Skip repetition instruction */ 877 eng.cod += 4; 878 } 879 else { 880 /* Match failed but it is ok */ 881 len = eng.cod[2] | (eng.cod[3] << 8); 882 eng.so[bas] = eng.ss[eng.off]; 883 if (eng.sv[eng.off] >= eng.so[eng.off]) 884 /* Something matched earlier, update */ 885 eng.eo[bas] = eng.sv[eng.off]; 886 else if (eng.eo[bas] < eng.so[bas]) 887 /* Empty match */ 888 eng.eo[bas] = eng.so[bas]; 889 890 /* Try next pattern at correct offset */ 891 eng.str = eng.bas + eng.eo[bas]; 892 893 /* Pop stack and skip code */ 894 --eng.off; 895 eng.cod += len; 896 } 897 } 898 continue; 899 900 case Re_Maybe: 901 bas = eng.cod[1]; 902 if (eng.off == bas) { 903 /* First iteration */ 904 REPETITION_SETUP(); 905 } 906 else { 907 /* Matched or first iteration is done */ 908 len = eng.cod[2] | (eng.cod[3] << 8); 909 eng.so[bas] = eng.ss[eng.off]; 910 if (eng.eo[eng.off] > eng.so[eng.off]) { 911 /* Something matched earlier, update */ 912 eng.eo[bas] = eng.eo[eng.off]; 913 eng.str = eng.bas + eng.eo[bas]; 914 /* Don't need to update counter */ 915 } 916 else { 917 /* Empty match */ 918 if (eng.eo[bas] < eng.so[bas]) 919 eng.eo[bas] = eng.so[bas]; 920 921 /* Try next pattern at correct offset */ 922 eng.str = eng.bas + eng.eo[bas]; 923 } 924 925 /* Pop stack */ 926 --eng.off; 927 928 /* Skip code */ 929 eng.cod += len; 930 } 931 continue; 932 933 case Re_AtLeast: 934 bas = eng.cod[1]; 935 if (eng.off == bas) { 936 /* First iteration */ 937 REPETITION_SETUP(); 938 } 939 else { 940 if (eng.eo[eng.off] >= eng.so[eng.off] && 941 eng.eo[eng.off] > eng.sv[eng.off]) { 942 /* Update offset of match */ 943 eng.sv[eng.off] = eng.eo[eng.off]; 944 945 /* Skip repetition instruction */ 946 eng.cod += 4; 947 } 948 else { 949 /* Last try failed */ 950 len = eng.cod[2] | (eng.cod[3] << 8); 951 if (eng.sv[eng.off] >= eng.so[eng.off]) { 952 /* Something matched earlier, update */ 953 eng.so[bas] = eng.ss[eng.off]; 954 eng.eo[bas] = eng.sv[eng.off]; 955 eng.str = eng.bas + eng.eo[bas]; 956 } 957 else { 958 /* Do it here, so that the fail label does 959 * not need to do too expensive work for 960 * simple patterns. */ 961 eng.so[bas] = eng.str - eng.bas; 962 963 /* Zero matches, pop stack and restart */ 964 --eng.off; 965 goto fail; 966 } 967 968 /* Pop stack and skip code */ 969 --eng.off; 970 eng.cod += len; 971 } 972 } 973 continue; 974 975 976 /**************************************************** 977 * Repetition with arguments * 978 ****************************************************/ 979 case Re_Exact: 980#define COMPLEX_REPETITION_SETUP_0() \ 981 i = eng.cod[1]; \ 982 bas = eng.cod[2]; 983 984#define COMPLEX_REPETITION_SETUP() \ 985 /* First iteration */ \ 986 if (++eng.off >= MAX_DEPTH) \ 987 return (RE_ASSERT); \ 988 \ 989 /* Remeber number or repetitions */ \ 990 eng.re[eng.off] = 0; \ 991 \ 992 /* Return here for recovery if match fail */ \ 993 eng.rcod[eng.off] = eng.cod; \ 994 \ 995 /* Setup match offsets */ \ 996 if (eng.so[bas] <= eng.eo[bas]) \ 997 eng.so[eng.off] = eng.eo[bas]; \ 998 else \ 999 eng.so[eng.off] = eng.so[bas]; \ 1000 eng.sv[eng.off] = eng.eo[eng.off] = eng.so[eng.off] - 1;\ 1001 eng.ss[eng.off] = eng.so[bas]; \ 1002 \ 1003 /* Skip repetition instruction */ \ 1004 eng.cod += 5; 1005 1006 COMPLEX_REPETITION_SETUP_0(); 1007 if (eng.off == bas) { 1008 /* First iteration */ 1009 COMPLEX_REPETITION_SETUP(); 1010 } 1011 else { 1012 if (eng.eo[eng.off] >= eng.so[eng.off] && 1013 eng.eo[eng.off] > eng.sv[eng.off]) { 1014 /* Update offset of match */ 1015 eng.sv[eng.off] = eng.eo[eng.off]; 1016 1017 /* Update repetition counter */ 1018 if (++eng.re[eng.off] == i) { 1019 /* Matched the required times */ 1020 eng.so[bas] = eng.ss[eng.off]; 1021 eng.eo[bas] = eng.sv[eng.off]; 1022 eng.str = eng.bas + eng.eo[bas]; 1023 1024 /* Update code */ 1025 k = eng.cod[3] | (eng.cod[4] << 8); 1026 eng.cod += k; 1027 1028 /* Pop stack and go for next pattern */ 1029 --eng.off; 1030 continue; 1031 } 1032 1033 /* Skip repetition instruction */ 1034 eng.cod += 5; 1035 } 1036 else { 1037 /* Do it here, so that the fail label does 1038 * not need to do too expensive work for 1039 * simple patterns. */ 1040 eng.so[bas] = eng.str - eng.bas; 1041 1042 /* Pop stack and restart */ 1043 --eng.off; 1044 goto fail; 1045 } 1046 } 1047 continue; 1048 1049 case Re_Min: 1050 COMPLEX_REPETITION_SETUP_0(); 1051 if (eng.off == bas) { 1052 /* First iteration */ 1053 COMPLEX_REPETITION_SETUP(); 1054 } 1055 else { 1056 if (eng.eo[eng.off] >= eng.so[eng.off] && 1057 eng.eo[eng.off] > eng.sv[eng.off]) { 1058 /* Update offset of match */ 1059 eng.sv[eng.off] = eng.eo[eng.off]; 1060 1061 /* Update repetition counter */ 1062 ++eng.re[eng.off]; 1063 1064 /* Skip repetition instruction and try again */ 1065 eng.cod += 5; 1066 } 1067 else { 1068 /* Match failed! */ 1069 if (eng.re[eng.off] < i) { 1070 /* Do it here, so that the fail label does 1071 * not need to do too expensive work for 1072 * simple patterns. */ 1073 eng.so[bas] = eng.str - eng.bas; 1074 1075 /* Didn't match required number of times */ 1076 --eng.off; 1077 goto fail; 1078 } 1079 else { 1080 /* Matched minimum number of times */ 1081 eng.eo[bas] = eng.sv[eng.off]; 1082 eng.str = eng.bas + eng.eo[bas]; 1083 k = eng.cod[3] | (eng.cod[4] << 8); 1084 1085 /* Update code and pop stack */ 1086 eng.cod += k; 1087 --eng.off; 1088 } 1089 } 1090 } 1091 continue; 1092 1093 case Re_Max: 1094 COMPLEX_REPETITION_SETUP_0(); 1095 if (eng.off == bas) { 1096 /* First iteration */ 1097 COMPLEX_REPETITION_SETUP(); 1098 } 1099 else { 1100 if (eng.eo[eng.off] >= eng.so[eng.off] && 1101 eng.eo[eng.off] > eng.sv[eng.off]) { 1102 /* Update offset of match */ 1103 eng.sv[eng.off] = eng.eo[eng.off]; 1104 1105 /* Update repetition counter */ 1106 if (++eng.re[eng.off] == i) { 1107 /* Matched the maximum times */ 1108 eng.so[bas] = eng.ss[eng.off]; 1109 eng.eo[bas] = eng.sv[eng.off]; 1110 eng.str = eng.bas + eng.eo[bas]; 1111 1112 k = eng.cod[3] | (eng.cod[4] << 8); 1113 1114 /* Update code and pop stack */ 1115 eng.cod += k; 1116 --eng.off; 1117 continue; 1118 } 1119 1120 /* Skip repetition instruction and try again */ 1121 eng.cod += 5; 1122 } 1123 else { 1124 /* No matches, but zero matches are ok */ 1125 k = eng.cod[3] | (eng.cod[4] << 8); 1126 if (eng.sv[eng.off] >= eng.so[eng.off]) { 1127 /* Something matched earlier, update */ 1128 eng.so[bas] = eng.ss[eng.off]; 1129 eng.eo[bas] = eng.sv[eng.off]; 1130 eng.str = eng.bas + eng.eo[bas]; 1131 } 1132 else { 1133 /* Empty match */ 1134 if (eng.eo[bas] < eng.so[bas]) 1135 eng.eo[bas] = eng.so[bas]; 1136 1137 /* Try next pattern at correct offset */ 1138 eng.str = eng.bas + eng.eo[bas]; 1139 } 1140 1141 /* Pop stack and update code */ 1142 --eng.off; 1143 eng.cod += k; 1144 } 1145 } 1146 continue; 1147 1148 case Re_MinMax: 1149 bas = eng.cod[3]; 1150 if (eng.off == bas) { 1151 /* First iteration */ 1152 COMPLEX_REPETITION_SETUP(); 1153 } 1154 else { 1155 if (eng.eo[eng.off] >= eng.so[eng.off] && 1156 eng.eo[eng.off] > eng.sv[eng.off]) { 1157 /* Update offset of match */ 1158 eng.sv[eng.off] = eng.eo[eng.off]; 1159 1160 /* Update repetition counter */ 1161 if (++eng.re[eng.off] == eng.cod[2]) { 1162 /* Matched the maximum times */ 1163 eng.so[bas] = eng.ss[eng.off]; 1164 eng.eo[bas] = eng.sv[eng.off]; 1165 eng.str = eng.bas + eng.eo[bas]; 1166 k = eng.cod[4] | (eng.cod[5] << 8); 1167 1168 /* Update code and pop stack */ 1169 eng.cod += k; 1170 --eng.off; 1171 continue; 1172 } 1173 1174 /* Skip repetition instruction and try again */ 1175 eng.cod += 6; 1176 } 1177 else { 1178 /* Match failed! */ 1179 if (eng.re[eng.off] < eng.cod[1]) { 1180 /* Do it here, so that the fail label does 1181 * not need to do too expensive work for 1182 * simple patterns. */ 1183 eng.so[bas] = eng.str - eng.bas; 1184 1185 /* Didn't match required number of times */ 1186 --eng.off; 1187 goto fail; 1188 } 1189 else { 1190 /* Matched minimum number of times */ 1191 eng.so[bas] = eng.ss[eng.off]; 1192 eng.eo[bas] = eng.sv[eng.off]; 1193 eng.str = eng.bas + eng.eo[bas]; 1194 k = eng.cod[4] | (eng.cod[5] << 8); 1195 1196 /* Update code and pop stack */ 1197 eng.cod += k; 1198 --eng.off; 1199 } 1200 } 1201 } 1202 continue; 1203 1204 1205 /**************************************************** 1206 * Special repetition handling * 1207 ****************************************************/ 1208 case Re_AnyAnyTimes: 1209 /* code(1) + bas(1) + gbas(1) + jump(2) */ 1210 bas = eng.cod[1]; 1211 if (eng.off == bas) { 1212 /* First iteration */ 1213 if (++eng.off >= MAX_DEPTH) 1214 return (RE_ASSERT); 1215 1216 /* Return here for recovery if match fail */ 1217 eng.rcod[eng.off] = eng.cod; 1218 1219 /* If fail, test the next pattern at the same point */ 1220 eng.rstr[eng.off] = eng.str; 1221 1222 /* Setup match offsets */ 1223 eng.so[eng.off] = eng.str - eng.bas; 1224 eng.eo[eng.off] = eng.so[eng.off] - 1; 1225 1226 if (newline) 1227 /* Use the repetition counter to store start of 1228 * skipped string, to later check if skipping a 1229 * newline. */ 1230 eng.re[eng.off] = eng.so[eng.off]; 1231 1232 /* Save start of possible previous matches */ 1233 eng.ss[eng.off] = eng.so[bas]; 1234 1235 /* Skip repetition instruction */ 1236 eng.cod += 5; 1237 } 1238 else { 1239 /* -1 as an unsigned char */ 1240 if (eng.cod[2] != 0xff) 1241 eng.goff = eng.cod[2]; 1242 else 1243 eng.goff = -1; 1244 1245 if (newline) { 1246 ptr = eng.bas + eng.re[eng.off]; 1247 str = eng.bas + eng.so[eng.off]; 1248 for (; ptr < str; ptr++) 1249 if (*ptr == '\n') { 1250 eng.cod = eng.rcod[0]; 1251 eng.so[0] = ptr - eng.bas + 1; 1252 eng.eo[0] = eng.so[0] - 1; 1253 eng.rstr[0] = eng.str = ptr + 1; 1254 eng.off = 0; 1255 goto reset; 1256 } 1257 /* If looping, don't do too many noops */ 1258 eng.re[eng.off] = ptr - eng.bas; 1259 } 1260 1261 if (eng.eo[eng.off] >= eng.so[eng.off]) { 1262 /* Note that this is only true if all possibly 1263 * nested special repetitions also matched. */ 1264 1265 if (eng.goff >= 0) { 1266 if (eng.cod[5] == Re_Update) 1267 eng.gso[eng.goff] = eng.eo[bas] + 1268 (eng.so[bas] > eng.eo[bas]); 1269 else if (eng.geo[eng.goff] < eng.so[eng.off]) 1270 eng.geo[eng.goff] = eng.so[eng.off]; 1271 } 1272 1273 /* Jump relative offset */ 1274 len = eng.cod[3] | (eng.cod[4] << 8); 1275 1276 /* Restore offset from where started trying */ 1277 eng.so[bas] = eng.ss[eng.off]; 1278 eng.eo[bas] = eng.eo[eng.off]; 1279 1280 /* Pop stack and skip code */ 1281 --eng.off; 1282 eng.cod += len; 1283 } 1284 else { 1285 /* Only give up if the entire string was scanned */ 1286 if (eng.str < eng.end) { 1287 /* Update restart point for next pattern */ 1288 eng.str = ++eng.rstr[eng.off]; 1289 1290 /* Reset start of nested match */ 1291 eng.so[eng.off] = eng.str - eng.bas; 1292 1293 /* Skip repetition instruction */ 1294 eng.cod += 5; 1295 } 1296 else { 1297 /* Entire string scanned and failed */ 1298 1299 /* Jump relative offset */ 1300 len = eng.cod[3] | (eng.cod[4] << 8); 1301 1302 /* Restore offset from where started trying */ 1303 eng.so[bas] = eng.ss[eng.off]; 1304 eng.eo[bas] = eng.ss[eng.off] - 1; 1305 1306 /* Pop stack and skip code */ 1307 --eng.off; 1308 eng.cod += len; 1309 } 1310 } 1311 } 1312 continue; 1313 1314 /* This is significantly different than matching <re>.*<re> 1315 * because it may need to restart several times since it is 1316 * possible to find too many false positives, for example: 1317 * a.*b => once one "a" is found, scan all 1318 * the remaining string searching for a "b" 1319 * a.?b => the string may have too many "a"s, but the 1320 * first occurrences of "a" may not be followed 1321 * by any-character and a "b" or a single "b". 1322 */ 1323 case Re_AnyMaybe: 1324 bas = eng.cod[1]; 1325 if (eng.off == bas) { 1326 /* First iteration */ 1327 if (++eng.off >= MAX_DEPTH) 1328 return (RE_ASSERT); 1329 1330 /* Return here for recovery if match fail */ 1331 eng.rcod[eng.off] = eng.cod; 1332 1333 /* First try without eating a byte */ 1334 eng.rstr[eng.off] = eng.str; 1335 1336 /* Remember this is the first try if match fail */ 1337 eng.re[eng.off] = 0; 1338 1339 /* Setup match offsets */ 1340 eng.so[eng.off] = eng.str - eng.bas; 1341 eng.eo[eng.off] = eng.so[eng.off] - 1; 1342 1343 /* Save start of possible previous matches */ 1344 eng.ss[eng.off] = eng.so[bas]; 1345 1346 /* Skip repetition instruction */ 1347 eng.cod += 5; 1348 } 1349 else { 1350 /* -1 as an unsigned char */ 1351 if (eng.cod[2] != 0xff) 1352 eng.goff = eng.cod[2]; 1353 else 1354 eng.goff = -1; 1355 1356 if (eng.eo[eng.off] >= eng.so[eng.off]) { 1357 /* Something matched */ 1358 1359 if (eng.goff >= 0) { 1360 if (eng.cod[5] == Re_Update) 1361 eng.gso[eng.goff] = eng.eo[bas] + 1362 (eng.so[bas] > eng.eo[bas]); 1363 else if (eng.geo[eng.goff] < eng.so[eng.off]) 1364 eng.geo[eng.goff] = eng.so[eng.off]; 1365 } 1366 1367 /* Jump relative offset */ 1368 len = eng.cod[3] | (eng.cod[4] << 8); 1369 1370 /* Update offset of match */ 1371 eng.eo[bas] = eng.eo[eng.off]; 1372 1373 /* Pop stack and skip code */ 1374 --eng.off; 1375 eng.cod += len; 1376 } 1377 else if (eng.re[eng.off] == 0 && 1378 (!newline || eng.rstr[eng.off][1] != '\n')) { 1379 /* Try this time skiping a byte */ 1380 ++eng.re[eng.off]; 1381 1382 /* Reset string, skip code and go try one time more */ 1383 eng.str = ++eng.rstr[eng.off]; 1384 eng.cod += 5; 1385 } 1386 else { 1387 /* Failed to match */ 1388 1389 /* Update offsets */ 1390 eng.eo[bas] = eng.ss[eng.off]; 1391 eng.so[bas] = eng.eo[bas] + 1; 1392 1393 eng.str = eng.rstr[eng.off] + (eng.re[eng.off] == 0); 1394 1395 /* Pop stack and return to toplevel code */ 1396 --eng.off; 1397 if (eng.str >= eng.end) 1398 goto wont; 1399 eng.cod = eng.rcod[bas]; 1400 } 1401 } 1402 continue; 1403 1404 /* .+ almost identical to .* but requires eating at least one byte */ 1405 case Re_AnyAtLeast: 1406 bas = eng.cod[1]; 1407 if (eng.off == bas) { 1408 /* First iteration */ 1409 if (++eng.off >= MAX_DEPTH) 1410 return (RE_ASSERT); 1411 1412 /* Return here for recovery if match fail */ 1413 eng.rcod[eng.off] = eng.cod; 1414 1415 /* Skip one byte for the restart string */ 1416 if (newline && eng.str[0] == '\n') { 1417 /* Cannot skip newline */ 1418 eng.cod = eng.rcod[0]; 1419 eng.rstr[0] = ++eng.str; 1420 eng.so[0] = eng.str - eng.bas; 1421 eng.eo[0] = eng.so[0] - 1; 1422 eng.off = 0; 1423 goto reset; 1424 } 1425 eng.rstr[eng.off] = ++eng.str; 1426 1427 /* Setup match offsets */ 1428 eng.so[eng.off] = eng.str - eng.bas; 1429 eng.eo[eng.off] = eng.so[eng.off] - 1; 1430 1431 if (newline) 1432 /* Use the repetition counter to store start of 1433 * skipped string, to later check if skipping a 1434 * newline. */ 1435 eng.re[eng.off] = eng.so[eng.off]; 1436 1437 /* Save start of possible previous matches */ 1438 eng.ss[eng.off] = eng.so[bas]; 1439 1440 /* Skip repetition instruction */ 1441 eng.cod += 5; 1442 } 1443 else { 1444 /* -1 as an unsigned char */ 1445 if (eng.cod[2] != 0xff) 1446 eng.goff = eng.cod[2]; 1447 else 1448 eng.goff = -1; 1449 1450 if (newline) { 1451 ptr = eng.bas + eng.re[eng.off]; 1452 str = eng.bas + eng.so[eng.off]; 1453 for (; ptr < str; ptr++) 1454 if (*ptr == '\n') { 1455 eng.cod = eng.rcod[0]; 1456 eng.so[0] = ptr - eng.bas + 1; 1457 eng.eo[0] = eng.so[0] - 1; 1458 eng.rstr[0] = eng.str = ptr + 1; 1459 eng.off = 0; 1460 goto reset; 1461 } 1462 /* If looping, don't do too many noops */ 1463 eng.re[eng.off] = ptr - eng.bas; 1464 } 1465 1466 if (eng.eo[eng.off] >= eng.so[eng.off]) { 1467 /* Note that this is only true if all possibly 1468 * nested special repetitions also matched. */ 1469 1470 if (eng.goff >= 0) { 1471 if (eng.cod[5] == Re_Update) 1472 eng.gso[eng.goff] = eng.eo[bas] + 1473 (eng.so[bas] > eng.eo[bas]); 1474 else if (eng.geo[eng.goff] < eng.so[eng.off]) 1475 eng.geo[eng.goff] = eng.so[eng.off]; 1476 } 1477 1478 /* Jump relative offset */ 1479 len = eng.cod[3] | (eng.cod[4] << 8); 1480 1481 /* Restore offset from where started trying */ 1482 eng.so[bas] = eng.ss[eng.off]; 1483 eng.eo[bas] = eng.eo[eng.off]; 1484 1485 /* Pop stack and skip code */ 1486 --eng.off; 1487 eng.cod += len; 1488 } 1489 else { 1490 /* Only give up if the entire string was scanned */ 1491 if (eng.str < eng.end) { 1492 /* Update restart point for next pattern */ 1493 eng.str = ++eng.rstr[eng.off]; 1494 1495 /* Reset start of nested match */ 1496 eng.so[eng.off] = eng.str - eng.bas; 1497 1498 /* Skip repetition instruction */ 1499 eng.cod += 5; 1500 } 1501 else { 1502 /* Entire string scanned and failed */ 1503 1504 /* Jump relative offset */ 1505 len = eng.cod[3] | (eng.cod[4] << 8); 1506 1507 /* Restore offset from where started trying */ 1508 eng.so[bas] = eng.ss[eng.off]; 1509 eng.eo[bas] = eng.ss[eng.off] - 1; 1510 1511 /* Pop stack and skip code */ 1512 --eng.off; 1513 eng.cod += len; 1514 } 1515 } 1516 } 1517 continue; 1518 1519 1520 /**************************************************** 1521 * Repetition matched! * 1522 ****************************************************/ 1523 case Re_RepJump: 1524 /* eng.cod[1] is toplevel offset of repetition */ 1525 if (eng.off > eng.cod[1]) 1526 /* If still needs to try matches */ 1527 eng.cod -= eng.cod[2]; 1528 else 1529 eng.cod += 3; /* + RepJump + bas + len-size */ 1530 continue; 1531 1532 case Re_RepLongJump: 1533 /* eng.cod[1] is toplevel offset of repetition */ 1534 if (eng.off > eng.cod[1]) 1535 /* If still needs to try matches */ 1536 eng.cod -= eng.cod[2] | (eng.cod[3] << 8); 1537 else 1538 eng.cod += 4; /* + RepLongJump + bas + len-size */ 1539 continue; 1540 1541 /**************************************************** 1542 * Finished * 1543 ****************************************************/ 1544 case Re_DoneIf: 1545 if (eng.eo[eng.off] >= eng.so[eng.off]) { 1546 eng.so[0] = eng.ss[eng.off]; 1547 eng.eo[0] = eng.eo[eng.off]; 1548 goto done; 1549 } 1550 ++eng.cod; 1551 continue; 1552 case Re_MaybeDone: 1553 if (eng.eo[eng.off] >= eng.so[eng.off]) { 1554 eng.so[0] = eng.ss[eng.off]; 1555 eng.eo[0] = eng.eo[eng.off]; 1556 goto done; 1557 } 1558 ++eng.cod; 1559 continue; 1560 case Re_Done: 1561 goto done; 1562 1563 default: 1564 /* Fatal internal error */ 1565 return (RE_ASSERT); 1566 } 1567 1568 1569wont: 1570 /* Surely won't match */ 1571 if (eng.off == 0) { 1572 eng.eo[0] = eng.so[0] - 1; 1573 break; 1574 } 1575 1576 1577fail: 1578 if (eng.off == 0) { 1579 /* If the entire string scanned */ 1580 if (++eng.str > eng.end) { 1581 eng.eo[0] = eng.so[0] - 1; 1582 break; 1583 } 1584 eng.goff = -1; 1585 /* Update start of possible match after restart */ 1586 if (eng.eo[0] >= eng.so[0]) { 1587 /* If first fail */ 1588 eng.str = eng.rstr[0]; 1589 ++eng.rstr[0]; 1590 eng.so[0] = eng.str - eng.bas; 1591 eng.eo[0] = eng.so[eng.off] - 1; 1592 } 1593 else 1594 /* Just trying at next byte */ 1595 ++eng.so[0]; 1596 } 1597 else 1598 /* Remember this match failed */ 1599 eng.eo[eng.off] = eng.so[eng.off] - 1; 1600 1601 /* Restart code */ 1602 eng.cod = eng.rcod[eng.off]; 1603 continue; 1604 1605 1606match: 1607 /* If first match */ 1608 if (eng.eo[eng.off] < eng.so[eng.off]) { 1609 if (eng.off == 0) 1610 eng.rstr[0] = eng.str + 1; 1611 eng.so[eng.off] = eng.eo[eng.off] = eng.str - eng.bas; 1612 } 1613 eng.eo[eng.off] += si; 1614 eng.cod += ci; 1615 eng.str += si; 1616 ci = si = 1; 1617 continue; 1618 1619done: 1620 break; 1621 } 1622 1623 if (nmatch) { 1624 if (flags & RE_STARTEND) 1625 len = pmat[0].rm_so; 1626 else 1627 len = 0; 1628 if (!nosub) { 1629 if (preg->cod[1] != 0xff) 1630 eng.goff = preg->cod[1]; 1631 pmat[0].rm_so = eng.so[0]; 1632 pmat[0].rm_eo = eng.eo[0]; 1633 for (i = 1; i < nmatch; i++) { 1634 if (i - 1 <= eng.goff) { 1635 pmat[i].rm_so = eng.gso[i - 1]; 1636 pmat[i].rm_eo = eng.geo[i - 1]; 1637 } 1638 else { 1639 pmat[i].rm_so = 0; 1640 pmat[i].rm_eo = -1; 1641 } 1642 } 1643 if (len) { 1644 /* Update offsets, since the match was done in a substring */ 1645 j = eng.goff + 2 > nmatch ? nmatch : eng.goff + 2; 1646 for (i = 0; i < j; i++) { 1647 pmat[i].rm_so += len; 1648 pmat[i].rm_eo += len; 1649 } 1650 } 1651 } 1652 else { 1653 /* Already know these values, allow compiling the regex with 1654 * RE_NOSUB to use parenthesis only for grouping, but avoiding 1655 * the runtime overhead of keeping track of the subexpression 1656 * offsets. */ 1657 pmat[0].rm_so = eng.so[0] + len; 1658 pmat[0].rm_eo = eng.eo[0] + len; 1659 } 1660 } 1661 1662 return (eng.so[0] <= eng.eo[0] ? 0 : RE_NOMATCH); 1663} 1664 1665int 1666reerror(int ecode, const re_cod *preg, char *ebuffer, int ebuffer_size) 1667{ 1668 static const char *errors[] = { 1669 "No error", 1670 "Failed to match", /* NOMATCH */ 1671 1672 /* Errors not generated */ 1673 "Invalid regular expression", /* BADPAT */ 1674 "Invalid collating element", /* ECOLLATE */ 1675 "Invalid character class", /* ECTYPE */ 1676 1677 "`\' applied to unescapable character", /* EESCAPE */ 1678 "Invalid backreference number", /* ESUBREG */ 1679 "Brackets `[ ]' not balanced", /* EBRACK */ 1680 "Parentheses `( )' not balanced", /* EPAREN */ 1681 "Braces `{ }' not balanced", /* EBRACE */ 1682 "Invalid repetition count(s) in `{ }'", /* BADBR */ 1683 "Invalid character range in `[ ]'", /* ERANGE */ 1684 "Out of memory", /* ESPACE */ 1685 "`?', `*', or `+' operand invalid", /* BADRPT */ 1686 "Empty (sub)expression", /* EMPTY */ 1687 "Assertion error - you found a bug", /* ASSERT */ 1688 "Invalid argument" /* INVARG */ 1689 }; 1690 const char *str; 1691 1692 if (ecode >= 0 && ecode < sizeof(errors) / sizeof(errors[0])) 1693 str = errors[ecode]; 1694 else 1695 str = "Unknown error"; 1696 1697 return (snprintf(ebuffer, ebuffer_size, "%s", str)); 1698} 1699 1700void 1701refree(re_cod *cod) 1702{ 1703 free(cod->cod); 1704 cod->cod = NULL; 1705} 1706 1707static void 1708reinit(void) 1709{ 1710 int i; 1711 static int first = 1; 1712 1713 if (!first) 1714 return; 1715 first = 0; 1716 1717 re__alnum['_'] = 1; 1718 1719 for (i = '0'; i <= '7'; i++) 1720 re__alnum[i] = re__odigit[i] = re__ddigit[i] = re__xdigit[i] = 1; 1721 1722 for (; i <= '9'; i++) 1723 re__alnum[i] = re__ddigit[i] = re__xdigit[i] = 1; 1724 1725 for (i = 'a'; i <= 'f'; i++) 1726 re__alnum[i] = re__xdigit[i] = 1; 1727 for (; i <= 'z'; i++) 1728 re__alnum[i] = 1; 1729 1730 for (i = 'A'; i <= 'F'; i++) 1731 re__alnum[i] = re__xdigit[i] = 1; 1732 for (; i <= 'Z'; i++) 1733 re__alnum[i] = 1; 1734 1735 for (i = 1; i < 32; i++) 1736 re__control[i] = 1; 1737 re__control[127] = 1; 1738 /* Don't show tabs as control characters */ 1739 re__control['\t'] = 0; 1740} 1741 1742static int 1743rec_check(re_inf *inf, int count) 1744{ 1745 if (inf->len + count >= inf->spc) { 1746 int spc; 1747 unsigned char *cod; 1748 1749 if ((spc = (count % 64)) != 0) 1750 spc = 64 - spc; 1751 spc += count + inf->spc; 1752 if ((cod = realloc(inf->cod, spc)) == NULL) 1753 return (inf->ecode = RE_ESPACE); 1754 inf->cod = cod; 1755 inf->spc = spc; 1756 } 1757 1758 return (inf->ecode); 1759} 1760 1761static int 1762rec_code(re_inf *inf, ReCode code) 1763{ 1764 if (rec_check(inf, 1) == 0) 1765 inf->cod[inf->len++] = code; 1766 1767 return (inf->ecode); 1768} 1769 1770static int 1771rec_byte(re_inf *inf, int value) 1772{ 1773 if (rec_check(inf, 1) == 0) 1774 inf->cod[inf->len++] = value; 1775 1776 return (inf->ecode); 1777} 1778 1779static int 1780rec_code_byte(re_inf *inf, ReCode code, int value) 1781{ 1782 if (rec_check(inf, 2) == 0) { 1783 inf->cod[inf->len++] = code; 1784 inf->cod[inf->len++] = value; 1785 } 1786 1787 return (inf->ecode); 1788} 1789 1790static int 1791rec_length(re_inf *inf, int length) 1792{ 1793 int lo, hi, two; 1794 1795 if (length >= 16384) 1796 return (inf->ecode = RE_ESPACE); 1797 1798 lo = length & 0xff; 1799 hi = length & 0xff00; 1800 two = ((length > 0x7f) != 0) + 1; 1801 if (two == 2) { 1802 hi <<= 1; 1803 hi |= (lo & 0x80) != 0; 1804 lo |= 0x80; 1805 } 1806 1807 if (rec_check(inf, two) == 0) { 1808 inf->cod[inf->len++] = lo; 1809 if (two == 2) 1810 inf->cod[inf->len++] = hi >> 8; 1811 } 1812 1813 return (inf->ecode); 1814} 1815 1816static int 1817rec_byte_byte(re_inf *inf, int value0, int value1) 1818{ 1819 if (rec_check(inf, 2) == 0) { 1820 inf->cod[inf->len++] = value0; 1821 inf->cod[inf->len++] = value1; 1822 } 1823 1824 return (inf->ecode); 1825} 1826 1827static int 1828rec_code_byte_byte(re_inf *inf, ReCode code, int value0, int value1) 1829{ 1830 if (rec_check(inf, 3) == 0) { 1831 inf->cod[inf->len++] = code; 1832 inf->cod[inf->len++] = value0; 1833 inf->cod[inf->len++] = value1; 1834 } 1835 1836 return (inf->ecode); 1837} 1838 1839static int 1840rec_build_alt(re_inf *inf, rec_alt *alt) 1841{ 1842 int offset, value, bas = inf->bas + 1; 1843 1844 if (alt) { 1845 if (alt->next) { 1846 if (rec_inc_spc(inf)) 1847 return (inf->ecode); 1848 1849 /* A real a list of alternatives */ 1850 rec_code(inf, Re_Alt); 1851 1852 offset = inf->len; /* Remember current offset */ 1853 rec_byte_byte(inf, 0, 0); /* Reserve two bytes for retry address */ 1854 while (alt && inf->ecode == 0) { 1855 if (rec_build_pat(inf, alt->pat)) 1856 break; 1857 alt = alt->next; 1858 if (alt && inf->ecode == 0) { 1859 /* Handle (hyper)complex repetitions */ 1860 if (inf->bas != bas) { 1861 /* Duplicate patterns up to end of expression */ 1862 rec_build_pat(inf, inf->apat); 1863 /* Restore engine state for next alternative(s) */ 1864 rec_alt_spc(inf, bas - 1); 1865 } 1866 1867 /* If the jump would be so long */ 1868 if ((value = inf->len - offset) >= 16384) { 1869 inf->ecode = RE_ESPACE; 1870 break; 1871 } 1872 inf->cod[offset] = value & 0xff; 1873 inf->cod[offset + 1] = (value & 0xff00) >> 8; 1874 1875 rec_code(inf, Re_AltNext); 1876 offset = inf->len; 1877 rec_byte_byte(inf, 0, 0); 1878 } 1879 } 1880 if (inf->ecode == 0) { 1881 /* Handle (hyper)complex repetitions */ 1882 if (inf->bas != bas) { 1883 /* Duplicate patterns up to end of expression */ 1884 rec_build_pat(inf, inf->apat); 1885 /* Restore engine state for next alternative(s) */ 1886 rec_alt_spc(inf, bas - 1); 1887 } 1888 1889 /* If the jump would be so long */ 1890 if ((value = inf->len - offset) >= 16384) 1891 return (inf->ecode = RE_ESPACE); 1892 inf->cod[offset] = value & 0xff; 1893 inf->cod[offset + 1] = (value & 0xff00) >> 8; 1894 /* Last jump is here */ 1895 rec_code(inf, Re_AltDone); 1896 } 1897 rec_dec_spc(inf); 1898 } 1899 else 1900 /* Single alternative */ 1901 rec_build_pat(inf, alt->pat); 1902 } 1903 1904 return (inf->ecode); 1905} 1906 1907static int 1908rec_build_pat(re_inf *inf, rec_pat *pat) 1909{ 1910 rec_pat *apat; 1911 int length, offset = 0, distance, jump = 0, bas = 0; 1912 1913 while (pat && inf->ecode == 0) { 1914 if (pat->rep) { 1915 bas = inf->bas; 1916 if (pat->type == Rep_Group && !inf->par && rec_code(inf, Re_Open)) 1917 return (inf->ecode); 1918 if (rec_inc_spc(inf)) 1919 return (inf->ecode); 1920 offset = inf->len; 1921 if (rec_build_rep(inf, pat->rep)) 1922 break; 1923 /* Reserve space to jump after repetition done */ 1924 jump = inf->len; 1925 rec_byte_byte(inf, 0, 0); 1926 } 1927 switch (pat->type) { 1928 case Rep_AnyAnyTimes: 1929 case Rep_AnyMaybe: 1930 case Rep_AnyAtLeast: 1931 if (rec_add_spc(inf, pat->type == Rep_AnyMaybe)) 1932 return (inf->ecode); 1933 if (rec_code(inf, (ReCode)pat->type) == 0 && 1934 rec_byte(inf, inf->bas - 1) == 0 && 1935 rec_byte(inf, inf->ref - 1) == 0) 1936 rec_off_spc(inf); 1937 break; 1938 case Rep_Literal: 1939 case Rep_LiteralNot: 1940 case Rep_SearchLiteral: 1941 rec_code_byte(inf, (ReCode)pat->type, pat->data.chr); 1942 break; 1943 case Rep_CaseLiteral: 1944 case Rep_CaseLiteralNot: 1945 case Rep_SearchCaseLiteral: 1946 rec_code_byte_byte(inf, (ReCode)pat->type, 1947 pat->data.cse.lower, pat->data.cse.upper); 1948 break; 1949 case Rep_Range: 1950 case Rep_RangeNot: 1951 if (rec_code(inf, (ReCode)pat->type) == 0) 1952 rec_build_rng(inf, pat->data.rng); 1953 break; 1954 case Rep_String: 1955 case Rep_SearchString: 1956 case Rep_CaseString: 1957 case Rep_SearchCaseString: 1958 rec_code(inf, (ReCode)pat->type); 1959 length = strlen((char*)pat->data.str); 1960 if (rec_length(inf, length) == 0 && rec_check(inf, length) == 0) { 1961 memcpy(inf->cod + inf->len, pat->data.str, length); 1962 inf->len += length; 1963 } 1964 break; 1965 case Rep_Any: 1966 case Rep_AnyEatAnyTimes: 1967 case Rep_AnyEatMaybe: 1968 case Rep_AnyEatAtLeast: 1969 case Rep_Odigit: 1970 case Rep_OdigitNot: 1971 case Rep_Digit: 1972 case Rep_DigitNot: 1973 case Rep_Xdigit: 1974 case Rep_XdigitNot: 1975 case Rep_Space: 1976 case Rep_SpaceNot: 1977 case Rep_Tab: 1978 case Rep_Newline: 1979 case Rep_Lower: 1980 case Rep_Upper: 1981 case Rep_Alnum: 1982 case Rep_AlnumNot: 1983 case Rep_Control: 1984 case Rep_ControlNot: 1985 case Rep_Bol: 1986 case Rep_Eol: 1987 case Rep_Bow: 1988 case Rep_Eow: 1989 rec_code(inf, (ReCode)pat->type); 1990 break; 1991 case Rep_Backref: 1992 rec_code_byte(inf, Re_Backref, pat->data.chr); 1993 break; 1994 case Rep_Group: 1995 if (pat->rep == NULL && !inf->par && rec_code(inf, Re_Open)) 1996 break; 1997 apat = inf->apat; 1998 inf->apat = pat->next; 1999 rec_build_grp(inf, pat->data.grp); 2000 inf->apat = apat; 2001 break; 2002 case Rep_StringList: 2003 rec_build_stl(inf, pat->data.stl); 2004 break; 2005 } 2006 if (pat->rep) { 2007#if 0 2008 if (rec_dec_spc(inf)) 2009 return (inf->ecode); 2010#else 2011 if (rec_rep_spc(inf, bas)) 2012 return (inf->ecode); 2013#endif 2014 distance = inf->len - offset; 2015 if (distance > 255) { 2016 if (rec_code(inf, Re_RepLongJump) || 2017 rec_byte(inf, inf->bas) || 2018 rec_byte(inf, distance & 0xff) || 2019 rec_byte(inf, (distance & 0xff00) >> 8)) 2020 break; 2021 } 2022 else if (rec_code(inf, Re_RepJump) || 2023 rec_byte(inf, inf->bas) || 2024 rec_byte(inf, distance)) 2025 break; 2026 distance = inf->len - offset; 2027 inf->cod[jump] = distance & 0xff; 2028 inf->cod[jump + 1] = (distance & 0xff00) >> 8; 2029 } 2030 pat = pat->next; 2031 } 2032 2033 return (inf->ecode); 2034} 2035 2036static int 2037rec_build_rng(re_inf *inf, rec_rng *rng) 2038{ 2039 if (rec_check(inf, sizeof(rng->range)) == 0) { 2040 memcpy(inf->cod + inf->len, rng->range, sizeof(rng->range)); 2041 inf->len += sizeof(rng->range); 2042 } 2043 2044 return (inf->ecode); 2045} 2046 2047static int 2048rec_build_grp(re_inf *inf, rec_grp *grp) 2049{ 2050 int par = inf->par; 2051 2052 if (!(inf->flags & RE_NOSUB)) { 2053 ++inf->par; 2054 if (par == 0) 2055 ++inf->ref; 2056 if (rec_build_alt(inf, grp->alt) == 0) { 2057 if (par == 0) { 2058 if (grp->comp) 2059 rec_code_byte(inf, Re_Update, inf->ref - 1); 2060 else 2061 rec_code(inf, Re_Close); 2062 } 2063 } 2064 --inf->par; 2065 } 2066 else 2067 rec_build_alt(inf, grp->alt); 2068 2069 return (inf->ecode); 2070} 2071 2072static int 2073rec_build_stl(re_inf *inf, rec_stl *stl) 2074{ 2075 int i, len, rlen; 2076 ReCode code; 2077 2078 /* Calculate jump distance information */ 2079 rlen = stl->tlen + stl->nstrs + 4; 2080 /* + code + nstrs + place-offset + data-length */ 2081 2082 if (stl->nstrs >= LARGE_STL_COUNT) { 2083 rlen += 511; /* Don't write number of strings */ 2084 code = stl->type == Rep_StringList ? 2085 Re_LargeStringList : Re_LargeCaseStringList; 2086 } 2087 else 2088 code = (ReCode)stl->type; 2089 2090 if (rlen >= 16386) 2091 return (inf->ecode = RE_ESPACE); 2092 if (rec_check(inf, rlen) || 2093 rec_code(inf, code)) 2094 return (inf->ecode); 2095 2096 /* Space is allocated, just write the data */ 2097 if (stl->nstrs < LARGE_STL_COUNT) 2098 inf->cod[inf->len++] = stl->nstrs; 2099 2100 inf->cod[inf->len++] = rlen & 0xff; 2101 inf->cod[inf->len++] = (rlen & 0xff00) >> 8; 2102 2103 if (stl->nstrs < LARGE_STL_COUNT) { 2104 for (i = 0; i < stl->nstrs; i++) 2105 inf->cod[inf->len++] = stl->lens[i]; 2106 for (i = 0; i < stl->nstrs; i++) { 2107 len = stl->lens[i]; 2108 if (len > 2) { 2109 memcpy(inf->cod + inf->len, stl->strs[i], len); 2110 inf->len += len; 2111 } 2112 else { 2113 if (len == 1) 2114 inf->cod[inf->len++] = (long)stl->strs[i]; 2115 else { 2116 inf->cod[inf->len++] = (long)stl->strs[i] & 0xff; 2117 inf->cod[inf->len++] = ((long)stl->strs[i] & 0xff00) >> 8; 2118 } 2119 } 2120 } 2121 } 2122 else { 2123 /* The string length goes before the string itself */ 2124 int j, chl, chu; 2125 2126 /* Fill everything with an invalid jump address */ 2127 memset(inf->cod + inf->len, 0xff, 512); 2128 for (i = len = 0, j = -1; i < stl->nstrs; i++) { 2129 chl = stl->lens[i] > 2 ? stl->strs[i][0] : (long)stl->strs[i] & 0xff; 2130 if (chl != j) { 2131 inf->cod[inf->len + (chl << 1)] = len & 0xff; 2132 inf->cod[inf->len + (chl << 1) + 1] = (len & 0xff00) >> 8; 2133 if (code == Re_LargeCaseStringList) { 2134 chu = stl->lens[i] > 2 ? 2135 stl->strs[i][1] : ((long)(stl->strs[i]) & 0xff00) >> 8; 2136 inf->cod[inf->len + (chu << 1)] = len & 0xff; 2137 inf->cod[inf->len + (chu << 1) + 1] = (len & 0xff00) >> 8; 2138 } 2139 j = chl; 2140 } 2141 len += stl->lens[i] + 1; 2142 } 2143 inf->len += 512; 2144 2145 for (i = 0; i < stl->nstrs; i++) { 2146 len = stl->lens[i]; 2147 inf->cod[inf->len++] = len; 2148 if (len > 2) { 2149 memcpy(inf->cod + inf->len, stl->strs[i], len); 2150 inf->len += len; 2151 } 2152 else { 2153 if (len == 1) 2154 inf->cod[inf->len++] = (long)stl->strs[i]; 2155 else { 2156 inf->cod[inf->len++] = (long)stl->strs[i] & 0xff; 2157 inf->cod[inf->len++] = ((long)stl->strs[i] & 0xff00) >> 8; 2158 } 2159 } 2160 } 2161 } 2162 2163 return (inf->ecode); 2164} 2165 2166static int 2167rec_build_rep(re_inf *inf, rec_rep *rep) 2168{ 2169 if (rep) { 2170 switch (rep->type) { 2171 case Rer_AnyTimes: 2172 case Rer_AtLeast: 2173 case Rer_Maybe: 2174 rec_code(inf, (ReCode)rep->type); 2175 break; 2176 case Rer_Exact: 2177 if (rec_code(inf, Re_Exact) == 0) 2178 rec_byte(inf, rep->mine); 2179 break; 2180 case Rer_Min: 2181 if (rec_code(inf, Re_Min) == 0) 2182 rec_byte(inf, rep->mine); 2183 break; 2184 case Rer_Max: 2185 if (rec_code(inf, Re_Max) == 0) 2186 rec_byte(inf, rep->maxc); 2187 break; 2188 case Rer_MinMax: 2189 if (rec_code(inf, Re_MinMax) == 0 && 2190 rec_byte(inf, rep->mine) == 0) 2191 rec_byte(inf, rep->maxc); 2192 break; 2193 } 2194 /* It is incremented in rec_build_pat */ 2195 rec_byte(inf, inf->bas - 1); 2196 } 2197 2198 return (inf->ecode); 2199} 2200 2201static int 2202rec_inc_spc(re_inf *inf) 2203{ 2204 if (++inf->bas >= MAX_DEPTH) 2205 return (inf->ecode = RE_ESPACE); 2206 2207 return (inf->ecode); 2208} 2209 2210static int 2211rec_dec_spc(re_inf *inf) 2212{ 2213 if (--inf->bas < 0) 2214 return (inf->ecode = RE_ASSERT); 2215 2216 return (inf->ecode); 2217} 2218 2219static int 2220rec_add_spc(re_inf *inf, int maybe) 2221{ 2222 if (++inf->bas >= MAX_DEPTH) 2223 return (inf->ecode = RE_ESPACE); 2224 inf->sp[inf->bas] = maybe + 1; 2225 2226 return (inf->ecode); 2227} 2228 2229/* Could be joined with rec_rep_spc, code almost identical */ 2230static int 2231rec_alt_spc(re_inf *inf, int top) 2232{ 2233 int distance, i, bas = inf->bas; 2234 2235 while ((inf->bas > top) && inf->sp[inf->bas]) { 2236 /* Jump to this repetition for cleanup */ 2237 distance = inf->len - inf->sr[inf->bas]; 2238 2239 /* This will generate a jump to a jump decision opcode */ 2240 inf->sj[inf->bas] = inf->len; 2241 2242 if (distance > 255) { 2243 if (rec_code(inf, Re_RepLongJump) || 2244 rec_byte(inf, inf->bas - 1) || 2245 rec_byte(inf, distance & 0xff) || 2246 rec_byte(inf, (distance & 0xff00) >> 8)) 2247 break; 2248 } 2249 else if (rec_code(inf, Re_RepJump) || 2250 rec_byte(inf, inf->bas - 1) || 2251 rec_byte(inf, distance)) 2252 break; 2253 2254 /* Top of stack value before repetition, or end condition value */ 2255 --inf->bas; 2256 } 2257 2258 i = inf->bas + 1; 2259 2260 if (inf->ecode == 0 && i <= bas && inf->sp[i]) { 2261 /* Only the repetition at the bottom jump to code after testing 2262 * all possibilities */ 2263 distance = inf->len - inf->sr[i]; 2264 inf->cod[inf->sr[i] + 3] = distance & 0xff; 2265 inf->cod[inf->sr[i] + 4] = (distance & 0xff00) >> 8; 2266 2267 /* The bottom jump is here */ 2268 if (rec_code(inf, inf->sp[i] == 1 ? Re_DoneIf : Re_MaybeDone)) 2269 return (inf->ecode); 2270 2271 /* Generate jumps to the previous special repetition */ 2272 for (++i; i <= bas; i++) { 2273 if (inf->sp[i]) { 2274 distance = inf->sj[i] - inf->sr[i]; 2275 inf->cod[inf->sr[i] + 3] = distance & 0xff; 2276 inf->cod[inf->sr[i] + 4] = (distance & 0xff00) >> 8; 2277 } 2278 } 2279 } 2280 2281 return (inf->ecode); 2282} 2283 2284static int 2285rec_rep_spc(re_inf *inf, int top) 2286{ 2287 int distance, i, bas = inf->bas; 2288 2289 while (inf->bas > top) { 2290 if (inf->sp[inf->bas]) { 2291 /* Jump to this repetition for cleanup */ 2292 distance = inf->len - inf->sr[inf->bas]; 2293 2294 /* This will generate a jump to a jump decision opcode */ 2295 inf->sj[inf->bas] = inf->len; 2296 2297 if (distance > 255) { 2298 if (rec_code(inf, Re_RepLongJump) || 2299 rec_byte(inf, inf->bas - 1) || 2300 rec_byte(inf, distance & 0xff) || 2301 rec_byte(inf, (distance & 0xff00) >> 8)) 2302 break; 2303 } 2304 else if (rec_code(inf, Re_RepJump) || 2305 rec_byte(inf, inf->bas - 1) || 2306 rec_byte(inf, distance)) 2307 break; 2308 } 2309 2310 /* Top of stack value before repetition, or end condition value */ 2311 --inf->bas; 2312 } 2313 2314 /* Find first special repetition offset. XXX This should be a noop */ 2315 for (i = 0; i < bas; i++) 2316 if (inf->sp[i]) 2317 break; 2318 2319 if (inf->ecode == 0 && i <= bas && inf->sp[i]) { 2320 /* Only the repetition at the bottom jump to code after testing 2321 * all possibilities */ 2322 distance = inf->len - inf->sr[i]; 2323 inf->cod[inf->sr[i] + 3] = distance & 0xff; 2324 inf->cod[inf->sr[i] + 4] = (distance & 0xff00) >> 8; 2325 2326 /* Generate jumps to the previous special repetition */ 2327 for (++i; i <= bas; i++) { 2328 if (inf->sp[i]) { 2329 distance = inf->sj[i] - inf->sr[i]; 2330 inf->cod[inf->sr[i] + 3] = distance & 0xff; 2331 inf->cod[inf->sr[i] + 4] = (distance & 0xff00) >> 8; 2332 } 2333 } 2334 } 2335 2336 return (inf->ecode); 2337} 2338 2339static int 2340rec_off_spc(re_inf *inf) 2341{ 2342 /* The jump address before the three bytes instruction */ 2343 inf->sr[inf->bas] = inf->len - 3; 2344 /* Don't know yet where to go after done with the special 2345 * repetition, just reserve two bytes for the jump address. */ 2346 return (rec_byte_byte(inf, 0, 0)); 2347} 2348 2349#ifdef DEBUG 2350static void 2351redump(re_cod *code) 2352{ 2353 int i, j, k; 2354 unsigned char *cod = code->cod, *stl; 2355 2356 if (cod[0] & RE_NOSUB) 2357 printf("Nosub\n"); 2358 if (cod[0] & RE_NEWLINE) 2359 printf("Newline\n"); 2360 ++cod; 2361 if (cod[0] != 0xff) 2362 printf("%d backrefs\n", cod[0] + 1); 2363 ++cod; 2364 for (;;) { 2365 switch (*cod++) { 2366 case Re_Open: 2367 printf("Open"); 2368 break; 2369 case Re_Close: 2370 printf("Close"); 2371 break; 2372 case Re_Update: 2373 printf("Update (%d)", (int)*cod++); 2374 break; 2375 case Re_Alt: 2376 printf("Alt"); 2377 i = cod[0] | cod[1]; 2378 cod += 2; 2379 printf(" %d", i); 2380 break; 2381 case Re_AltNext: 2382 printf("Alt-next"); 2383 i = cod[0] | cod[1]; 2384 cod += 2; 2385 printf(" %d", i); 2386 break; 2387 case Re_AltDone: 2388 printf("Alt-done"); 2389 break; 2390 case Re_AnyTimes: 2391 printf("-> Anytimes %d", (int)*cod++); 2392 i = cod[0] | (cod[1] << 8); 2393 cod += 2; 2394 printf(" /%d", i); 2395 break; 2396 case Re_AnyEatAnyTimes: 2397 printf("Any-eat-anytimes"); 2398 break; 2399 case Re_AnyAnyTimes: 2400 printf("-> Any-anytimes %d", (int)*cod++); 2401 printf(" (%d)", (int)*cod++); 2402 i = cod[0] | (cod[1] << 8); 2403 cod += 2; 2404 printf(" /%d", i); 2405 break; 2406 case Re_AnyEatMaybe: 2407 printf("Any-eat-maybe"); 2408 break; 2409 case Re_AnyMaybe: 2410 printf("-> Any-maybe %d", (int)*cod++); 2411 printf(" (%d)", (int)*cod++); 2412 i = cod[0] | (cod[1] << 8); 2413 cod += 2; 2414 printf(" /%d", i); 2415 break; 2416 case Re_AnyAtLeast: 2417 printf("-> Any-atleast %d", (int)*cod++); 2418 printf(" (%d)", (int)*cod++); 2419 i = cod[0] | (cod[1] << 8); 2420 cod += 2; 2421 printf(" /%d", i); 2422 break; 2423 case Re_AnyEatAtLeast: 2424 printf("Any-eat-atleast"); 2425 break; 2426 case Re_Maybe: 2427 printf("-> Maybe %d", (int)*cod++); 2428 i = cod[0] | (cod[1] << 8); 2429 cod += 2; 2430 printf(" /%d", i); 2431 break; 2432 case Re_AtLeast: 2433 printf("-> Atleast %d", (int)*cod++); 2434 i = cod[0] | (cod[1] << 8); 2435 cod += 2; 2436 printf(" /%d", i); 2437 break; 2438 case Re_Exact: 2439 printf("-> Exact "); 2440 i = *cod++; 2441 printf("%d", i); 2442 printf(" %d", (int)*cod++); 2443 i = cod[0] | (cod[1] << 8); 2444 cod += 2; 2445 printf(" /%d", i); 2446 break; 2447 case Re_Min: 2448 printf("-> Min "); 2449 i = *cod++; 2450 printf("%d", i); 2451 printf(" %d", (int)*cod++); 2452 i = cod[0] | (cod[1] << 8); 2453 cod += 2; 2454 printf(" /%d", i); 2455 break; 2456 case Re_Max: 2457 printf("-> Max "); 2458 i = *cod++; 2459 printf("%d", i); 2460 printf(" %d", (int)*cod++); 2461 i = cod[0] | (cod[1] << 8); 2462 cod += 2; 2463 printf(" /%d", i); 2464 break; 2465 case Re_MinMax: 2466 printf("-> Min-max "); 2467 i = *cod++; 2468 printf("%d ", i); 2469 i = *cod++; 2470 printf("%d", i); 2471 printf(" %d", (int)*cod++); 2472 i = cod[0] | (cod[1] << 8); 2473 cod += 2; 2474 printf(" /%d", i); 2475 break; 2476 case Re_RepJump: 2477 printf("<- Rep-jump %d ", (int)*cod++); 2478 i = *cod++; 2479 printf("%d", i); 2480 break; 2481 case Re_RepLongJump: 2482 printf("<- Rep-long-jump %d ", (int)*cod++); 2483 i = cod[0] | (cod[1] << 8); 2484 printf("%d", i); 2485 break; 2486 case Re_Any: 2487 printf("Any"); 2488 break; 2489 case Re_Odigit: 2490 printf("Odigit"); 2491 break; 2492 case Re_OdigitNot: 2493 printf("Odigit-not"); 2494 break; 2495 case Re_Digit: 2496 printf("Digit"); 2497 break; 2498 case Re_DigitNot: 2499 printf("Digit-not"); 2500 break; 2501 case Re_Xdigit: 2502 printf("Xdigit"); 2503 break; 2504 case Re_XdigitNot: 2505 printf("Xdigit-not"); 2506 break; 2507 case Re_Space: 2508 printf("Space"); 2509 break; 2510 case Re_SpaceNot: 2511 printf("Space-not"); 2512 break; 2513 case Re_Tab: 2514 printf("Tab"); 2515 break; 2516 case Re_Newline: 2517 printf("Newline"); 2518 break; 2519 case Re_Lower: 2520 printf("Lower"); 2521 break; 2522 case Re_Upper: 2523 printf("Upper"); 2524 break; 2525 case Re_Alnum: 2526 printf("Alnum"); 2527 break; 2528 case Re_AlnumNot: 2529 printf("Alnum-not"); 2530 break; 2531 case Re_Control: 2532 printf("Control"); 2533 break; 2534 case Re_ControlNot: 2535 printf("Control-not"); 2536 break; 2537 case Re_Bol: 2538 printf("Bol"); 2539 break; 2540 case Re_Eol: 2541 printf("Eol"); 2542 break; 2543 case Re_Bow: 2544 printf("Bow"); 2545 break; 2546 case Re_Eow: 2547 printf("Eow"); 2548 break; 2549 case Re_Range: 2550 printf("Range "); 2551 goto range; 2552 case Re_RangeNot: 2553 printf("Range-not "); 2554range: 2555 for (i = 0; i < 256; i += 32) { 2556 for (j = k = 0; j < 32; j++) 2557 k |= (*cod++ & 1) << (31 - j); 2558 printf("%x ", k); 2559 } 2560 break; 2561 case Re_Literal: 2562 printf("Literal %c", *cod++); 2563 break; 2564 case Re_LiteralNot: 2565 printf("Literal-not %c", *cod++); 2566 break; 2567 case Re_SearchLiteral: 2568 printf("Search-literal %c", *cod++); 2569 break; 2570 case Re_CaseLiteral: 2571 printf("Case-literal %c", *cod++); 2572 putchar(*cod++); 2573 break; 2574 case Re_CaseLiteralNot: 2575 printf("Case-literal-not %c", *cod++); 2576 putchar(*cod++); 2577 break; 2578 case Re_SearchCaseLiteral: 2579 printf("Search-case-literal %c", *cod++); 2580 putchar(*cod++); 2581 break; 2582 case Re_String: 2583 printf("String "); 2584 goto string; 2585 case Re_SearchString: 2586 printf("Search-string "); 2587 goto string; 2588 case Re_CaseString: 2589 printf("Case-string "); 2590 goto string; 2591 case Re_SearchCaseString: 2592 printf("Search-case-string "); 2593string: 2594 i = *cod++; 2595 if (i & 0x80) 2596 i = (i & 0x7f) | (*cod++ << 7); 2597 for (j = 0; j < i; j++) 2598 putchar(*cod++); 2599 break; 2600 case Re_StringList: 2601 printf("String-list"); 2602 goto string_list; 2603 case Re_CaseStringList: 2604 printf("Case-string-list"); 2605string_list: 2606 j = *cod++; 2607 cod += 2; 2608 stl = cod + j; 2609 for (i = 0; i < j; i++) { 2610 k = *cod++; 2611 putchar(i ? ',' : ' '); 2612 fwrite(stl, k, 1, stdout); 2613 stl += k; 2614 } 2615 cod = stl; 2616 break; 2617 case Re_LargeStringList: 2618 printf("Large-string-list"); 2619large_string_list: 2620 i = cod[0] | (cod[1] << 8); 2621 stl = cod + i - 1; 2622 for (i = 0, cod += 514; cod < stl; i++) { 2623 k = *cod++; 2624 putchar(i ? ',' : ' '); 2625 fwrite(cod, k, 1, stdout); 2626 cod += k; 2627 } 2628 cod = stl; 2629 break; 2630 case Re_LargeCaseStringList: 2631 printf("Large-case-string-list"); 2632 goto large_string_list; 2633 case Re_Backref: 2634 printf("Backref %d", (int)*cod++); 2635 break; 2636 case Re_DoneIf: 2637 printf("Done-if"); 2638 break; 2639 case Re_MaybeDone: 2640 printf("Maybe-done"); 2641 break; 2642 case Re_Done: 2643 printf("Done\n"); 2644 return; 2645 } 2646 putchar('\n'); 2647 } 2648} 2649#endif 2650