1 /* $NetBSD: regex.c,v 1.3 2020/09/26 11:39:17 mlelstv Exp $ */ 2 3 /* Extended regular expression matching and search library, 4 version 0.12. 5 (Implements POSIX draft P1003.2/D11.2, except for some of the 6 internationalization features.) 7 Copyright (C) 1993-1999, 2000, 2001 Free Software Foundation, Inc. 8 9 The GNU C Library is free software; you can redistribute it and/or 10 modify it under the terms of the GNU Library General Public License as 11 published by the Free Software Foundation; either version 2 of the 12 License, or (at your option) any later version. 13 14 The GNU C Library is distributed in the hope that it will be useful, 15 but WITHOUT ANY WARRANTY; without even the implied warranty of 16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 17 Library General Public License for more details. 18 19 You should have received a copy of the GNU Library General Public 20 License along with the GNU C Library; see the file COPYING.LIB. If not, 21 write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, 22 Boston, MA 02111-1307, USA. */ 23 24 /* AIX requires this to be the first thing in the file. */ 25 #if defined _AIX && !defined REGEX_MALLOC 26 #pragma alloca 27 #endif 28 29 #undef _GNU_SOURCE 30 #define _GNU_SOURCE 31 32 #ifdef HAVE_CONFIG_H 33 # include <config.h> 34 #endif 35 36 #ifndef PARAMS 37 # if defined __GNUC__ || (defined __STDC__ && __STDC__) 38 # define PARAMS(args) args 39 # else 40 # define PARAMS(args) () 41 # endif /* GCC. */ 42 #endif /* Not PARAMS. */ 43 44 #if defined STDC_HEADERS && !defined emacs 45 # include <stddef.h> 46 #else 47 /* We need this for `regex.h', and perhaps for the Emacs include files. */ 48 # include <sys/types.h> 49 #endif 50 51 #define WIDE_CHAR_SUPPORT (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC) 52 53 /* For platform which support the ISO C amendement 1 functionality we 54 support user defined character classes. */ 55 #if defined _LIBC || WIDE_CHAR_SUPPORT 56 /* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */ 57 # include <wchar.h> 58 # include <wctype.h> 59 #endif 60 61 /* This is for multi byte string support. */ 62 #ifdef MBS_SUPPORT 63 # define CHAR_TYPE wchar_t 64 # define US_CHAR_TYPE wchar_t/* unsigned character type */ 65 # define COMPILED_BUFFER_VAR wc_buffer 66 # define OFFSET_ADDRESS_SIZE 1 /* the size which STORE_NUMBER macro use */ 67 # define CHAR_CLASS_SIZE ((__alignof__(wctype_t)+sizeof(wctype_t))/sizeof(CHAR_TYPE)+1) 68 # define PUT_CHAR(c) \ 69 do { \ 70 if (MB_CUR_MAX == 1) \ 71 putchar (c); \ 72 else \ 73 printf ("%C", (wint_t) c); /* Should we use wide stream?? */ \ 74 } while (0) 75 # define TRUE 1 76 # define FALSE 0 77 #else 78 # define CHAR_TYPE char 79 # define US_CHAR_TYPE unsigned char /* unsigned character type */ 80 # define COMPILED_BUFFER_VAR bufp->buffer 81 # define OFFSET_ADDRESS_SIZE 2 82 # define PUT_CHAR(c) putchar (c) 83 #endif /* MBS_SUPPORT */ 84 85 #ifdef _LIBC 86 /* We have to keep the namespace clean. */ 87 # define regfree(preg) __regfree (preg) 88 # define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef) 89 # define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags) 90 # define regerror(errcode, preg, errbuf, errbuf_size) \ 91 __regerror(errcode, preg, errbuf, errbuf_size) 92 # define re_set_registers(bu, re, nu, st, en) \ 93 __re_set_registers (bu, re, nu, st, en) 94 # define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \ 95 __re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop) 96 # define re_match(bufp, string, size, pos, regs) \ 97 __re_match (bufp, string, size, pos, regs) 98 # define re_search(bufp, string, size, startpos, range, regs) \ 99 __re_search (bufp, string, size, startpos, range, regs) 100 # define re_compile_pattern(pattern, length, bufp) \ 101 __re_compile_pattern (pattern, length, bufp) 102 # define re_set_syntax(syntax) __re_set_syntax (syntax) 103 # define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \ 104 __re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop) 105 # define re_compile_fastmap(bufp) __re_compile_fastmap (bufp) 106 107 # define btowc __btowc 108 109 /* We are also using some library internals. */ 110 # include <locale/localeinfo.h> 111 # include <locale/elem-hash.h> 112 # include <langinfo.h> 113 # include <locale/coll-lookup.h> 114 #endif 115 116 /* This is for other GNU distributions with internationalized messages. */ 117 #if HAVE_LIBINTL_H || defined _LIBC 118 # include <libintl.h> 119 # ifdef _LIBC 120 # undef gettext 121 # define gettext(msgid) __dcgettext ("libc", msgid, LC_MESSAGES) 122 # endif 123 #else 124 # define gettext(msgid) (msgid) 125 #endif 126 127 #ifndef gettext_noop 128 /* This define is so xgettext can find the internationalizable 129 strings. */ 130 # define gettext_noop(String) String 131 #endif 132 133 /* The `emacs' switch turns on certain matching commands 134 that make sense only in Emacs. */ 135 #ifdef emacs 136 137 # include "lisp.h" 138 # include "buffer.h" 139 # include "syntax.h" 140 141 #else /* not emacs */ 142 143 /* If we are not linking with Emacs proper, 144 we can't use the relocating allocator 145 even if config.h says that we can. */ 146 # undef REL_ALLOC 147 148 # if defined STDC_HEADERS || defined _LIBC 149 # include <stdlib.h> 150 # else 151 char *malloc (); 152 char *realloc (); 153 # endif 154 155 /* When used in Emacs's lib-src, we need to get bzero and bcopy somehow. 156 If nothing else has been done, use the method below. */ 157 # ifdef INHIBIT_STRING_HEADER 158 # if !(defined HAVE_BZERO && defined HAVE_BCOPY) 159 # if !defined bzero && !defined bcopy 160 # undef INHIBIT_STRING_HEADER 161 # endif 162 # endif 163 # endif 164 165 /* This is the normal way of making sure we have a bcopy and a bzero. 166 This is used in most programs--a few other programs avoid this 167 by defining INHIBIT_STRING_HEADER. */ 168 # ifndef INHIBIT_STRING_HEADER 169 # if defined HAVE_STRING_H || defined STDC_HEADERS || defined _LIBC 170 # include <string.h> 171 # ifndef bzero 172 # ifndef _LIBC 173 # define bzero(s, n) (memset (s, '\0', n), (s)) 174 # else 175 # define bzero(s, n) __bzero (s, n) 176 # endif 177 # endif 178 # else 179 # include <strings.h> 180 # ifndef memcmp 181 # define memcmp(s1, s2, n) bcmp (s1, s2, n) 182 # endif 183 # ifndef memcpy 184 # define memcpy(d, s, n) (bcopy (s, d, n), (d)) 185 # endif 186 # endif 187 # endif 188 189 /* Define the syntax stuff for \<, \>, etc. */ 190 191 /* This must be nonzero for the wordchar and notwordchar pattern 192 commands in re_match_2. */ 193 # ifndef Sword 194 # define Sword 1 195 # endif 196 197 # ifdef SWITCH_ENUM_BUG 198 # define SWITCH_ENUM_CAST(x) ((int)(x)) 199 # else 200 # define SWITCH_ENUM_CAST(x) (x) 201 # endif 202 203 #endif /* not emacs */ 204 205 #if defined _LIBC || HAVE_LIMITS_H 206 # include <limits.h> 207 #endif 208 209 #ifndef MB_LEN_MAX 210 # define MB_LEN_MAX 1 211 #endif 212 213 /* Get the interface, including the syntax bits. */ 215 #include <regex.h> 216 217 /* isalpha etc. are used for the character classes. */ 218 #include <ctype.h> 219 220 /* Jim Meyering writes: 221 222 "... Some ctype macros are valid only for character codes that 223 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when 224 using /bin/cc or gcc but without giving an ansi option). So, all 225 ctype uses should be through macros like ISPRINT... If 226 STDC_HEADERS is defined, then autoconf has verified that the ctype 227 macros don't need to be guarded with references to isascii. ... 228 Defining isascii to 1 should let any compiler worth its salt 229 eliminate the && through constant folding." 230 Solaris defines some of these symbols so we must undefine them first. */ 231 232 #undef ISASCII 233 #if defined STDC_HEADERS || (!defined isascii && !defined HAVE_ISASCII) 234 # define ISASCII(c) 1 235 #else 236 # define ISASCII(c) isascii(c) 237 #endif 238 239 #ifdef isblank 240 # define ISBLANK(c) (ISASCII (c) && isblank (c)) 241 #else 242 # define ISBLANK(c) ((c) == ' ' || (c) == '\t') 243 #endif 244 #ifdef isgraph 245 # define ISGRAPH(c) (ISASCII (c) && isgraph (c)) 246 #else 247 # define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c)) 248 #endif 249 250 #undef ISPRINT 251 #define ISPRINT(c) (ISASCII (c) && isprint (c)) 252 #define ISDIGIT(c) (ISASCII (c) && isdigit (c)) 253 #define ISALNUM(c) (ISASCII (c) && isalnum (c)) 254 #define ISALPHA(c) (ISASCII (c) && isalpha (c)) 255 #define ISCNTRL(c) (ISASCII (c) && iscntrl (c)) 256 #define ISLOWER(c) (ISASCII (c) && islower (c)) 257 #define ISPUNCT(c) (ISASCII (c) && ispunct (c)) 258 #define ISSPACE(c) (ISASCII (c) && isspace (c)) 259 #define ISUPPER(c) (ISASCII (c) && isupper (c)) 260 #define ISXDIGIT(c) (ISASCII (c) && isxdigit (c)) 261 262 #ifdef _tolower 263 # define TOLOWER(c) _tolower(c) 264 #else 265 # define TOLOWER(c) tolower(c) 266 #endif 267 268 #ifndef NULL 269 # define NULL (void *)0 270 #endif 271 272 /* We remove any previous definition of `SIGN_EXTEND_CHAR', 273 since ours (we hope) works properly with all combinations of 274 machines, compilers, `char' and `unsigned char' argument types. 275 (Per Bothner suggested the basic approach.) */ 276 #undef SIGN_EXTEND_CHAR 277 #if __STDC__ 278 # define SIGN_EXTEND_CHAR(c) ((signed char) (c)) 279 #else /* not __STDC__ */ 280 /* As in Harbison and Steele. */ 281 # define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128) 282 #endif 283 284 #ifndef emacs 286 /* How many characters in the character set. */ 287 # define CHAR_SET_SIZE 256 288 289 # ifdef SYNTAX_TABLE 290 291 extern char *re_syntax_table; 292 293 # else /* not SYNTAX_TABLE */ 294 295 static char re_syntax_table[CHAR_SET_SIZE]; 296 297 static void init_syntax_once PARAMS ((void)); 298 299 static void 300 init_syntax_once () 301 { 302 register int c; 303 static int done = 0; 304 305 if (done) 306 return; 307 bzero (re_syntax_table, sizeof re_syntax_table); 308 309 for (c = 0; c < CHAR_SET_SIZE; ++c) 310 if (ISALNUM (c)) 311 re_syntax_table[c] = Sword; 312 313 re_syntax_table['_'] = Sword; 314 315 done = 1; 316 } 317 318 # endif /* not SYNTAX_TABLE */ 319 320 # define SYNTAX(c) re_syntax_table[(unsigned char) (c)] 321 322 #endif /* emacs */ 323 324 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we 326 use `alloca' instead of `malloc'. This is because using malloc in 327 re_search* or re_match* could cause memory leaks when C-g is used in 328 Emacs; also, malloc is slower and causes storage fragmentation. On 329 the other hand, malloc is more portable, and easier to debug. 330 331 Because we sometimes use alloca, some routines have to be macros, 332 not functions -- `alloca'-allocated space disappears at the end of the 333 function it is called in. */ 334 335 #ifdef REGEX_MALLOC 336 337 # define REGEX_ALLOCATE malloc 338 # define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize) 339 # define REGEX_FREE free 340 341 #else /* not REGEX_MALLOC */ 342 343 /* Emacs already defines alloca, sometimes. */ 344 # ifndef alloca 345 346 /* Make alloca work the best possible way. */ 347 # ifdef __GNUC__ 348 # define alloca __builtin_alloca 349 # else /* not __GNUC__ */ 350 # if HAVE_ALLOCA_H 351 # include <alloca.h> 352 # endif /* HAVE_ALLOCA_H */ 353 # endif /* not __GNUC__ */ 354 355 # endif /* not alloca */ 356 357 # define REGEX_ALLOCATE alloca 358 359 /* Assumes a `char *destination' variable. */ 360 # define REGEX_REALLOCATE(source, osize, nsize) \ 361 (destination = (char *) alloca (nsize), \ 362 memcpy (destination, source, osize)) 363 364 /* No need to do anything to free, after alloca. */ 365 # define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */ 366 367 #endif /* not REGEX_MALLOC */ 368 369 /* Define how to allocate the failure stack. */ 370 371 #if defined REL_ALLOC && defined REGEX_MALLOC 372 373 # define REGEX_ALLOCATE_STACK(size) \ 374 r_alloc (&failure_stack_ptr, (size)) 375 # define REGEX_REALLOCATE_STACK(source, osize, nsize) \ 376 r_re_alloc (&failure_stack_ptr, (nsize)) 377 # define REGEX_FREE_STACK(ptr) \ 378 r_alloc_free (&failure_stack_ptr) 379 380 #else /* not using relocating allocator */ 381 382 # ifdef REGEX_MALLOC 383 384 # define REGEX_ALLOCATE_STACK malloc 385 # define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize) 386 # define REGEX_FREE_STACK free 387 388 # else /* not REGEX_MALLOC */ 389 390 # define REGEX_ALLOCATE_STACK alloca 391 392 # define REGEX_REALLOCATE_STACK(source, osize, nsize) \ 393 REGEX_REALLOCATE (source, osize, nsize) 394 /* No need to explicitly free anything. */ 395 # define REGEX_FREE_STACK(arg) 396 397 # endif /* not REGEX_MALLOC */ 398 #endif /* not using relocating allocator */ 399 400 401 /* True if `size1' is non-NULL and PTR is pointing anywhere inside 402 `string1' or just past its end. This works if PTR is NULL, which is 403 a good thing. */ 404 #define FIRST_STRING_P(ptr) \ 405 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1) 406 407 /* (Re)Allocate N items of type T using malloc, or fail. */ 408 #define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t))) 409 #define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t))) 410 #define RETALLOC_IF(addr, n, t) \ 411 if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t) 412 #define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t))) 413 414 #define BYTEWIDTH 8 /* In bits. */ 415 416 #define STREQ(s1, s2) ((strcmp (s1, s2) == 0)) 417 418 #undef MAX 419 #undef MIN 420 #define MAX(a, b) ((a) > (b) ? (a) : (b)) 421 #define MIN(a, b) ((a) < (b) ? (a) : (b)) 422 423 typedef char boolean; 424 #define false 0 425 #define true 1 426 427 static int re_match_2_internal PARAMS ((struct re_pattern_buffer *bufp, 428 const char *string1, int size1, 429 const char *string2, int size2, 430 int pos, 431 struct re_registers *regs, 432 int stop)); 433 434 /* These are the command codes that appear in compiled regular 436 expressions. Some opcodes are followed by argument bytes. A 437 command code can specify any interpretation whatsoever for its 438 arguments. Zero bytes may appear in the compiled regular expression. */ 439 440 typedef enum 441 { 442 no_op = 0, 443 444 /* Succeed right away--no more backtracking. */ 445 succeed, 446 447 /* Followed by one byte giving n, then by n literal bytes. */ 448 exactn, 449 450 #ifdef MBS_SUPPORT 451 /* Same as exactn, but contains binary data. */ 452 exactn_bin, 453 #endif 454 455 /* Matches any (more or less) character. */ 456 anychar, 457 458 /* Matches any one char belonging to specified set. First 459 following byte is number of bitmap bytes. Then come bytes 460 for a bitmap saying which chars are in. Bits in each byte 461 are ordered low-bit-first. A character is in the set if its 462 bit is 1. A character too large to have a bit in the map is 463 automatically not in the set. */ 464 /* ifdef MBS_SUPPORT, following element is length of character 465 classes, length of collating symbols, length of equivalence 466 classes, length of character ranges, and length of characters. 467 Next, character class element, collating symbols elements, 468 equivalence class elements, range elements, and character 469 elements follow. 470 See regex_compile function. */ 471 charset, 472 473 /* Same parameters as charset, but match any character that is 474 not one of those specified. */ 475 charset_not, 476 477 /* Start remembering the text that is matched, for storing in a 478 register. Followed by one byte with the register number, in 479 the range 0 to one less than the pattern buffer's re_nsub 480 field. Then followed by one byte with the number of groups 481 inner to this one. (This last has to be part of the 482 start_memory only because we need it in the on_failure_jump 483 of re_match_2.) */ 484 start_memory, 485 486 /* Stop remembering the text that is matched and store it in a 487 memory register. Followed by one byte with the register 488 number, in the range 0 to one less than `re_nsub' in the 489 pattern buffer, and one byte with the number of inner groups, 490 just like `start_memory'. (We need the number of inner 491 groups here because we don't have any easy way of finding the 492 corresponding start_memory when we're at a stop_memory.) */ 493 stop_memory, 494 495 /* Match a duplicate of something remembered. Followed by one 496 byte containing the register number. */ 497 duplicate, 498 499 /* Fail unless at beginning of line. */ 500 begline, 501 502 /* Fail unless at end of line. */ 503 endline, 504 505 /* Succeeds if at beginning of buffer (if emacs) or at beginning 506 of string to be matched (if not). */ 507 begbuf, 508 509 /* Analogously, for end of buffer/string. */ 510 endbuf, 511 512 /* Followed by two byte relative address to which to jump. */ 513 jump, 514 515 /* Same as jump, but marks the end of an alternative. */ 516 jump_past_alt, 517 518 /* Followed by two-byte relative address of place to resume at 519 in case of failure. */ 520 /* ifdef MBS_SUPPORT, the size of address is 1. */ 521 on_failure_jump, 522 523 /* Like on_failure_jump, but pushes a placeholder instead of the 524 current string position when executed. */ 525 on_failure_keep_string_jump, 526 527 /* Throw away latest failure point and then jump to following 528 two-byte relative address. */ 529 /* ifdef MBS_SUPPORT, the size of address is 1. */ 530 pop_failure_jump, 531 532 /* Change to pop_failure_jump if know won't have to backtrack to 533 match; otherwise change to jump. This is used to jump 534 back to the beginning of a repeat. If what follows this jump 535 clearly won't match what the repeat does, such that we can be 536 sure that there is no use backtracking out of repetitions 537 already matched, then we change it to a pop_failure_jump. 538 Followed by two-byte address. */ 539 /* ifdef MBS_SUPPORT, the size of address is 1. */ 540 maybe_pop_jump, 541 542 /* Jump to following two-byte address, and push a dummy failure 543 point. This failure point will be thrown away if an attempt 544 is made to use it for a failure. A `+' construct makes this 545 before the first repeat. Also used as an intermediary kind 546 of jump when compiling an alternative. */ 547 /* ifdef MBS_SUPPORT, the size of address is 1. */ 548 dummy_failure_jump, 549 550 /* Push a dummy failure point and continue. Used at the end of 551 alternatives. */ 552 push_dummy_failure, 553 554 /* Followed by two-byte relative address and two-byte number n. 555 After matching N times, jump to the address upon failure. */ 556 /* ifdef MBS_SUPPORT, the size of address is 1. */ 557 succeed_n, 558 559 /* Followed by two-byte relative address, and two-byte number n. 560 Jump to the address N times, then fail. */ 561 /* ifdef MBS_SUPPORT, the size of address is 1. */ 562 jump_n, 563 564 /* Set the following two-byte relative address to the 565 subsequent two-byte number. The address *includes* the two 566 bytes of number. */ 567 /* ifdef MBS_SUPPORT, the size of address is 1. */ 568 set_number_at, 569 570 wordchar, /* Matches any word-constituent character. */ 571 notwordchar, /* Matches any char that is not a word-constituent. */ 572 573 wordbeg, /* Succeeds if at word beginning. */ 574 wordend, /* Succeeds if at word end. */ 575 576 wordbound, /* Succeeds if at a word boundary. */ 577 notwordbound /* Succeeds if not at a word boundary. */ 578 579 #ifdef emacs 580 ,before_dot, /* Succeeds if before point. */ 581 at_dot, /* Succeeds if at point. */ 582 after_dot, /* Succeeds if after point. */ 583 584 /* Matches any character whose syntax is specified. Followed by 585 a byte which contains a syntax code, e.g., Sword. */ 586 syntaxspec, 587 588 /* Matches any character whose syntax is not that specified. */ 589 notsyntaxspec 590 #endif /* emacs */ 591 } re_opcode_t; 592 593 /* Common operations on the compiled pattern. */ 595 596 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */ 597 /* ifdef MBS_SUPPORT, we store NUMBER in 1 element. */ 598 599 #ifdef MBS_SUPPORT 600 # define STORE_NUMBER(destination, number) \ 601 do { \ 602 *(destination) = (US_CHAR_TYPE)(number); \ 603 } while (0) 604 #else 605 # define STORE_NUMBER(destination, number) \ 606 do { \ 607 (destination)[0] = (number) & 0377; \ 608 (destination)[1] = (number) >> 8; \ 609 } while (0) 610 #endif /* MBS_SUPPORT */ 611 612 /* Same as STORE_NUMBER, except increment DESTINATION to 613 the byte after where the number is stored. Therefore, DESTINATION 614 must be an lvalue. */ 615 /* ifdef MBS_SUPPORT, we store NUMBER in 1 element. */ 616 617 #define STORE_NUMBER_AND_INCR(destination, number) \ 618 do { \ 619 STORE_NUMBER (destination, number); \ 620 (destination) += OFFSET_ADDRESS_SIZE; \ 621 } while (0) 622 623 /* Put into DESTINATION a number stored in two contiguous bytes starting 624 at SOURCE. */ 625 /* ifdef MBS_SUPPORT, we store NUMBER in 1 element. */ 626 627 #ifdef MBS_SUPPORT 628 # define EXTRACT_NUMBER(destination, source) \ 629 do { \ 630 (destination) = *(source); \ 631 } while (0) 632 #else 633 # define EXTRACT_NUMBER(destination, source) \ 634 do { \ 635 (destination) = *(source) & 0377; \ 636 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \ 637 } while (0) 638 #endif 639 640 #ifdef DEBUG 641 static void extract_number _RE_ARGS ((int *dest, US_CHAR_TYPE *source)); 642 static void 643 extract_number (dest, source) 644 int *dest; 645 US_CHAR_TYPE *source; 646 { 647 #ifdef MBS_SUPPORT 648 *dest = *source; 649 #else 650 int temp = SIGN_EXTEND_CHAR (*(source + 1)); 651 *dest = *source & 0377; 652 *dest += temp << 8; 653 #endif 654 } 655 656 # ifndef EXTRACT_MACROS /* To debug the macros. */ 657 # undef EXTRACT_NUMBER 658 # define EXTRACT_NUMBER(dest, src) extract_number (&dest, src) 659 # endif /* not EXTRACT_MACROS */ 660 661 #endif /* DEBUG */ 662 663 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number. 664 SOURCE must be an lvalue. */ 665 666 #define EXTRACT_NUMBER_AND_INCR(destination, source) \ 667 do { \ 668 EXTRACT_NUMBER (destination, source); \ 669 (source) += OFFSET_ADDRESS_SIZE; \ 670 } while (0) 671 672 #ifdef DEBUG 673 static void extract_number_and_incr _RE_ARGS ((int *destination, 674 US_CHAR_TYPE **source)); 675 static void 676 extract_number_and_incr (destination, source) 677 int *destination; 678 US_CHAR_TYPE **source; 679 { 680 extract_number (destination, *source); 681 *source += OFFSET_ADDRESS_SIZE; 682 } 683 684 # ifndef EXTRACT_MACROS 685 # undef EXTRACT_NUMBER_AND_INCR 686 # define EXTRACT_NUMBER_AND_INCR(dest, src) \ 687 extract_number_and_incr (&dest, &src) 688 # endif /* not EXTRACT_MACROS */ 689 690 #endif /* DEBUG */ 691 692 /* If DEBUG is defined, Regex prints many voluminous messages about what 694 it is doing (if the variable `debug' is nonzero). If linked with the 695 main program in `iregex.c', you can enter patterns and strings 696 interactively. And if linked with the main program in `main.c' and 697 the other test files, you can run the already-written tests. */ 698 699 #ifdef DEBUG 700 701 /* We use standard I/O for debugging. */ 702 # include <stdio.h> 703 704 /* It is useful to test things that ``must'' be true when debugging. */ 705 # include <assert.h> 706 707 static int debug; 708 709 # define DEBUG_STATEMENT(e) e 710 # define DEBUG_PRINT1(x) if (debug) printf (x) 711 # define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2) 712 # define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3) 713 # define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4) 714 # define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \ 715 if (debug) print_partial_compiled_pattern (s, e) 716 # define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \ 717 if (debug) print_double_string (w, s1, sz1, s2, sz2) 718 719 720 /* Print the fastmap in human-readable form. */ 721 722 void 723 print_fastmap (fastmap) 724 char *fastmap; 725 { 726 unsigned was_a_range = 0; 727 unsigned i = 0; 728 729 while (i < (1 << BYTEWIDTH)) 730 { 731 if (fastmap[i++]) 732 { 733 was_a_range = 0; 734 putchar (i - 1); 735 while (i < (1 << BYTEWIDTH) && fastmap[i]) 736 { 737 was_a_range = 1; 738 i++; 739 } 740 if (was_a_range) 741 { 742 printf ("-"); 743 putchar (i - 1); 744 } 745 } 746 } 747 putchar ('\n'); 748 } 749 750 751 /* Print a compiled pattern string in human-readable form, starting at 752 the START pointer into it and ending just before the pointer END. */ 753 754 void 755 print_partial_compiled_pattern (start, end) 756 US_CHAR_TYPE *start; 757 US_CHAR_TYPE *end; 758 { 759 int mcnt, mcnt2; 760 US_CHAR_TYPE *p1; 761 US_CHAR_TYPE *p = start; 762 US_CHAR_TYPE *pend = end; 763 764 if (start == NULL) 765 { 766 printf ("(null)\n"); 767 return; 768 } 769 770 /* Loop over pattern commands. */ 771 while (p < pend) 772 { 773 #ifdef _LIBC 774 printf ("%td:\t", p - start); 775 #else 776 printf ("%ld:\t", (long int) (p - start)); 777 #endif 778 779 switch ((re_opcode_t) *p++) 780 { 781 case no_op: 782 printf ("/no_op"); 783 break; 784 785 case exactn: 786 mcnt = *p++; 787 printf ("/exactn/%d", mcnt); 788 do 789 { 790 putchar ('/'); 791 PUT_CHAR (*p++); 792 } 793 while (--mcnt); 794 break; 795 796 #ifdef MBS_SUPPORT 797 case exactn_bin: 798 mcnt = *p++; 799 printf ("/exactn_bin/%d", mcnt); 800 do 801 { 802 printf("/%lx", (long int) *p++); 803 } 804 while (--mcnt); 805 break; 806 #endif /* MBS_SUPPORT */ 807 808 case start_memory: 809 mcnt = *p++; 810 printf ("/start_memory/%d/%ld", mcnt, (long int) *p++); 811 break; 812 813 case stop_memory: 814 mcnt = *p++; 815 printf ("/stop_memory/%d/%ld", mcnt, (long int) *p++); 816 break; 817 818 case duplicate: 819 printf ("/duplicate/%ld", (long int) *p++); 820 break; 821 822 case anychar: 823 printf ("/anychar"); 824 break; 825 826 case charset: 827 case charset_not: 828 { 829 #ifdef MBS_SUPPORT 830 int i, length; 831 wchar_t *workp = p; 832 printf ("/charset [%s", 833 (re_opcode_t) *(workp - 1) == charset_not ? "^" : ""); 834 p += 5; 835 length = *workp++; /* the length of char_classes */ 836 for (i=0 ; i<length ; i++) 837 printf("[:%lx:]", (long int) *p++); 838 length = *workp++; /* the length of collating_symbol */ 839 for (i=0 ; i<length ;) 840 { 841 printf("[."); 842 while(*p != 0) 843 PUT_CHAR((i++,*p++)); 844 i++,p++; 845 printf(".]"); 846 } 847 length = *workp++; /* the length of equivalence_class */ 848 for (i=0 ; i<length ;) 849 { 850 printf("[="); 851 while(*p != 0) 852 PUT_CHAR((i++,*p++)); 853 i++,p++; 854 printf("=]"); 855 } 856 length = *workp++; /* the length of char_range */ 857 for (i=0 ; i<length ; i++) 858 { 859 wchar_t range_start = *p++; 860 wchar_t range_end = *p++; 861 if (MB_CUR_MAX == 1) 862 printf("%c-%c", (char) range_start, (char) range_end); 863 else 864 printf("%C-%C", (wint_t) range_start, (wint_t) range_end); 865 } 866 length = *workp++; /* the length of char */ 867 for (i=0 ; i<length ; i++) 868 if (MB_CUR_MAX == 1) 869 putchar (*p++); 870 else 871 printf("%C", (wint_t) *p++); 872 putchar (']'); 873 #else 874 register int c, last = -100; 875 register int in_range = 0; 876 877 printf ("/charset [%s", 878 (re_opcode_t) *(p - 1) == charset_not ? "^" : ""); 879 880 assert (p + *p < pend); 881 882 for (c = 0; c < 256; c++) 883 if (c / 8 < *p 884 && (p[1 + (c/8)] & (1 << (c % 8)))) 885 { 886 /* Are we starting a range? */ 887 if (last + 1 == c && ! in_range) 888 { 889 putchar ('-'); 890 in_range = 1; 891 } 892 /* Have we broken a range? */ 893 else if (last + 1 != c && in_range) 894 { 895 putchar (last); 896 in_range = 0; 897 } 898 899 if (! in_range) 900 putchar (c); 901 902 last = c; 903 } 904 905 if (in_range) 906 putchar (last); 907 908 putchar (']'); 909 910 p += 1 + *p; 911 #endif /* MBS_SUPPORT */ 912 } 913 break; 914 915 case begline: 916 printf ("/begline"); 917 break; 918 919 case endline: 920 printf ("/endline"); 921 break; 922 923 case on_failure_jump: 924 extract_number_and_incr (&mcnt, &p); 925 #ifdef _LIBC 926 printf ("/on_failure_jump to %td", p + mcnt - start); 927 #else 928 printf ("/on_failure_jump to %ld", (long int) (p + mcnt - start)); 929 #endif 930 break; 931 932 case on_failure_keep_string_jump: 933 extract_number_and_incr (&mcnt, &p); 934 #ifdef _LIBC 935 printf ("/on_failure_keep_string_jump to %td", p + mcnt - start); 936 #else 937 printf ("/on_failure_keep_string_jump to %ld", 938 (long int) (p + mcnt - start)); 939 #endif 940 break; 941 942 case dummy_failure_jump: 943 extract_number_and_incr (&mcnt, &p); 944 #ifdef _LIBC 945 printf ("/dummy_failure_jump to %td", p + mcnt - start); 946 #else 947 printf ("/dummy_failure_jump to %ld", (long int) (p + mcnt - start)); 948 #endif 949 break; 950 951 case push_dummy_failure: 952 printf ("/push_dummy_failure"); 953 break; 954 955 case maybe_pop_jump: 956 extract_number_and_incr (&mcnt, &p); 957 #ifdef _LIBC 958 printf ("/maybe_pop_jump to %td", p + mcnt - start); 959 #else 960 printf ("/maybe_pop_jump to %ld", (long int) (p + mcnt - start)); 961 #endif 962 break; 963 964 case pop_failure_jump: 965 extract_number_and_incr (&mcnt, &p); 966 #ifdef _LIBC 967 printf ("/pop_failure_jump to %td", p + mcnt - start); 968 #else 969 printf ("/pop_failure_jump to %ld", (long int) (p + mcnt - start)); 970 #endif 971 break; 972 973 case jump_past_alt: 974 extract_number_and_incr (&mcnt, &p); 975 #ifdef _LIBC 976 printf ("/jump_past_alt to %td", p + mcnt - start); 977 #else 978 printf ("/jump_past_alt to %ld", (long int) (p + mcnt - start)); 979 #endif 980 break; 981 982 case jump: 983 extract_number_and_incr (&mcnt, &p); 984 #ifdef _LIBC 985 printf ("/jump to %td", p + mcnt - start); 986 #else 987 printf ("/jump to %ld", (long int) (p + mcnt - start)); 988 #endif 989 break; 990 991 case succeed_n: 992 extract_number_and_incr (&mcnt, &p); 993 p1 = p + mcnt; 994 extract_number_and_incr (&mcnt2, &p); 995 #ifdef _LIBC 996 printf ("/succeed_n to %td, %d times", p1 - start, mcnt2); 997 #else 998 printf ("/succeed_n to %ld, %d times", 999 (long int) (p1 - start), mcnt2); 1000 #endif 1001 break; 1002 1003 case jump_n: 1004 extract_number_and_incr (&mcnt, &p); 1005 p1 = p + mcnt; 1006 extract_number_and_incr (&mcnt2, &p); 1007 printf ("/jump_n to %d, %d times", p1 - start, mcnt2); 1008 break; 1009 1010 case set_number_at: 1011 extract_number_and_incr (&mcnt, &p); 1012 p1 = p + mcnt; 1013 extract_number_and_incr (&mcnt2, &p); 1014 #ifdef _LIBC 1015 printf ("/set_number_at location %td to %d", p1 - start, mcnt2); 1016 #else 1017 printf ("/set_number_at location %ld to %d", 1018 (long int) (p1 - start), mcnt2); 1019 #endif 1020 break; 1021 1022 case wordbound: 1023 printf ("/wordbound"); 1024 break; 1025 1026 case notwordbound: 1027 printf ("/notwordbound"); 1028 break; 1029 1030 case wordbeg: 1031 printf ("/wordbeg"); 1032 break; 1033 1034 case wordend: 1035 printf ("/wordend"); 1036 break; 1037 1038 # ifdef emacs 1039 case before_dot: 1040 printf ("/before_dot"); 1041 break; 1042 1043 case at_dot: 1044 printf ("/at_dot"); 1045 break; 1046 1047 case after_dot: 1048 printf ("/after_dot"); 1049 break; 1050 1051 case syntaxspec: 1052 printf ("/syntaxspec"); 1053 mcnt = *p++; 1054 printf ("/%d", mcnt); 1055 break; 1056 1057 case notsyntaxspec: 1058 printf ("/notsyntaxspec"); 1059 mcnt = *p++; 1060 printf ("/%d", mcnt); 1061 break; 1062 # endif /* emacs */ 1063 1064 case wordchar: 1065 printf ("/wordchar"); 1066 break; 1067 1068 case notwordchar: 1069 printf ("/notwordchar"); 1070 break; 1071 1072 case begbuf: 1073 printf ("/begbuf"); 1074 break; 1075 1076 case endbuf: 1077 printf ("/endbuf"); 1078 break; 1079 1080 default: 1081 printf ("?%ld", (long int) *(p-1)); 1082 } 1083 1084 putchar ('\n'); 1085 } 1086 1087 #ifdef _LIBC 1088 printf ("%td:\tend of pattern.\n", p - start); 1089 #else 1090 printf ("%ld:\tend of pattern.\n", (long int) (p - start)); 1091 #endif 1092 } 1093 1094 1095 void 1096 print_compiled_pattern (bufp) 1097 struct re_pattern_buffer *bufp; 1098 { 1099 US_CHAR_TYPE *buffer = (US_CHAR_TYPE*) bufp->buffer; 1100 1101 print_partial_compiled_pattern (buffer, buffer 1102 + bufp->used / sizeof(US_CHAR_TYPE)); 1103 printf ("%ld bytes used/%ld bytes allocated.\n", 1104 bufp->used, bufp->allocated); 1105 1106 if (bufp->fastmap_accurate && bufp->fastmap) 1107 { 1108 printf ("fastmap: "); 1109 print_fastmap (bufp->fastmap); 1110 } 1111 1112 #ifdef _LIBC 1113 printf ("re_nsub: %Zd\t", bufp->re_nsub); 1114 #else 1115 printf ("re_nsub: %ld\t", (long int) bufp->re_nsub); 1116 #endif 1117 printf ("regs_alloc: %d\t", bufp->regs_allocated); 1118 printf ("can_be_null: %d\t", bufp->can_be_null); 1119 printf ("newline_anchor: %d\n", bufp->newline_anchor); 1120 printf ("no_sub: %d\t", bufp->no_sub); 1121 printf ("not_bol: %d\t", bufp->not_bol); 1122 printf ("not_eol: %d\t", bufp->not_eol); 1123 printf ("syntax: %lx\n", bufp->syntax); 1124 /* Perhaps we should print the translate table? */ 1125 } 1126 1127 1128 void 1129 print_double_string (where, string1, size1, string2, size2) 1130 const CHAR_TYPE *where; 1131 const CHAR_TYPE *string1; 1132 const CHAR_TYPE *string2; 1133 int size1; 1134 int size2; 1135 { 1136 ptrdiff_t this_char; 1137 1138 if (where == NULL) 1139 printf ("(null)"); 1140 else 1141 { 1142 if (FIRST_STRING_P (where)) 1143 { 1144 for (this_char = where - string1; this_char < size1; this_char++) 1145 PUT_CHAR (string1[this_char]); 1146 1147 where = string2; 1148 } 1149 1150 for (this_char = where - string2; this_char < size2; this_char++) 1151 PUT_CHAR (string2[this_char]); 1152 } 1153 } 1154 1155 void 1156 printchar (c) 1157 int c; 1158 { 1159 putc (c, stderr); 1160 } 1161 1162 #else /* not DEBUG */ 1163 1164 # undef assert 1165 # define assert(e) 1166 1167 # define DEBUG_STATEMENT(e) 1168 # define DEBUG_PRINT1(x) 1169 # define DEBUG_PRINT2(x1, x2) 1170 # define DEBUG_PRINT3(x1, x2, x3) 1171 # define DEBUG_PRINT4(x1, x2, x3, x4) 1172 # define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) 1173 # define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) 1174 1175 #endif /* not DEBUG */ 1176 1177 #ifdef MBS_SUPPORT 1179 /* This convert a multibyte string to a wide character string. 1180 And write their correspondances to offset_buffer(see below) 1181 and write whether each wchar_t is binary data to is_binary. 1182 This assume invalid multibyte sequences as binary data. 1183 We assume offset_buffer and is_binary is already allocated 1184 enough space. */ 1185 1186 static size_t convert_mbs_to_wcs (CHAR_TYPE *dest, const unsigned char* src, 1187 size_t len, int *offset_buffer, 1188 char *is_binary); 1189 static size_t 1190 convert_mbs_to_wcs (dest, src, len, offset_buffer, is_binary) 1191 CHAR_TYPE *dest; 1192 const unsigned char* src; 1193 size_t len; /* the length of multibyte string. */ 1194 1195 /* It hold correspondances between src(char string) and 1196 dest(wchar_t string) for optimization. 1197 e.g. src = "xxxyzz" 1198 dest = {'X', 'Y', 'Z'} 1199 (each "xxx", "y" and "zz" represent one multibyte character 1200 corresponding to 'X', 'Y' and 'Z'.) 1201 offset_buffer = {0, 0+3("xxx"), 0+3+1("y"), 0+3+1+2("zz")} 1202 = {0, 3, 4, 6} 1203 */ 1204 int *offset_buffer; 1205 char *is_binary; 1206 { 1207 wchar_t *pdest = dest; 1208 const unsigned char *psrc = src; 1209 size_t wc_count = 0; 1210 1211 if (MB_CUR_MAX == 1) 1212 { /* We don't need conversion. */ 1213 for ( ; wc_count < len ; ++wc_count) 1214 { 1215 *pdest++ = *psrc++; 1216 is_binary[wc_count] = FALSE; 1217 offset_buffer[wc_count] = wc_count; 1218 } 1219 offset_buffer[wc_count] = wc_count; 1220 } 1221 else 1222 { 1223 /* We need conversion. */ 1224 mbstate_t mbs; 1225 int consumed; 1226 size_t mb_remain = len; 1227 size_t mb_count = 0; 1228 1229 /* Initialize the conversion state. */ 1230 memset (&mbs, 0, sizeof (mbstate_t)); 1231 1232 offset_buffer[0] = 0; 1233 for( ; mb_remain > 0 ; ++wc_count, ++pdest, mb_remain -= consumed, 1234 psrc += consumed) 1235 { 1236 consumed = mbrtowc (pdest, psrc, mb_remain, &mbs); 1237 1238 if (consumed <= 0) 1239 /* failed to convert. maybe src contains binary data. 1240 So we consume 1 byte manualy. */ 1241 { 1242 *pdest = *psrc; 1243 consumed = 1; 1244 is_binary[wc_count] = TRUE; 1245 } 1246 else 1247 is_binary[wc_count] = FALSE; 1248 /* In sjis encoding, we use yen sign as escape character in 1249 place of reverse solidus. So we convert 0x5c(yen sign in 1250 sjis) to not 0xa5(yen sign in UCS2) but 0x5c(reverse 1251 solidus in UCS2). */ 1252 if (consumed == 1 && (int) *psrc == 0x5c && (int) *pdest == 0xa5) 1253 *pdest = (wchar_t) *psrc; 1254 1255 offset_buffer[wc_count + 1] = mb_count += consumed; 1256 } 1257 } 1258 1259 return wc_count; 1260 } 1261 1262 #endif /* MBS_SUPPORT */ 1263 1264 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can 1265 also be assigned to arbitrarily: each pattern buffer stores its own 1266 syntax, so it can be changed between regex compilations. */ 1267 /* This has no initializer because initialized variables in Emacs 1268 become read-only after dumping. */ 1269 reg_syntax_t re_syntax_options; 1270 1271 1272 /* Specify the precise syntax of regexps for compilation. This provides 1273 for compatibility for various utilities which historically have 1274 different, incompatible syntaxes. 1275 1276 The argument SYNTAX is a bit mask comprised of the various bits 1277 defined in regex.h. We return the old syntax. */ 1278 1279 reg_syntax_t 1280 re_set_syntax (syntax) 1281 reg_syntax_t syntax; 1282 { 1283 reg_syntax_t ret = re_syntax_options; 1284 1285 re_syntax_options = syntax; 1286 #ifdef DEBUG 1287 if (syntax & RE_DEBUG) 1288 debug = 1; 1289 else if (debug) /* was on but now is not */ 1290 debug = 0; 1291 #endif /* DEBUG */ 1292 return ret; 1293 } 1294 #ifdef _LIBC 1295 weak_alias (__re_set_syntax, re_set_syntax) 1296 #endif 1297 1298 /* This table gives an error message for each of the error codes listed 1300 in regex.h. Obviously the order here has to be same as there. 1301 POSIX doesn't require that we do anything for REG_NOERROR, 1302 but why not be nice? */ 1303 1304 static const char re_error_msgid[] = 1305 { 1306 #define REG_NOERROR_IDX 0 1307 gettext_noop ("Success") /* REG_NOERROR */ 1308 "\0" 1309 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success") 1310 gettext_noop ("No match") /* REG_NOMATCH */ 1311 "\0" 1312 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match") 1313 gettext_noop ("Invalid regular expression") /* REG_BADPAT */ 1314 "\0" 1315 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression") 1316 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */ 1317 "\0" 1318 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character") 1319 gettext_noop ("Invalid character class name") /* REG_ECTYPE */ 1320 "\0" 1321 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name") 1322 gettext_noop ("Trailing backslash") /* REG_EESCAPE */ 1323 "\0" 1324 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash") 1325 gettext_noop ("Invalid back reference") /* REG_ESUBREG */ 1326 "\0" 1327 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference") 1328 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */ 1329 "\0" 1330 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^") 1331 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */ 1332 "\0" 1333 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(") 1334 gettext_noop ("Unmatched \\{") /* REG_EBRACE */ 1335 "\0" 1336 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{") 1337 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */ 1338 "\0" 1339 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}") 1340 gettext_noop ("Invalid range end") /* REG_ERANGE */ 1341 "\0" 1342 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end") 1343 gettext_noop ("Memory exhausted") /* REG_ESPACE */ 1344 "\0" 1345 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted") 1346 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */ 1347 "\0" 1348 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression") 1349 gettext_noop ("Premature end of regular expression") /* REG_EEND */ 1350 "\0" 1351 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression") 1352 gettext_noop ("Regular expression too big") /* REG_ESIZE */ 1353 "\0" 1354 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big") 1355 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */ 1356 }; 1357 1358 static const size_t re_error_msgid_idx[] = 1359 { 1360 REG_NOERROR_IDX, 1361 REG_NOMATCH_IDX, 1362 REG_BADPAT_IDX, 1363 REG_ECOLLATE_IDX, 1364 REG_ECTYPE_IDX, 1365 REG_EESCAPE_IDX, 1366 REG_ESUBREG_IDX, 1367 REG_EBRACK_IDX, 1368 REG_EPAREN_IDX, 1369 REG_EBRACE_IDX, 1370 REG_BADBR_IDX, 1371 REG_ERANGE_IDX, 1372 REG_ESPACE_IDX, 1373 REG_BADRPT_IDX, 1374 REG_EEND_IDX, 1375 REG_ESIZE_IDX, 1376 REG_ERPAREN_IDX 1377 }; 1378 1379 /* Avoiding alloca during matching, to placate r_alloc. */ 1381 1382 /* Define MATCH_MAY_ALLOCATE unless we need to make sure that the 1383 searching and matching functions should not call alloca. On some 1384 systems, alloca is implemented in terms of malloc, and if we're 1385 using the relocating allocator routines, then malloc could cause a 1386 relocation, which might (if the strings being searched are in the 1387 ralloc heap) shift the data out from underneath the regexp 1388 routines. 1389 1390 Here's another reason to avoid allocation: Emacs 1391 processes input from X in a signal handler; processing X input may 1392 call malloc; if input arrives while a matching routine is calling 1393 malloc, then we're scrod. But Emacs can't just block input while 1394 calling matching routines; then we don't notice interrupts when 1395 they come in. So, Emacs blocks input around all regexp calls 1396 except the matching calls, which it leaves unprotected, in the 1397 faith that they will not malloc. */ 1398 1399 /* Normally, this is fine. */ 1400 #define MATCH_MAY_ALLOCATE 1401 1402 /* When using GNU C, we are not REALLY using the C alloca, no matter 1403 what config.h may say. So don't take precautions for it. */ 1404 #ifdef __GNUC__ 1405 # undef C_ALLOCA 1406 #endif 1407 1408 /* The match routines may not allocate if (1) they would do it with malloc 1409 and (2) it's not safe for them to use malloc. 1410 Note that if REL_ALLOC is defined, matching would not use malloc for the 1411 failure stack, but we would still use it for the register vectors; 1412 so REL_ALLOC should not affect this. */ 1413 #if (defined C_ALLOCA || defined REGEX_MALLOC) && defined emacs 1414 # undef MATCH_MAY_ALLOCATE 1415 #endif 1416 1417 1418 /* Failure stack declarations and macros; both re_compile_fastmap and 1420 re_match_2 use a failure stack. These have to be macros because of 1421 REGEX_ALLOCATE_STACK. */ 1422 1423 1424 /* Number of failure points for which to initially allocate space 1425 when matching. If this number is exceeded, we allocate more 1426 space, so it is not a hard limit. */ 1427 #ifndef INIT_FAILURE_ALLOC 1428 # define INIT_FAILURE_ALLOC 5 1429 #endif 1430 1431 /* Roughly the maximum number of failure points on the stack. Would be 1432 exactly that if always used MAX_FAILURE_ITEMS items each time we failed. 1433 This is a variable only so users of regex can assign to it; we never 1434 change it ourselves. */ 1435 1436 #ifdef INT_IS_16BIT 1437 1438 # if defined MATCH_MAY_ALLOCATE 1439 /* 4400 was enough to cause a crash on Alpha OSF/1, 1440 whose default stack limit is 2mb. */ 1441 long int re_max_failures = 4000; 1442 # else 1443 long int re_max_failures = 2000; 1444 # endif 1445 1446 union fail_stack_elt 1447 { 1448 US_CHAR_TYPE *pointer; 1449 long int integer; 1450 }; 1451 1452 typedef union fail_stack_elt fail_stack_elt_t; 1453 1454 typedef struct 1455 { 1456 fail_stack_elt_t *stack; 1457 unsigned long int size; 1458 unsigned long int avail; /* Offset of next open position. */ 1459 } fail_stack_type; 1460 1461 #else /* not INT_IS_16BIT */ 1462 1463 # if defined MATCH_MAY_ALLOCATE 1464 /* 4400 was enough to cause a crash on Alpha OSF/1, 1465 whose default stack limit is 2mb. */ 1466 int re_max_failures = 4000; 1467 # else 1468 int re_max_failures = 2000; 1469 # endif 1470 1471 union fail_stack_elt 1472 { 1473 US_CHAR_TYPE *pointer; 1474 int integer; 1475 }; 1476 1477 typedef union fail_stack_elt fail_stack_elt_t; 1478 1479 typedef struct 1480 { 1481 fail_stack_elt_t *stack; 1482 unsigned size; 1483 unsigned avail; /* Offset of next open position. */ 1484 } fail_stack_type; 1485 1486 #endif /* INT_IS_16BIT */ 1487 1488 #define FAIL_STACK_EMPTY() (fail_stack.avail == 0) 1489 #define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0) 1490 #define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size) 1491 1492 1493 /* Define macros to initialize and free the failure stack. 1494 Do `return -2' if the alloc fails. */ 1495 1496 #ifdef MATCH_MAY_ALLOCATE 1497 # define INIT_FAIL_STACK() \ 1498 do { \ 1499 fail_stack.stack = (fail_stack_elt_t *) \ 1500 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * sizeof (fail_stack_elt_t)); \ 1501 \ 1502 if (fail_stack.stack == NULL) \ 1503 return -2; \ 1504 \ 1505 fail_stack.size = INIT_FAILURE_ALLOC; \ 1506 fail_stack.avail = 0; \ 1507 } while (0) 1508 1509 # define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack) 1510 #else 1511 # define INIT_FAIL_STACK() \ 1512 do { \ 1513 fail_stack.avail = 0; \ 1514 } while (0) 1515 1516 # define RESET_FAIL_STACK() 1517 #endif 1518 1519 1520 /* Double the size of FAIL_STACK, up to approximately `re_max_failures' items. 1521 1522 Return 1 if succeeds, and 0 if either ran out of memory 1523 allocating space for it or it was already too large. 1524 1525 REGEX_REALLOCATE_STACK requires `destination' be declared. */ 1526 1527 #define DOUBLE_FAIL_STACK(fail_stack) \ 1528 ((fail_stack).size > (unsigned) (re_max_failures * MAX_FAILURE_ITEMS) \ 1529 ? 0 \ 1530 : ((fail_stack).stack = (fail_stack_elt_t *) \ 1531 REGEX_REALLOCATE_STACK ((fail_stack).stack, \ 1532 (fail_stack).size * sizeof (fail_stack_elt_t), \ 1533 ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \ 1534 \ 1535 (fail_stack).stack == NULL \ 1536 ? 0 \ 1537 : ((fail_stack).size <<= 1, \ 1538 1))) 1539 1540 1541 /* Push pointer POINTER on FAIL_STACK. 1542 Return 1 if was able to do so and 0 if ran out of memory allocating 1543 space to do so. */ 1544 #define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \ 1545 ((FAIL_STACK_FULL () \ 1546 && !DOUBLE_FAIL_STACK (FAIL_STACK)) \ 1547 ? 0 \ 1548 : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \ 1549 1)) 1550 1551 /* Push a pointer value onto the failure stack. 1552 Assumes the variable `fail_stack'. Probably should only 1553 be called from within `PUSH_FAILURE_POINT'. */ 1554 #define PUSH_FAILURE_POINTER(item) \ 1555 fail_stack.stack[fail_stack.avail++].pointer = (US_CHAR_TYPE *) (item) 1556 1557 /* This pushes an integer-valued item onto the failure stack. 1558 Assumes the variable `fail_stack'. Probably should only 1559 be called from within `PUSH_FAILURE_POINT'. */ 1560 #define PUSH_FAILURE_INT(item) \ 1561 fail_stack.stack[fail_stack.avail++].integer = (item) 1562 1563 /* Push a fail_stack_elt_t value onto the failure stack. 1564 Assumes the variable `fail_stack'. Probably should only 1565 be called from within `PUSH_FAILURE_POINT'. */ 1566 #define PUSH_FAILURE_ELT(item) \ 1567 fail_stack.stack[fail_stack.avail++] = (item) 1568 1569 /* These three POP... operations complement the three PUSH... operations. 1570 All assume that `fail_stack' is nonempty. */ 1571 #define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer 1572 #define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer 1573 #define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail] 1574 1575 /* Used to omit pushing failure point id's when we're not debugging. */ 1576 #ifdef DEBUG 1577 # define DEBUG_PUSH PUSH_FAILURE_INT 1578 # define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_INT () 1579 #else 1580 # define DEBUG_PUSH(item) 1581 # define DEBUG_POP(item_addr) 1582 #endif 1583 1584 1585 /* Push the information about the state we will need 1586 if we ever fail back to it. 1587 1588 Requires variables fail_stack, regstart, regend, reg_info, and 1589 num_regs_pushed be declared. DOUBLE_FAIL_STACK requires `destination' 1590 be declared. 1591 1592 Does `return FAILURE_CODE' if runs out of memory. */ 1593 1594 #define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \ 1595 do { \ 1596 char *destination; \ 1597 /* Must be int, so when we don't save any registers, the arithmetic \ 1598 of 0 + -1 isn't done as unsigned. */ \ 1599 /* Can't be int, since there is not a shred of a guarantee that int \ 1600 is wide enough to hold a value of something to which pointer can \ 1601 be assigned */ \ 1602 active_reg_t this_reg; \ 1603 \ 1604 DEBUG_STATEMENT (failure_id++); \ 1605 DEBUG_STATEMENT (nfailure_points_pushed++); \ 1606 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \ 1607 DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail);\ 1608 DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\ 1609 \ 1610 DEBUG_PRINT2 (" slots needed: %ld\n", NUM_FAILURE_ITEMS); \ 1611 DEBUG_PRINT2 (" available: %d\n", REMAINING_AVAIL_SLOTS); \ 1612 \ 1613 /* Ensure we have enough space allocated for what we will push. */ \ 1614 while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \ 1615 { \ 1616 if (!DOUBLE_FAIL_STACK (fail_stack)) \ 1617 return failure_code; \ 1618 \ 1619 DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", \ 1620 (fail_stack).size); \ 1621 DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\ 1622 } \ 1623 \ 1624 /* Push the info, starting with the registers. */ \ 1625 DEBUG_PRINT1 ("\n"); \ 1626 \ 1627 if (1) \ 1628 for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \ 1629 this_reg++) \ 1630 { \ 1631 DEBUG_PRINT2 (" Pushing reg: %lu\n", this_reg); \ 1632 DEBUG_STATEMENT (num_regs_pushed++); \ 1633 \ 1634 DEBUG_PRINT2 (" start: %p\n", regstart[this_reg]); \ 1635 PUSH_FAILURE_POINTER (regstart[this_reg]); \ 1636 \ 1637 DEBUG_PRINT2 (" end: %p\n", regend[this_reg]); \ 1638 PUSH_FAILURE_POINTER (regend[this_reg]); \ 1639 \ 1640 DEBUG_PRINT2 (" info: %p\n ", \ 1641 reg_info[this_reg].word.pointer); \ 1642 DEBUG_PRINT2 (" match_null=%d", \ 1643 REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \ 1644 DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \ 1645 DEBUG_PRINT2 (" matched_something=%d", \ 1646 MATCHED_SOMETHING (reg_info[this_reg])); \ 1647 DEBUG_PRINT2 (" ever_matched=%d", \ 1648 EVER_MATCHED_SOMETHING (reg_info[this_reg])); \ 1649 DEBUG_PRINT1 ("\n"); \ 1650 PUSH_FAILURE_ELT (reg_info[this_reg].word); \ 1651 } \ 1652 \ 1653 DEBUG_PRINT2 (" Pushing low active reg: %ld\n", lowest_active_reg);\ 1654 PUSH_FAILURE_INT (lowest_active_reg); \ 1655 \ 1656 DEBUG_PRINT2 (" Pushing high active reg: %ld\n", highest_active_reg);\ 1657 PUSH_FAILURE_INT (highest_active_reg); \ 1658 \ 1659 DEBUG_PRINT2 (" Pushing pattern %p:\n", pattern_place); \ 1660 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \ 1661 PUSH_FAILURE_POINTER (pattern_place); \ 1662 \ 1663 DEBUG_PRINT2 (" Pushing string %p: `", string_place); \ 1664 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \ 1665 size2); \ 1666 DEBUG_PRINT1 ("'\n"); \ 1667 PUSH_FAILURE_POINTER (string_place); \ 1668 \ 1669 DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id); \ 1670 DEBUG_PUSH (failure_id); \ 1671 } while (0) 1672 1673 /* This is the number of items that are pushed and popped on the stack 1674 for each register. */ 1675 #define NUM_REG_ITEMS 3 1676 1677 /* Individual items aside from the registers. */ 1678 #ifdef DEBUG 1679 # define NUM_NONREG_ITEMS 5 /* Includes failure point id. */ 1680 #else 1681 # define NUM_NONREG_ITEMS 4 1682 #endif 1683 1684 /* We push at most this many items on the stack. */ 1685 /* We used to use (num_regs - 1), which is the number of registers 1686 this regexp will save; but that was changed to 5 1687 to avoid stack overflow for a regexp with lots of parens. */ 1688 #define MAX_FAILURE_ITEMS (5 * NUM_REG_ITEMS + NUM_NONREG_ITEMS) 1689 1690 /* We actually push this many items. */ 1691 #define NUM_FAILURE_ITEMS \ 1692 (((0 \ 1693 ? 0 : highest_active_reg - lowest_active_reg + 1) \ 1694 * NUM_REG_ITEMS) \ 1695 + NUM_NONREG_ITEMS) 1696 1697 /* How many items can still be added to the stack without overflowing it. */ 1698 #define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail) 1699 1700 1701 /* Pops what PUSH_FAIL_STACK pushes. 1702 1703 We restore into the parameters, all of which should be lvalues: 1704 STR -- the saved data position. 1705 PAT -- the saved pattern position. 1706 LOW_REG, HIGH_REG -- the highest and lowest active registers. 1707 REGSTART, REGEND -- arrays of string positions. 1708 REG_INFO -- array of information about each subexpression. 1709 1710 Also assumes the variables `fail_stack' and (if debugging), `bufp', 1711 `pend', `string1', `size1', `string2', and `size2'. */ 1712 #define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info)\ 1713 { \ 1714 DEBUG_STATEMENT (unsigned failure_id;) \ 1715 active_reg_t this_reg; \ 1716 const US_CHAR_TYPE *string_temp; \ 1717 \ 1718 assert (!FAIL_STACK_EMPTY ()); \ 1719 \ 1720 /* Remove failure points and point to how many regs pushed. */ \ 1721 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \ 1722 DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \ 1723 DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \ 1724 \ 1725 assert (fail_stack.avail >= NUM_NONREG_ITEMS); \ 1726 \ 1727 DEBUG_POP (&failure_id); \ 1728 DEBUG_PRINT2 (" Popping failure id: %u\n", failure_id); \ 1729 \ 1730 /* If the saved string location is NULL, it came from an \ 1731 on_failure_keep_string_jump opcode, and we want to throw away the \ 1732 saved NULL, thus retaining our current position in the string. */ \ 1733 string_temp = POP_FAILURE_POINTER (); \ 1734 if (string_temp != NULL) \ 1735 str = (const CHAR_TYPE *) string_temp; \ 1736 \ 1737 DEBUG_PRINT2 (" Popping string %p: `", str); \ 1738 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \ 1739 DEBUG_PRINT1 ("'\n"); \ 1740 \ 1741 pat = (US_CHAR_TYPE *) POP_FAILURE_POINTER (); \ 1742 DEBUG_PRINT2 (" Popping pattern %p:\n", pat); \ 1743 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \ 1744 \ 1745 /* Restore register info. */ \ 1746 high_reg = (active_reg_t) POP_FAILURE_INT (); \ 1747 DEBUG_PRINT2 (" Popping high active reg: %ld\n", high_reg); \ 1748 \ 1749 low_reg = (active_reg_t) POP_FAILURE_INT (); \ 1750 DEBUG_PRINT2 (" Popping low active reg: %ld\n", low_reg); \ 1751 \ 1752 if (1) \ 1753 for (this_reg = high_reg; this_reg >= low_reg; this_reg--) \ 1754 { \ 1755 DEBUG_PRINT2 (" Popping reg: %ld\n", this_reg); \ 1756 \ 1757 reg_info[this_reg].word = POP_FAILURE_ELT (); \ 1758 DEBUG_PRINT2 (" info: %p\n", \ 1759 reg_info[this_reg].word.pointer); \ 1760 \ 1761 regend[this_reg] = (const CHAR_TYPE *) POP_FAILURE_POINTER (); \ 1762 DEBUG_PRINT2 (" end: %p\n", regend[this_reg]); \ 1763 \ 1764 regstart[this_reg] = (const CHAR_TYPE *) POP_FAILURE_POINTER ();\ 1765 DEBUG_PRINT2 (" start: %p\n", regstart[this_reg]); \ 1766 } \ 1767 else \ 1768 { \ 1769 for (this_reg = highest_active_reg; this_reg > high_reg; this_reg--) \ 1770 { \ 1771 reg_info[this_reg].word.integer = 0; \ 1772 regend[this_reg] = 0; \ 1773 regstart[this_reg] = 0; \ 1774 } \ 1775 highest_active_reg = high_reg; \ 1776 } \ 1777 \ 1778 set_regs_matched_done = 0; \ 1779 DEBUG_STATEMENT (nfailure_points_popped++); \ 1780 } /* POP_FAILURE_POINT */ 1781 1782 1783 /* Structure for per-register (a.k.a. per-group) information. 1785 Other register information, such as the 1786 starting and ending positions (which are addresses), and the list of 1787 inner groups (which is a bits list) are maintained in separate 1788 variables. 1789 1790 We are making a (strictly speaking) nonportable assumption here: that 1791 the compiler will pack our bit fields into something that fits into 1792 the type of `word', i.e., is something that fits into one item on the 1793 failure stack. */ 1794 1795 1796 /* Declarations and macros for re_match_2. */ 1797 1798 typedef union 1799 { 1800 fail_stack_elt_t word; 1801 struct 1802 { 1803 /* This field is one if this group can match the empty string, 1804 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */ 1805 #define MATCH_NULL_UNSET_VALUE 3 1806 unsigned match_null_string_p : 2; 1807 unsigned is_active : 1; 1808 unsigned matched_something : 1; 1809 unsigned ever_matched_something : 1; 1810 } bits; 1811 } register_info_type; 1812 1813 #define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p) 1814 #define IS_ACTIVE(R) ((R).bits.is_active) 1815 #define MATCHED_SOMETHING(R) ((R).bits.matched_something) 1816 #define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something) 1817 1818 1819 /* Call this when have matched a real character; it sets `matched' flags 1820 for the subexpressions which we are currently inside. Also records 1821 that those subexprs have matched. */ 1822 #define SET_REGS_MATCHED() \ 1823 do \ 1824 { \ 1825 if (!set_regs_matched_done) \ 1826 { \ 1827 active_reg_t r; \ 1828 set_regs_matched_done = 1; \ 1829 for (r = lowest_active_reg; r <= highest_active_reg; r++) \ 1830 { \ 1831 MATCHED_SOMETHING (reg_info[r]) \ 1832 = EVER_MATCHED_SOMETHING (reg_info[r]) \ 1833 = 1; \ 1834 } \ 1835 } \ 1836 } \ 1837 while (0) 1838 1839 /* Registers are set to a sentinel when they haven't yet matched. */ 1840 static CHAR_TYPE reg_unset_dummy; 1841 #define REG_UNSET_VALUE (®_unset_dummy) 1842 #define REG_UNSET(e) ((e) == REG_UNSET_VALUE) 1843 1844 /* Subroutine declarations and macros for regex_compile. */ 1846 1847 static reg_errcode_t regex_compile _RE_ARGS ((const char *pattern, size_t size, 1848 reg_syntax_t syntax, 1849 struct re_pattern_buffer *bufp)); 1850 static void store_op1 _RE_ARGS ((re_opcode_t op, US_CHAR_TYPE *loc, int arg)); 1851 static void store_op2 _RE_ARGS ((re_opcode_t op, US_CHAR_TYPE *loc, 1852 int arg1, int arg2)); 1853 static void insert_op1 _RE_ARGS ((re_opcode_t op, US_CHAR_TYPE *loc, 1854 int arg, US_CHAR_TYPE *end)); 1855 static void insert_op2 _RE_ARGS ((re_opcode_t op, US_CHAR_TYPE *loc, 1856 int arg1, int arg2, US_CHAR_TYPE *end)); 1857 static boolean at_begline_loc_p _RE_ARGS ((const CHAR_TYPE *pattern, 1858 const CHAR_TYPE *p, 1859 reg_syntax_t syntax)); 1860 static boolean at_endline_loc_p _RE_ARGS ((const CHAR_TYPE *p, 1861 const CHAR_TYPE *pend, 1862 reg_syntax_t syntax)); 1863 #ifdef MBS_SUPPORT 1864 static reg_errcode_t compile_range _RE_ARGS ((CHAR_TYPE range_start, 1865 const CHAR_TYPE **p_ptr, 1866 const CHAR_TYPE *pend, 1867 char *translate, 1868 reg_syntax_t syntax, 1869 US_CHAR_TYPE *b, 1870 CHAR_TYPE *char_set)); 1871 static void insert_space _RE_ARGS ((int num, CHAR_TYPE *loc, CHAR_TYPE *end)); 1872 #else 1873 static reg_errcode_t compile_range _RE_ARGS ((unsigned int range_start, 1874 const CHAR_TYPE **p_ptr, 1875 const CHAR_TYPE *pend, 1876 char *translate, 1877 reg_syntax_t syntax, 1878 US_CHAR_TYPE *b)); 1879 #endif /* MBS_SUPPORT */ 1880 1881 /* Fetch the next character in the uncompiled pattern---translating it 1882 if necessary. Also cast from a signed character in the constant 1883 string passed to us by the user to an unsigned char that we can use 1884 as an array index (in, e.g., `translate'). */ 1885 /* ifdef MBS_SUPPORT, we translate only if character <= 0xff, 1886 because it is impossible to allocate 4GB array for some encodings 1887 which have 4 byte character_set like UCS4. */ 1888 #ifndef PATFETCH 1889 # ifdef MBS_SUPPORT 1890 # define PATFETCH(c) \ 1891 do {if (p == pend) return REG_EEND; \ 1892 c = (US_CHAR_TYPE) *p++; \ 1893 if (translate && (c <= 0xff)) c = (US_CHAR_TYPE) translate[c]; \ 1894 } while (0) 1895 # else 1896 # define PATFETCH(c) \ 1897 do {if (p == pend) return REG_EEND; \ 1898 c = (unsigned char) *p++; \ 1899 if (translate) c = (unsigned char) translate[c]; \ 1900 } while (0) 1901 # endif /* MBS_SUPPORT */ 1902 #endif 1903 1904 /* Fetch the next character in the uncompiled pattern, with no 1905 translation. */ 1906 #define PATFETCH_RAW(c) \ 1907 do {if (p == pend) return REG_EEND; \ 1908 c = (US_CHAR_TYPE) *p++; \ 1909 } while (0) 1910 1911 /* Go backwards one character in the pattern. */ 1912 #define PATUNFETCH p-- 1913 1914 1915 /* If `translate' is non-null, return translate[D], else just D. We 1916 cast the subscript to translate because some data is declared as 1917 `char *', to avoid warnings when a string constant is passed. But 1918 when we use a character as a subscript we must make it unsigned. */ 1919 /* ifdef MBS_SUPPORT, we translate only if character <= 0xff, 1920 because it is impossible to allocate 4GB array for some encodings 1921 which have 4 byte character_set like UCS4. */ 1922 #ifndef TRANSLATE 1923 # ifdef MBS_SUPPORT 1924 # define TRANSLATE(d) \ 1925 ((translate && ((US_CHAR_TYPE) (d)) <= 0xff) \ 1926 ? (char) translate[(unsigned char) (d)] : (d)) 1927 #else 1928 # define TRANSLATE(d) \ 1929 (translate ? (char) translate[(unsigned char) (d)] : (d)) 1930 # endif /* MBS_SUPPORT */ 1931 #endif 1932 1933 1934 /* Macros for outputting the compiled pattern into `buffer'. */ 1935 1936 /* If the buffer isn't allocated when it comes in, use this. */ 1937 #define INIT_BUF_SIZE (32 * sizeof(US_CHAR_TYPE)) 1938 1939 /* Make sure we have at least N more bytes of space in buffer. */ 1940 #ifdef MBS_SUPPORT 1941 # define GET_BUFFER_SPACE(n) \ 1942 while (((unsigned long)b - (unsigned long)COMPILED_BUFFER_VAR \ 1943 + (n)*sizeof(CHAR_TYPE)) > bufp->allocated) \ 1944 EXTEND_BUFFER () 1945 #else 1946 # define GET_BUFFER_SPACE(n) \ 1947 while ((unsigned long) (b - bufp->buffer + (n)) > bufp->allocated) \ 1948 EXTEND_BUFFER () 1949 #endif /* MBS_SUPPORT */ 1950 1951 /* Make sure we have one more byte of buffer space and then add C to it. */ 1952 #define BUF_PUSH(c) \ 1953 do { \ 1954 GET_BUFFER_SPACE (1); \ 1955 *b++ = (US_CHAR_TYPE) (c); \ 1956 } while (0) 1957 1958 1959 /* Ensure we have two more bytes of buffer space and then append C1 and C2. */ 1960 #define BUF_PUSH_2(c1, c2) \ 1961 do { \ 1962 GET_BUFFER_SPACE (2); \ 1963 *b++ = (US_CHAR_TYPE) (c1); \ 1964 *b++ = (US_CHAR_TYPE) (c2); \ 1965 } while (0) 1966 1967 1968 /* As with BUF_PUSH_2, except for three bytes. */ 1969 #define BUF_PUSH_3(c1, c2, c3) \ 1970 do { \ 1971 GET_BUFFER_SPACE (3); \ 1972 *b++ = (US_CHAR_TYPE) (c1); \ 1973 *b++ = (US_CHAR_TYPE) (c2); \ 1974 *b++ = (US_CHAR_TYPE) (c3); \ 1975 } while (0) 1976 1977 /* Store a jump with opcode OP at LOC to location TO. We store a 1978 relative address offset by the three bytes the jump itself occupies. */ 1979 #define STORE_JUMP(op, loc, to) \ 1980 store_op1 (op, loc, (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE))) 1981 1982 /* Likewise, for a two-argument jump. */ 1983 #define STORE_JUMP2(op, loc, to, arg) \ 1984 store_op2 (op, loc, (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)), arg) 1985 1986 /* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */ 1987 #define INSERT_JUMP(op, loc, to) \ 1988 insert_op1 (op, loc, (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)), b) 1989 1990 /* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */ 1991 #define INSERT_JUMP2(op, loc, to, arg) \ 1992 insert_op2 (op, loc, (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)),\ 1993 arg, b) 1994 1995 1996 /* This is not an arbitrary limit: the arguments which represent offsets 1997 into the pattern are two bytes long. So if 2^16 bytes turns out to 1998 be too small, many things would have to change. */ 1999 /* Any other compiler which, like MSC, has allocation limit below 2^16 2000 bytes will have to use approach similar to what was done below for 2001 MSC and drop MAX_BUF_SIZE a bit. Otherwise you may end up 2002 reallocating to 0 bytes. Such thing is not going to work too well. 2003 You have been warned!! */ 2004 #if defined _MSC_VER && !defined WIN32 2005 /* Microsoft C 16-bit versions limit malloc to approx 65512 bytes. 2006 The REALLOC define eliminates a flurry of conversion warnings, 2007 but is not required. */ 2008 # define MAX_BUF_SIZE 65500L 2009 # define REALLOC(p,s) realloc ((p), (size_t) (s)) 2010 #else 2011 # define MAX_BUF_SIZE (1L << 16) 2012 # define REALLOC(p,s) realloc ((p), (s)) 2013 #endif 2014 2015 /* Extend the buffer by twice its current size via realloc and 2016 reset the pointers that pointed into the old block to point to the 2017 correct places in the new one. If extending the buffer results in it 2018 being larger than MAX_BUF_SIZE, then flag memory exhausted. */ 2019 #if __BOUNDED_POINTERS__ 2020 # define SET_HIGH_BOUND(P) (__ptrhigh (P) = __ptrlow (P) + bufp->allocated) 2021 # define MOVE_BUFFER_POINTER(P) \ 2022 (__ptrlow (P) += incr, SET_HIGH_BOUND (P), __ptrvalue (P) += incr) 2023 # define ELSE_EXTEND_BUFFER_HIGH_BOUND \ 2024 else \ 2025 { \ 2026 SET_HIGH_BOUND (b); \ 2027 SET_HIGH_BOUND (begalt); \ 2028 if (fixup_alt_jump) \ 2029 SET_HIGH_BOUND (fixup_alt_jump); \ 2030 if (laststart) \ 2031 SET_HIGH_BOUND (laststart); \ 2032 if (pending_exact) \ 2033 SET_HIGH_BOUND (pending_exact); \ 2034 } 2035 #else 2036 # define MOVE_BUFFER_POINTER(P) (P) += incr 2037 # define ELSE_EXTEND_BUFFER_HIGH_BOUND 2038 #endif 2039 2040 #ifdef MBS_SUPPORT 2041 # define EXTEND_BUFFER() \ 2042 do { \ 2043 US_CHAR_TYPE *old_buffer = COMPILED_BUFFER_VAR; \ 2044 int wchar_count; \ 2045 if (bufp->allocated + sizeof(US_CHAR_TYPE) > MAX_BUF_SIZE) \ 2046 return REG_ESIZE; \ 2047 bufp->allocated <<= 1; \ 2048 if (bufp->allocated > MAX_BUF_SIZE) \ 2049 bufp->allocated = MAX_BUF_SIZE; \ 2050 /* How many characters the new buffer can have? */ \ 2051 wchar_count = bufp->allocated / sizeof(US_CHAR_TYPE); \ 2052 if (wchar_count == 0) wchar_count = 1; \ 2053 /* Truncate the buffer to CHAR_TYPE align. */ \ 2054 bufp->allocated = wchar_count * sizeof(US_CHAR_TYPE); \ 2055 RETALLOC (COMPILED_BUFFER_VAR, wchar_count, US_CHAR_TYPE); \ 2056 bufp->buffer = (char*)COMPILED_BUFFER_VAR; \ 2057 if (COMPILED_BUFFER_VAR == NULL) \ 2058 return REG_ESPACE; \ 2059 /* If the buffer moved, move all the pointers into it. */ \ 2060 if (old_buffer != COMPILED_BUFFER_VAR) \ 2061 { \ 2062 ptrdiff_t incr = COMPILED_BUFFER_VAR - old_buffer; \ 2063 MOVE_BUFFER_POINTER (b); \ 2064 MOVE_BUFFER_POINTER (begalt); \ 2065 if (fixup_alt_jump) \ 2066 MOVE_BUFFER_POINTER (fixup_alt_jump); \ 2067 if (laststart) \ 2068 MOVE_BUFFER_POINTER (laststart); \ 2069 if (pending_exact) \ 2070 MOVE_BUFFER_POINTER (pending_exact); \ 2071 } \ 2072 ELSE_EXTEND_BUFFER_HIGH_BOUND \ 2073 } while (0) 2074 #else 2075 # define EXTEND_BUFFER() \ 2076 do { \ 2077 US_CHAR_TYPE *old_buffer = COMPILED_BUFFER_VAR; \ 2078 if (bufp->allocated == MAX_BUF_SIZE) \ 2079 return REG_ESIZE; \ 2080 bufp->allocated <<= 1; \ 2081 if (bufp->allocated > MAX_BUF_SIZE) \ 2082 bufp->allocated = MAX_BUF_SIZE; \ 2083 bufp->buffer = (US_CHAR_TYPE *) REALLOC (COMPILED_BUFFER_VAR, \ 2084 bufp->allocated); \ 2085 if (COMPILED_BUFFER_VAR == NULL) \ 2086 return REG_ESPACE; \ 2087 /* If the buffer moved, move all the pointers into it. */ \ 2088 if (old_buffer != COMPILED_BUFFER_VAR) \ 2089 { \ 2090 ptrdiff_t incr = COMPILED_BUFFER_VAR - old_buffer; \ 2091 MOVE_BUFFER_POINTER (b); \ 2092 MOVE_BUFFER_POINTER (begalt); \ 2093 if (fixup_alt_jump) \ 2094 MOVE_BUFFER_POINTER (fixup_alt_jump); \ 2095 if (laststart) \ 2096 MOVE_BUFFER_POINTER (laststart); \ 2097 if (pending_exact) \ 2098 MOVE_BUFFER_POINTER (pending_exact); \ 2099 } \ 2100 ELSE_EXTEND_BUFFER_HIGH_BOUND \ 2101 } while (0) 2102 #endif /* MBS_SUPPORT */ 2103 2104 /* Since we have one byte reserved for the register number argument to 2105 {start,stop}_memory, the maximum number of groups we can report 2106 things about is what fits in that byte. */ 2107 #define MAX_REGNUM 255 2108 2109 /* But patterns can have more than `MAX_REGNUM' registers. We just 2110 ignore the excess. */ 2111 typedef unsigned regnum_t; 2112 2113 2114 /* Macros for the compile stack. */ 2115 2116 /* Since offsets can go either forwards or backwards, this type needs to 2117 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */ 2118 /* int may be not enough when sizeof(int) == 2. */ 2119 typedef long pattern_offset_t; 2120 2121 typedef struct 2122 { 2123 pattern_offset_t begalt_offset; 2124 pattern_offset_t fixup_alt_jump; 2125 pattern_offset_t inner_group_offset; 2126 pattern_offset_t laststart_offset; 2127 regnum_t regnum; 2128 } compile_stack_elt_t; 2129 2130 2131 typedef struct 2132 { 2133 compile_stack_elt_t *stack; 2134 unsigned size; 2135 unsigned avail; /* Offset of next open position. */ 2136 } compile_stack_type; 2137 2138 2139 #define INIT_COMPILE_STACK_SIZE 32 2140 2141 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0) 2142 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size) 2143 2144 /* The next available element. */ 2145 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail]) 2146 2147 2148 /* Set the bit for character C in a list. */ 2149 #define SET_LIST_BIT(c) \ 2150 (b[((unsigned char) (c)) / BYTEWIDTH] \ 2151 |= 1 << (((unsigned char) c) % BYTEWIDTH)) 2152 2153 2154 /* Get the next unsigned number in the uncompiled pattern. */ 2155 #define GET_UNSIGNED_NUMBER(num) \ 2156 { \ 2157 while (p != pend) \ 2158 { \ 2159 PATFETCH (c); \ 2160 if (! ('0' <= c && c <= '9')) \ 2161 break; \ 2162 if (num <= RE_DUP_MAX) \ 2163 { \ 2164 if (num < 0) \ 2165 num = 0; \ 2166 num = num * 10 + c - '0'; \ 2167 } \ 2168 } \ 2169 } 2170 2171 #if defined _LIBC || WIDE_CHAR_SUPPORT 2172 /* The GNU C library provides support for user-defined character classes 2173 and the functions from ISO C amendement 1. */ 2174 # ifdef CHARCLASS_NAME_MAX 2175 # define CHAR_CLASS_MAX_LENGTH CHARCLASS_NAME_MAX 2176 # else 2177 /* This shouldn't happen but some implementation might still have this 2178 problem. Use a reasonable default value. */ 2179 # define CHAR_CLASS_MAX_LENGTH 256 2180 # endif 2181 2182 # ifdef _LIBC 2183 # define IS_CHAR_CLASS(string) __wctype (string) 2184 # else 2185 # define IS_CHAR_CLASS(string) wctype (string) 2186 # endif 2187 #else 2188 # define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */ 2189 2190 # define IS_CHAR_CLASS(string) \ 2191 (STREQ (string, "alpha") || STREQ (string, "upper") \ 2192 || STREQ (string, "lower") || STREQ (string, "digit") \ 2193 || STREQ (string, "alnum") || STREQ (string, "xdigit") \ 2194 || STREQ (string, "space") || STREQ (string, "print") \ 2195 || STREQ (string, "punct") || STREQ (string, "graph") \ 2196 || STREQ (string, "cntrl") || STREQ (string, "blank")) 2197 #endif 2198 2199 #ifndef MATCH_MAY_ALLOCATE 2201 2202 /* If we cannot allocate large objects within re_match_2_internal, 2203 we make the fail stack and register vectors global. 2204 The fail stack, we grow to the maximum size when a regexp 2205 is compiled. 2206 The register vectors, we adjust in size each time we 2207 compile a regexp, according to the number of registers it needs. */ 2208 2209 static fail_stack_type fail_stack; 2210 2211 /* Size with which the following vectors are currently allocated. 2212 That is so we can make them bigger as needed, 2213 but never make them smaller. */ 2214 static int regs_allocated_size; 2215 2216 static const char ** regstart, ** regend; 2217 static const char ** old_regstart, ** old_regend; 2218 static const char **best_regstart, **best_regend; 2219 static register_info_type *reg_info; 2220 static const char **reg_dummy; 2221 static register_info_type *reg_info_dummy; 2222 2223 /* Make the register vectors big enough for NUM_REGS registers, 2224 but don't make them smaller. */ 2225 2226 static 2227 regex_grow_registers (num_regs) 2228 int num_regs; 2229 { 2230 if (num_regs > regs_allocated_size) 2231 { 2232 RETALLOC_IF (regstart, num_regs, const char *); 2233 RETALLOC_IF (regend, num_regs, const char *); 2234 RETALLOC_IF (old_regstart, num_regs, const char *); 2235 RETALLOC_IF (old_regend, num_regs, const char *); 2236 RETALLOC_IF (best_regstart, num_regs, const char *); 2237 RETALLOC_IF (best_regend, num_regs, const char *); 2238 RETALLOC_IF (reg_info, num_regs, register_info_type); 2239 RETALLOC_IF (reg_dummy, num_regs, const char *); 2240 RETALLOC_IF (reg_info_dummy, num_regs, register_info_type); 2241 2242 regs_allocated_size = num_regs; 2243 } 2244 } 2245 2246 #endif /* not MATCH_MAY_ALLOCATE */ 2247 2248 static boolean group_in_compile_stack _RE_ARGS ((compile_stack_type 2250 compile_stack, 2251 regnum_t regnum)); 2252 2253 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX. 2254 Returns one of error codes defined in `regex.h', or zero for success. 2255 2256 Assumes the `allocated' (and perhaps `buffer') and `translate' 2257 fields are set in BUFP on entry. 2258 2259 If it succeeds, results are put in BUFP (if it returns an error, the 2260 contents of BUFP are undefined): 2261 `buffer' is the compiled pattern; 2262 `syntax' is set to SYNTAX; 2263 `used' is set to the length of the compiled pattern; 2264 `fastmap_accurate' is zero; 2265 `re_nsub' is the number of subexpressions in PATTERN; 2266 `not_bol' and `not_eol' are zero; 2267 2268 The `fastmap' and `newline_anchor' fields are neither 2269 examined nor set. */ 2270 2271 /* Return, freeing storage we allocated. */ 2272 #ifdef MBS_SUPPORT 2273 # define FREE_STACK_RETURN(value) \ 2274 return (free(pattern), free(mbs_offset), free(is_binary), free (compile_stack.stack), value) 2275 #else 2276 # define FREE_STACK_RETURN(value) \ 2277 return (free (compile_stack.stack), value) 2278 #endif /* MBS_SUPPORT */ 2279 2280 static reg_errcode_t 2281 #ifdef MBS_SUPPORT 2282 regex_compile (cpattern, csize, syntax, bufp) 2283 const char *cpattern; 2284 size_t csize; 2285 #else 2286 regex_compile (pattern, size, syntax, bufp) 2287 const char *pattern; 2288 size_t size; 2289 #endif /* MBS_SUPPORT */ 2290 reg_syntax_t syntax; 2291 struct re_pattern_buffer *bufp; 2292 { 2293 /* We fetch characters from PATTERN here. Even though PATTERN is 2294 `char *' (i.e., signed), we declare these variables as unsigned, so 2295 they can be reliably used as array indices. */ 2296 register US_CHAR_TYPE c, c1; 2297 2298 #ifdef MBS_SUPPORT 2299 /* A temporary space to keep wchar_t pattern and compiled pattern. */ 2300 CHAR_TYPE *pattern, *COMPILED_BUFFER_VAR; 2301 size_t size; 2302 /* offset buffer for optimizatoin. See convert_mbs_to_wc. */ 2303 int *mbs_offset = NULL; 2304 /* It hold whether each wchar_t is binary data or not. */ 2305 char *is_binary = NULL; 2306 /* A flag whether exactn is handling binary data or not. */ 2307 char is_exactn_bin = FALSE; 2308 #endif /* MBS_SUPPORT */ 2309 2310 /* A random temporary spot in PATTERN. */ 2311 const CHAR_TYPE *p1; 2312 2313 /* Points to the end of the buffer, where we should append. */ 2314 register US_CHAR_TYPE *b; 2315 2316 /* Keeps track of unclosed groups. */ 2317 compile_stack_type compile_stack; 2318 2319 /* Points to the current (ending) position in the pattern. */ 2320 #ifdef MBS_SUPPORT 2321 const CHAR_TYPE *p; 2322 const CHAR_TYPE *pend; 2323 #else 2324 const CHAR_TYPE *p = pattern; 2325 const CHAR_TYPE *pend = pattern + size; 2326 #endif /* MBS_SUPPORT */ 2327 2328 /* How to translate the characters in the pattern. */ 2329 RE_TRANSLATE_TYPE translate = bufp->translate; 2330 2331 /* Address of the count-byte of the most recently inserted `exactn' 2332 command. This makes it possible to tell if a new exact-match 2333 character can be added to that command or if the character requires 2334 a new `exactn' command. */ 2335 US_CHAR_TYPE *pending_exact = 0; 2336 2337 /* Address of start of the most recently finished expression. 2338 This tells, e.g., postfix * where to find the start of its 2339 operand. Reset at the beginning of groups and alternatives. */ 2340 US_CHAR_TYPE *laststart = 0; 2341 2342 /* Address of beginning of regexp, or inside of last group. */ 2343 US_CHAR_TYPE *begalt; 2344 2345 /* Address of the place where a forward jump should go to the end of 2346 the containing expression. Each alternative of an `or' -- except the 2347 last -- ends with a forward jump of this sort. */ 2348 US_CHAR_TYPE *fixup_alt_jump = 0; 2349 2350 /* Counts open-groups as they are encountered. Remembered for the 2351 matching close-group on the compile stack, so the same register 2352 number is put in the stop_memory as the start_memory. */ 2353 regnum_t regnum = 0; 2354 2355 #ifdef MBS_SUPPORT 2356 /* Initialize the wchar_t PATTERN and offset_buffer. */ 2357 p = pend = pattern = TALLOC(csize + 1, CHAR_TYPE); 2358 p[csize] = L'\0'; /* sentinel */ 2359 mbs_offset = TALLOC(csize + 1, int); 2360 is_binary = TALLOC(csize + 1, char); 2361 if (pattern == NULL || mbs_offset == NULL || is_binary == NULL) 2362 { 2363 if (pattern) free(pattern); 2364 if (mbs_offset) free(mbs_offset); 2365 if (is_binary) free(is_binary); 2366 return REG_ESPACE; 2367 } 2368 size = convert_mbs_to_wcs(pattern, cpattern, csize, mbs_offset, is_binary); 2369 pend = p + size; 2370 if (size < 0) 2371 { 2372 if (pattern) free(pattern); 2373 if (mbs_offset) free(mbs_offset); 2374 if (is_binary) free(is_binary); 2375 return REG_BADPAT; 2376 } 2377 #endif 2378 2379 #ifdef DEBUG 2380 DEBUG_PRINT1 ("\nCompiling pattern: "); 2381 if (debug) 2382 { 2383 unsigned debug_count; 2384 2385 for (debug_count = 0; debug_count < size; debug_count++) 2386 PUT_CHAR (pattern[debug_count]); 2387 putchar ('\n'); 2388 } 2389 #endif /* DEBUG */ 2390 2391 /* Initialize the compile stack. */ 2392 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t); 2393 if (compile_stack.stack == NULL) 2394 { 2395 #ifdef MBS_SUPPORT 2396 if (pattern) free(pattern); 2397 if (mbs_offset) free(mbs_offset); 2398 if (is_binary) free(is_binary); 2399 #endif 2400 return REG_ESPACE; 2401 } 2402 2403 compile_stack.size = INIT_COMPILE_STACK_SIZE; 2404 compile_stack.avail = 0; 2405 2406 /* Initialize the pattern buffer. */ 2407 bufp->syntax = syntax; 2408 bufp->fastmap_accurate = 0; 2409 bufp->not_bol = bufp->not_eol = 0; 2410 2411 /* Set `used' to zero, so that if we return an error, the pattern 2412 printer (for debugging) will think there's no pattern. We reset it 2413 at the end. */ 2414 bufp->used = 0; 2415 2416 /* Always count groups, whether or not bufp->no_sub is set. */ 2417 bufp->re_nsub = 0; 2418 2419 #if !defined emacs && !defined SYNTAX_TABLE 2420 /* Initialize the syntax table. */ 2421 init_syntax_once (); 2422 #endif 2423 2424 if (bufp->allocated == 0) 2425 { 2426 if (bufp->buffer) 2427 { /* If zero allocated, but buffer is non-null, try to realloc 2428 enough space. This loses if buffer's address is bogus, but 2429 that is the user's responsibility. */ 2430 #ifdef MBS_SUPPORT 2431 /* Free bufp->buffer and allocate an array for wchar_t pattern 2432 buffer. */ 2433 free(bufp->buffer); 2434 COMPILED_BUFFER_VAR = TALLOC (INIT_BUF_SIZE/sizeof(US_CHAR_TYPE), 2435 US_CHAR_TYPE); 2436 #else 2437 RETALLOC (COMPILED_BUFFER_VAR, INIT_BUF_SIZE, US_CHAR_TYPE); 2438 #endif /* MBS_SUPPORT */ 2439 } 2440 else 2441 { /* Caller did not allocate a buffer. Do it for them. */ 2442 COMPILED_BUFFER_VAR = TALLOC (INIT_BUF_SIZE / sizeof(US_CHAR_TYPE), 2443 US_CHAR_TYPE); 2444 } 2445 2446 if (!COMPILED_BUFFER_VAR) FREE_STACK_RETURN (REG_ESPACE); 2447 #ifdef MBS_SUPPORT 2448 bufp->buffer = (char*)COMPILED_BUFFER_VAR; 2449 #endif /* MBS_SUPPORT */ 2450 bufp->allocated = INIT_BUF_SIZE; 2451 } 2452 #ifdef MBS_SUPPORT 2453 else 2454 COMPILED_BUFFER_VAR = (US_CHAR_TYPE*) bufp->buffer; 2455 #endif 2456 2457 begalt = b = COMPILED_BUFFER_VAR; 2458 2459 /* Loop through the uncompiled pattern until we're at the end. */ 2460 while (p != pend) 2461 { 2462 PATFETCH (c); 2463 2464 switch (c) 2465 { 2466 case '^': 2467 { 2468 if ( /* If at start of pattern, it's an operator. */ 2469 p == pattern + 1 2470 /* If context independent, it's an operator. */ 2471 || syntax & RE_CONTEXT_INDEP_ANCHORS 2472 /* Otherwise, depends on what's come before. */ 2473 || at_begline_loc_p (pattern, p, syntax)) 2474 BUF_PUSH (begline); 2475 else 2476 goto normal_char; 2477 } 2478 break; 2479 2480 2481 case '$': 2482 { 2483 if ( /* If at end of pattern, it's an operator. */ 2484 p == pend 2485 /* If context independent, it's an operator. */ 2486 || syntax & RE_CONTEXT_INDEP_ANCHORS 2487 /* Otherwise, depends on what's next. */ 2488 || at_endline_loc_p (p, pend, syntax)) 2489 BUF_PUSH (endline); 2490 else 2491 goto normal_char; 2492 } 2493 break; 2494 2495 2496 case '+': 2497 case '?': 2498 if ((syntax & RE_BK_PLUS_QM) 2499 || (syntax & RE_LIMITED_OPS)) 2500 goto normal_char; 2501 handle_plus: 2502 case '*': 2503 /* If there is no previous pattern... */ 2504 if (!laststart) 2505 { 2506 if (syntax & RE_CONTEXT_INVALID_OPS) 2507 FREE_STACK_RETURN (REG_BADRPT); 2508 else if (!(syntax & RE_CONTEXT_INDEP_OPS)) 2509 goto normal_char; 2510 } 2511 2512 { 2513 /* Are we optimizing this jump? */ 2514 boolean keep_string_p = false; 2515 2516 /* 1 means zero (many) matches is allowed. */ 2517 char zero_times_ok = 0, many_times_ok = 0; 2518 2519 /* If there is a sequence of repetition chars, collapse it 2520 down to just one (the right one). We can't combine 2521 interval operators with these because of, e.g., `a{2}*', 2522 which should only match an even number of `a's. */ 2523 2524 for (;;) 2525 { 2526 zero_times_ok |= c != '+'; 2527 many_times_ok |= c != '?'; 2528 2529 if (p == pend) 2530 break; 2531 2532 PATFETCH (c); 2533 2534 if (c == '*' 2535 || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?'))) 2536 ; 2537 2538 else if (syntax & RE_BK_PLUS_QM && c == '\\') 2539 { 2540 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE); 2541 2542 PATFETCH (c1); 2543 if (!(c1 == '+' || c1 == '?')) 2544 { 2545 PATUNFETCH; 2546 PATUNFETCH; 2547 break; 2548 } 2549 2550 c = c1; 2551 } 2552 else 2553 { 2554 PATUNFETCH; 2555 break; 2556 } 2557 2558 /* If we get here, we found another repeat character. */ 2559 } 2560 2561 /* Star, etc. applied to an empty pattern is equivalent 2562 to an empty pattern. */ 2563 if (!laststart) 2564 break; 2565 2566 /* Now we know whether or not zero matches is allowed 2567 and also whether or not two or more matches is allowed. */ 2568 if (many_times_ok) 2569 { /* More than one repetition is allowed, so put in at the 2570 end a backward relative jump from `b' to before the next 2571 jump we're going to put in below (which jumps from 2572 laststart to after this jump). 2573 2574 But if we are at the `*' in the exact sequence `.*\n', 2575 insert an unconditional jump backwards to the ., 2576 instead of the beginning of the loop. This way we only 2577 push a failure point once, instead of every time 2578 through the loop. */ 2579 assert (p - 1 > pattern); 2580 2581 /* Allocate the space for the jump. */ 2582 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE); 2583 2584 /* We know we are not at the first character of the pattern, 2585 because laststart was nonzero. And we've already 2586 incremented `p', by the way, to be the character after 2587 the `*'. Do we have to do something analogous here 2588 for null bytes, because of RE_DOT_NOT_NULL? */ 2589 if (TRANSLATE (*(p - 2)) == TRANSLATE ('.') 2590 && zero_times_ok 2591 && p < pend && TRANSLATE (*p) == TRANSLATE ('\n') 2592 && !(syntax & RE_DOT_NEWLINE)) 2593 { /* We have .*\n. */ 2594 STORE_JUMP (jump, b, laststart); 2595 keep_string_p = true; 2596 } 2597 else 2598 /* Anything else. */ 2599 STORE_JUMP (maybe_pop_jump, b, laststart - 2600 (1 + OFFSET_ADDRESS_SIZE)); 2601 2602 /* We've added more stuff to the buffer. */ 2603 b += 1 + OFFSET_ADDRESS_SIZE; 2604 } 2605 2606 /* On failure, jump from laststart to b + 3, which will be the 2607 end of the buffer after this jump is inserted. */ 2608 /* ifdef MBS_SUPPORT, 'b + 1 + OFFSET_ADDRESS_SIZE' instead of 2609 'b + 3'. */ 2610 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE); 2611 INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump 2612 : on_failure_jump, 2613 laststart, b + 1 + OFFSET_ADDRESS_SIZE); 2614 pending_exact = 0; 2615 b += 1 + OFFSET_ADDRESS_SIZE; 2616 2617 if (!zero_times_ok) 2618 { 2619 /* At least one repetition is required, so insert a 2620 `dummy_failure_jump' before the initial 2621 `on_failure_jump' instruction of the loop. This 2622 effects a skip over that instruction the first time 2623 we hit that loop. */ 2624 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE); 2625 INSERT_JUMP (dummy_failure_jump, laststart, laststart + 2626 2 + 2 * OFFSET_ADDRESS_SIZE); 2627 b += 1 + OFFSET_ADDRESS_SIZE; 2628 } 2629 } 2630 break; 2631 2632 2633 case '.': 2634 laststart = b; 2635 BUF_PUSH (anychar); 2636 break; 2637 2638 2639 case '[': 2640 { 2641 boolean had_char_class = false; 2642 #ifdef MBS_SUPPORT 2643 CHAR_TYPE range_start = 0xffffffff; 2644 #else 2645 unsigned int range_start = 0xffffffff; 2646 #endif 2647 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 2648 2649 #ifdef MBS_SUPPORT 2650 /* We assume a charset(_not) structure as a wchar_t array. 2651 charset[0] = (re_opcode_t) charset(_not) 2652 charset[1] = l (= length of char_classes) 2653 charset[2] = m (= length of collating_symbols) 2654 charset[3] = n (= length of equivalence_classes) 2655 charset[4] = o (= length of char_ranges) 2656 charset[5] = p (= length of chars) 2657 2658 charset[6] = char_class (wctype_t) 2659 charset[6+CHAR_CLASS_SIZE] = char_class (wctype_t) 2660 ... 2661 charset[l+5] = char_class (wctype_t) 2662 2663 charset[l+6] = collating_symbol (wchar_t) 2664 ... 2665 charset[l+m+5] = collating_symbol (wchar_t) 2666 ifdef _LIBC we use the index if 2667 _NL_COLLATE_SYMB_EXTRAMB instead of 2668 wchar_t string. 2669 2670 charset[l+m+6] = equivalence_classes (wchar_t) 2671 ... 2672 charset[l+m+n+5] = equivalence_classes (wchar_t) 2673 ifdef _LIBC we use the index in 2674 _NL_COLLATE_WEIGHT instead of 2675 wchar_t string. 2676 2677 charset[l+m+n+6] = range_start 2678 charset[l+m+n+7] = range_end 2679 ... 2680 charset[l+m+n+2o+4] = range_start 2681 charset[l+m+n+2o+5] = range_end 2682 ifdef _LIBC we use the value looked up 2683 in _NL_COLLATE_COLLSEQ instead of 2684 wchar_t character. 2685 2686 charset[l+m+n+2o+6] = char 2687 ... 2688 charset[l+m+n+2o+p+5] = char 2689 2690 */ 2691 2692 /* We need at least 6 spaces: the opcode, the length of 2693 char_classes, the length of collating_symbols, the length of 2694 equivalence_classes, the length of char_ranges, the length of 2695 chars. */ 2696 GET_BUFFER_SPACE (6); 2697 2698 /* Save b as laststart. And We use laststart as the pointer 2699 to the first element of the charset here. 2700 In other words, laststart[i] indicates charset[i]. */ 2701 laststart = b; 2702 2703 /* We test `*p == '^' twice, instead of using an if 2704 statement, so we only need one BUF_PUSH. */ 2705 BUF_PUSH (*p == '^' ? charset_not : charset); 2706 if (*p == '^') 2707 p++; 2708 2709 /* Push the length of char_classes, the length of 2710 collating_symbols, the length of equivalence_classes, the 2711 length of char_ranges and the length of chars. */ 2712 BUF_PUSH_3 (0, 0, 0); 2713 BUF_PUSH_2 (0, 0); 2714 2715 /* Remember the first position in the bracket expression. */ 2716 p1 = p; 2717 2718 /* charset_not matches newline according to a syntax bit. */ 2719 if ((re_opcode_t) b[-6] == charset_not 2720 && (syntax & RE_HAT_LISTS_NOT_NEWLINE)) 2721 { 2722 BUF_PUSH('\n'); 2723 laststart[5]++; /* Update the length of characters */ 2724 } 2725 2726 /* Read in characters and ranges, setting map bits. */ 2727 for (;;) 2728 { 2729 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 2730 2731 PATFETCH (c); 2732 2733 /* \ might escape characters inside [...] and [^...]. */ 2734 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\') 2735 { 2736 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE); 2737 2738 PATFETCH (c1); 2739 BUF_PUSH(c1); 2740 laststart[5]++; /* Update the length of chars */ 2741 range_start = c1; 2742 continue; 2743 } 2744 2745 /* Could be the end of the bracket expression. If it's 2746 not (i.e., when the bracket expression is `[]' so 2747 far), the ']' character bit gets set way below. */ 2748 if (c == ']' && p != p1 + 1) 2749 break; 2750 2751 /* Look ahead to see if it's a range when the last thing 2752 was a character class. */ 2753 if (had_char_class && c == '-' && *p != ']') 2754 FREE_STACK_RETURN (REG_ERANGE); 2755 2756 /* Look ahead to see if it's a range when the last thing 2757 was a character: if this is a hyphen not at the 2758 beginning or the end of a list, then it's the range 2759 operator. */ 2760 if (c == '-' 2761 && !(p - 2 >= pattern && p[-2] == '[') 2762 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^') 2763 && *p != ']') 2764 { 2765 reg_errcode_t ret; 2766 /* Allocate the space for range_start and range_end. */ 2767 GET_BUFFER_SPACE (2); 2768 /* Update the pointer to indicate end of buffer. */ 2769 b += 2; 2770 ret = compile_range (range_start, &p, pend, translate, 2771 syntax, b, laststart); 2772 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); 2773 range_start = 0xffffffff; 2774 } 2775 else if (p[0] == '-' && p[1] != ']') 2776 { /* This handles ranges made up of characters only. */ 2777 reg_errcode_t ret; 2778 2779 /* Move past the `-'. */ 2780 PATFETCH (c1); 2781 /* Allocate the space for range_start and range_end. */ 2782 GET_BUFFER_SPACE (2); 2783 /* Update the pointer to indicate end of buffer. */ 2784 b += 2; 2785 ret = compile_range (c, &p, pend, translate, syntax, b, 2786 laststart); 2787 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); 2788 range_start = 0xffffffff; 2789 } 2790 2791 /* See if we're at the beginning of a possible character 2792 class. */ 2793 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':') 2794 { /* Leave room for the null. */ 2795 char str[CHAR_CLASS_MAX_LENGTH + 1]; 2796 2797 PATFETCH (c); 2798 c1 = 0; 2799 2800 /* If pattern is `[[:'. */ 2801 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 2802 2803 for (;;) 2804 { 2805 PATFETCH (c); 2806 if ((c == ':' && *p == ']') || p == pend) 2807 break; 2808 if (c1 < CHAR_CLASS_MAX_LENGTH) 2809 str[c1++] = c; 2810 else 2811 /* This is in any case an invalid class name. */ 2812 str[0] = '\0'; 2813 } 2814 str[c1] = '\0'; 2815 2816 /* If isn't a word bracketed by `[:' and `:]': 2817 undo the ending character, the letters, and leave 2818 the leading `:' and `[' (but store them as character). */ 2819 if (c == ':' && *p == ']') 2820 { 2821 wctype_t wt; 2822 uintptr_t alignedp; 2823 2824 /* Query the character class as wctype_t. */ 2825 wt = IS_CHAR_CLASS (str); 2826 if (wt == 0) 2827 FREE_STACK_RETURN (REG_ECTYPE); 2828 2829 /* Throw away the ] at the end of the character 2830 class. */ 2831 PATFETCH (c); 2832 2833 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 2834 2835 /* Allocate the space for character class. */ 2836 GET_BUFFER_SPACE(CHAR_CLASS_SIZE); 2837 /* Update the pointer to indicate end of buffer. */ 2838 b += CHAR_CLASS_SIZE; 2839 /* Move data which follow character classes 2840 not to violate the data. */ 2841 insert_space(CHAR_CLASS_SIZE, 2842 laststart + 6 + laststart[1], 2843 b - 1); 2844 alignedp = ((uintptr_t)(laststart + 6 + laststart[1]) 2845 + __alignof__(wctype_t) - 1) 2846 & ~(uintptr_t)(__alignof__(wctype_t) - 1); 2847 /* Store the character class. */ 2848 *((wctype_t*)alignedp) = wt; 2849 /* Update length of char_classes */ 2850 laststart[1] += CHAR_CLASS_SIZE; 2851 2852 had_char_class = true; 2853 } 2854 else 2855 { 2856 c1++; 2857 while (c1--) 2858 PATUNFETCH; 2859 BUF_PUSH ('['); 2860 BUF_PUSH (':'); 2861 laststart[5] += 2; /* Update the length of characters */ 2862 range_start = ':'; 2863 had_char_class = false; 2864 } 2865 } 2866 else if (syntax & RE_CHAR_CLASSES && c == '[' && (*p == '=' 2867 || *p == '.')) 2868 { 2869 CHAR_TYPE str[128]; /* Should be large enough. */ 2870 CHAR_TYPE delim = *p; /* '=' or '.' */ 2871 # ifdef _LIBC 2872 uint32_t nrules = 2873 _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 2874 # endif 2875 PATFETCH (c); 2876 c1 = 0; 2877 2878 /* If pattern is `[[=' or '[[.'. */ 2879 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 2880 2881 for (;;) 2882 { 2883 PATFETCH (c); 2884 if ((c == delim && *p == ']') || p == pend) 2885 break; 2886 if (c1 < sizeof (str) - 1) 2887 str[c1++] = c; 2888 else 2889 /* This is in any case an invalid class name. */ 2890 str[0] = '\0'; 2891 } 2892 str[c1] = '\0'; 2893 2894 if (c == delim && *p == ']' && str[0] != '\0') 2895 { 2896 unsigned int i, offset; 2897 /* If we have no collation data we use the default 2898 collation in which each character is in a class 2899 by itself. It also means that ASCII is the 2900 character set and therefore we cannot have character 2901 with more than one byte in the multibyte 2902 representation. */ 2903 2904 /* If not defined _LIBC, we push the name and 2905 `\0' for the sake of matching performance. */ 2906 int datasize = c1 + 1; 2907 2908 # ifdef _LIBC 2909 int32_t idx = 0; 2910 if (nrules == 0) 2911 # endif 2912 { 2913 if (c1 != 1) 2914 FREE_STACK_RETURN (REG_ECOLLATE); 2915 } 2916 # ifdef _LIBC 2917 else 2918 { 2919 const int32_t *table; 2920 const int32_t *weights; 2921 const int32_t *extra; 2922 const int32_t *indirect; 2923 wint_t *cp; 2924 2925 /* This #include defines a local function! */ 2926 # include <locale/weightwc.h> 2927 2928 if(delim == '=') 2929 { 2930 /* We push the index for equivalence class. */ 2931 cp = (wint_t*)str; 2932 2933 table = (const int32_t *) 2934 _NL_CURRENT (LC_COLLATE, 2935 _NL_COLLATE_TABLEWC); 2936 weights = (const int32_t *) 2937 _NL_CURRENT (LC_COLLATE, 2938 _NL_COLLATE_WEIGHTWC); 2939 extra = (const int32_t *) 2940 _NL_CURRENT (LC_COLLATE, 2941 _NL_COLLATE_EXTRAWC); 2942 indirect = (const int32_t *) 2943 _NL_CURRENT (LC_COLLATE, 2944 _NL_COLLATE_INDIRECTWC); 2945 2946 idx = findidx ((const wint_t**)&cp); 2947 if (idx == 0 || cp < (wint_t*) str + c1) 2948 /* This is no valid character. */ 2949 FREE_STACK_RETURN (REG_ECOLLATE); 2950 2951 str[0] = (wchar_t)idx; 2952 } 2953 else /* delim == '.' */ 2954 { 2955 /* We push collation sequence value 2956 for collating symbol. */ 2957 int32_t table_size; 2958 const int32_t *symb_table; 2959 const unsigned char *extra; 2960 int32_t idx; 2961 int32_t elem; 2962 int32_t second; 2963 int32_t hash; 2964 char char_str[c1]; 2965 2966 /* We have to convert the name to a single-byte 2967 string. This is possible since the names 2968 consist of ASCII characters and the internal 2969 representation is UCS4. */ 2970 for (i = 0; i < c1; ++i) 2971 char_str[i] = str[i]; 2972 2973 table_size = 2974 _NL_CURRENT_WORD (LC_COLLATE, 2975 _NL_COLLATE_SYMB_HASH_SIZEMB); 2976 symb_table = (const int32_t *) 2977 _NL_CURRENT (LC_COLLATE, 2978 _NL_COLLATE_SYMB_TABLEMB); 2979 extra = (const unsigned char *) 2980 _NL_CURRENT (LC_COLLATE, 2981 _NL_COLLATE_SYMB_EXTRAMB); 2982 2983 /* Locate the character in the hashing table. */ 2984 hash = elem_hash (char_str, c1); 2985 2986 idx = 0; 2987 elem = hash % table_size; 2988 second = hash % (table_size - 2); 2989 while (symb_table[2 * elem] != 0) 2990 { 2991 /* First compare the hashing value. */ 2992 if (symb_table[2 * elem] == hash 2993 && c1 == extra[symb_table[2 * elem + 1]] 2994 && memcmp (str, 2995 &extra[symb_table[2 * elem + 1] 2996 + 1], c1) == 0) 2997 { 2998 /* Yep, this is the entry. */ 2999 idx = symb_table[2 * elem + 1]; 3000 idx += 1 + extra[idx]; 3001 break; 3002 } 3003 3004 /* Next entry. */ 3005 elem += second; 3006 } 3007 3008 if (symb_table[2 * elem] != 0) 3009 { 3010 /* Compute the index of the byte sequence 3011 in the table. */ 3012 idx += 1 + extra[idx]; 3013 /* Adjust for the alignment. */ 3014 idx = (idx + 3) & ~4; 3015 3016 str[0] = (wchar_t) idx + 4; 3017 } 3018 else if (symb_table[2 * elem] == 0 && c1 == 1) 3019 { 3020 /* No valid character. Match it as a 3021 single byte character. */ 3022 had_char_class = false; 3023 BUF_PUSH(str[0]); 3024 /* Update the length of characters */ 3025 laststart[5]++; 3026 range_start = str[0]; 3027 3028 /* Throw away the ] at the end of the 3029 collating symbol. */ 3030 PATFETCH (c); 3031 /* exit from the switch block. */ 3032 continue; 3033 } 3034 else 3035 FREE_STACK_RETURN (REG_ECOLLATE); 3036 } 3037 datasize = 1; 3038 } 3039 # endif 3040 /* Throw away the ] at the end of the equivalence 3041 class (or collating symbol). */ 3042 PATFETCH (c); 3043 3044 /* Allocate the space for the equivalence class 3045 (or collating symbol) (and '\0' if needed). */ 3046 GET_BUFFER_SPACE(datasize); 3047 /* Update the pointer to indicate end of buffer. */ 3048 b += datasize; 3049 3050 if (delim == '=') 3051 { /* equivalence class */ 3052 /* Calculate the offset of char_ranges, 3053 which is next to equivalence_classes. */ 3054 offset = laststart[1] + laststart[2] 3055 + laststart[3] +6; 3056 /* Insert space. */ 3057 insert_space(datasize, laststart + offset, b - 1); 3058 3059 /* Write the equivalence_class and \0. */ 3060 for (i = 0 ; i < datasize ; i++) 3061 laststart[offset + i] = str[i]; 3062 3063 /* Update the length of equivalence_classes. */ 3064 laststart[3] += datasize; 3065 had_char_class = true; 3066 } 3067 else /* delim == '.' */ 3068 { /* collating symbol */ 3069 /* Calculate the offset of the equivalence_classes, 3070 which is next to collating_symbols. */ 3071 offset = laststart[1] + laststart[2] + 6; 3072 /* Insert space and write the collationg_symbol 3073 and \0. */ 3074 insert_space(datasize, laststart + offset, b-1); 3075 for (i = 0 ; i < datasize ; i++) 3076 laststart[offset + i] = str[i]; 3077 3078 /* In re_match_2_internal if range_start < -1, we 3079 assume -range_start is the offset of the 3080 collating symbol which is specified as 3081 the character of the range start. So we assign 3082 -(laststart[1] + laststart[2] + 6) to 3083 range_start. */ 3084 range_start = -(laststart[1] + laststart[2] + 6); 3085 /* Update the length of collating_symbol. */ 3086 laststart[2] += datasize; 3087 had_char_class = false; 3088 } 3089 } 3090 else 3091 { 3092 c1++; 3093 while (c1--) 3094 PATUNFETCH; 3095 BUF_PUSH ('['); 3096 BUF_PUSH (delim); 3097 laststart[5] += 2; /* Update the length of characters */ 3098 range_start = delim; 3099 had_char_class = false; 3100 } 3101 } 3102 else 3103 { 3104 had_char_class = false; 3105 BUF_PUSH(c); 3106 laststart[5]++; /* Update the length of characters */ 3107 range_start = c; 3108 } 3109 } 3110 3111 #else /* not MBS_SUPPORT */ 3112 /* Ensure that we have enough space to push a charset: the 3113 opcode, the length count, and the bitset; 34 bytes in all. */ 3114 GET_BUFFER_SPACE (34); 3115 3116 laststart = b; 3117 3118 /* We test `*p == '^' twice, instead of using an if 3119 statement, so we only need one BUF_PUSH. */ 3120 BUF_PUSH (*p == '^' ? charset_not : charset); 3121 if (*p == '^') 3122 p++; 3123 3124 /* Remember the first position in the bracket expression. */ 3125 p1 = p; 3126 3127 /* Push the number of bytes in the bitmap. */ 3128 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH); 3129 3130 /* Clear the whole map. */ 3131 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH); 3132 3133 /* charset_not matches newline according to a syntax bit. */ 3134 if ((re_opcode_t) b[-2] == charset_not 3135 && (syntax & RE_HAT_LISTS_NOT_NEWLINE)) 3136 SET_LIST_BIT ('\n'); 3137 3138 /* Read in characters and ranges, setting map bits. */ 3139 for (;;) 3140 { 3141 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 3142 3143 PATFETCH (c); 3144 3145 /* \ might escape characters inside [...] and [^...]. */ 3146 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\') 3147 { 3148 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE); 3149 3150 PATFETCH (c1); 3151 SET_LIST_BIT (c1); 3152 range_start = c1; 3153 continue; 3154 } 3155 3156 /* Could be the end of the bracket expression. If it's 3157 not (i.e., when the bracket expression is `[]' so 3158 far), the ']' character bit gets set way below. */ 3159 if (c == ']' && p != p1 + 1) 3160 break; 3161 3162 /* Look ahead to see if it's a range when the last thing 3163 was a character class. */ 3164 if (had_char_class && c == '-' && *p != ']') 3165 FREE_STACK_RETURN (REG_ERANGE); 3166 3167 /* Look ahead to see if it's a range when the last thing 3168 was a character: if this is a hyphen not at the 3169 beginning or the end of a list, then it's the range 3170 operator. */ 3171 if (c == '-' 3172 && !(p - 2 >= pattern && p[-2] == '[') 3173 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^') 3174 && *p != ']') 3175 { 3176 reg_errcode_t ret 3177 = compile_range (range_start, &p, pend, translate, 3178 syntax, b); 3179 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); 3180 range_start = 0xffffffff; 3181 } 3182 3183 else if (p[0] == '-' && p[1] != ']') 3184 { /* This handles ranges made up of characters only. */ 3185 reg_errcode_t ret; 3186 3187 /* Move past the `-'. */ 3188 PATFETCH (c1); 3189 3190 ret = compile_range (c, &p, pend, translate, syntax, b); 3191 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); 3192 range_start = 0xffffffff; 3193 } 3194 3195 /* See if we're at the beginning of a possible character 3196 class. */ 3197 3198 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':') 3199 { /* Leave room for the null. */ 3200 char str[CHAR_CLASS_MAX_LENGTH + 1]; 3201 3202 PATFETCH (c); 3203 c1 = 0; 3204 3205 /* If pattern is `[[:'. */ 3206 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 3207 3208 for (;;) 3209 { 3210 PATFETCH (c); 3211 if ((c == ':' && *p == ']') || p == pend) 3212 break; 3213 if (c1 < CHAR_CLASS_MAX_LENGTH) 3214 str[c1++] = c; 3215 else 3216 /* This is in any case an invalid class name. */ 3217 str[0] = '\0'; 3218 } 3219 str[c1] = '\0'; 3220 3221 /* If isn't a word bracketed by `[:' and `:]': 3222 undo the ending character, the letters, and leave 3223 the leading `:' and `[' (but set bits for them). */ 3224 if (c == ':' && *p == ']') 3225 { 3226 # if defined _LIBC || WIDE_CHAR_SUPPORT 3227 boolean is_lower = STREQ (str, "lower"); 3228 boolean is_upper = STREQ (str, "upper"); 3229 wctype_t wt; 3230 int ch; 3231 3232 wt = IS_CHAR_CLASS (str); 3233 if (wt == 0) 3234 FREE_STACK_RETURN (REG_ECTYPE); 3235 3236 /* Throw away the ] at the end of the character 3237 class. */ 3238 PATFETCH (c); 3239 3240 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 3241 3242 for (ch = 0; ch < 1 << BYTEWIDTH; ++ch) 3243 { 3244 # ifdef _LIBC 3245 if (__iswctype (__btowc (ch), wt)) 3246 SET_LIST_BIT (ch); 3247 # else 3248 if (iswctype (btowc (ch), wt)) 3249 SET_LIST_BIT (ch); 3250 # endif 3251 3252 if (translate && (is_upper || is_lower) 3253 && (ISUPPER (ch) || ISLOWER (ch))) 3254 SET_LIST_BIT (ch); 3255 } 3256 3257 had_char_class = true; 3258 # else 3259 int ch; 3260 boolean is_alnum = STREQ (str, "alnum"); 3261 boolean is_alpha = STREQ (str, "alpha"); 3262 boolean is_blank = STREQ (str, "blank"); 3263 boolean is_cntrl = STREQ (str, "cntrl"); 3264 boolean is_digit = STREQ (str, "digit"); 3265 boolean is_graph = STREQ (str, "graph"); 3266 boolean is_lower = STREQ (str, "lower"); 3267 boolean is_print = STREQ (str, "print"); 3268 boolean is_punct = STREQ (str, "punct"); 3269 boolean is_space = STREQ (str, "space"); 3270 boolean is_upper = STREQ (str, "upper"); 3271 boolean is_xdigit = STREQ (str, "xdigit"); 3272 3273 if (!IS_CHAR_CLASS (str)) 3274 FREE_STACK_RETURN (REG_ECTYPE); 3275 3276 /* Throw away the ] at the end of the character 3277 class. */ 3278 PATFETCH (c); 3279 3280 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 3281 3282 for (ch = 0; ch < 1 << BYTEWIDTH; ch++) 3283 { 3284 /* This was split into 3 if's to 3285 avoid an arbitrary limit in some compiler. */ 3286 if ( (is_alnum && ISALNUM (ch)) 3287 || (is_alpha && ISALPHA (ch)) 3288 || (is_blank && ISBLANK (ch)) 3289 || (is_cntrl && ISCNTRL (ch))) 3290 SET_LIST_BIT (ch); 3291 if ( (is_digit && ISDIGIT (ch)) 3292 || (is_graph && ISGRAPH (ch)) 3293 || (is_lower && ISLOWER (ch)) 3294 || (is_print && ISPRINT (ch))) 3295 SET_LIST_BIT (ch); 3296 if ( (is_punct && ISPUNCT (ch)) 3297 || (is_space && ISSPACE (ch)) 3298 || (is_upper && ISUPPER (ch)) 3299 || (is_xdigit && ISXDIGIT (ch))) 3300 SET_LIST_BIT (ch); 3301 if ( translate && (is_upper || is_lower) 3302 && (ISUPPER (ch) || ISLOWER (ch))) 3303 SET_LIST_BIT (ch); 3304 } 3305 had_char_class = true; 3306 # endif /* libc || wctype.h */ 3307 } 3308 else 3309 { 3310 c1++; 3311 while (c1--) 3312 PATUNFETCH; 3313 SET_LIST_BIT ('['); 3314 SET_LIST_BIT (':'); 3315 range_start = ':'; 3316 had_char_class = false; 3317 } 3318 } 3319 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == '=') 3320 { 3321 unsigned char str[MB_LEN_MAX + 1]; 3322 # ifdef _LIBC 3323 uint32_t nrules = 3324 _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 3325 # endif 3326 3327 PATFETCH (c); 3328 c1 = 0; 3329 3330 /* If pattern is `[[='. */ 3331 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 3332 3333 for (;;) 3334 { 3335 PATFETCH (c); 3336 if ((c == '=' && *p == ']') || p == pend) 3337 break; 3338 if (c1 < MB_LEN_MAX) 3339 str[c1++] = c; 3340 else 3341 /* This is in any case an invalid class name. */ 3342 str[0] = '\0'; 3343 } 3344 str[c1] = '\0'; 3345 3346 if (c == '=' && *p == ']' && str[0] != '\0') 3347 { 3348 /* If we have no collation data we use the default 3349 collation in which each character is in a class 3350 by itself. It also means that ASCII is the 3351 character set and therefore we cannot have character 3352 with more than one byte in the multibyte 3353 representation. */ 3354 # ifdef _LIBC 3355 if (nrules == 0) 3356 # endif 3357 { 3358 if (c1 != 1) 3359 FREE_STACK_RETURN (REG_ECOLLATE); 3360 3361 /* Throw away the ] at the end of the equivalence 3362 class. */ 3363 PATFETCH (c); 3364 3365 /* Set the bit for the character. */ 3366 SET_LIST_BIT (str[0]); 3367 } 3368 # ifdef _LIBC 3369 else 3370 { 3371 /* Try to match the byte sequence in `str' against 3372 those known to the collate implementation. 3373 First find out whether the bytes in `str' are 3374 actually from exactly one character. */ 3375 const int32_t *table; 3376 const unsigned char *weights; 3377 const unsigned char *extra; 3378 const int32_t *indirect; 3379 int32_t idx; 3380 const unsigned char *cp = str; 3381 int ch; 3382 3383 /* This #include defines a local function! */ 3384 # include <locale/weight.h> 3385 3386 table = (const int32_t *) 3387 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); 3388 weights = (const unsigned char *) 3389 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB); 3390 extra = (const unsigned char *) 3391 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB); 3392 indirect = (const int32_t *) 3393 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB); 3394 3395 idx = findidx (&cp); 3396 if (idx == 0 || cp < str + c1) 3397 /* This is no valid character. */ 3398 FREE_STACK_RETURN (REG_ECOLLATE); 3399 3400 /* Throw away the ] at the end of the equivalence 3401 class. */ 3402 PATFETCH (c); 3403 3404 /* Now we have to go throught the whole table 3405 and find all characters which have the same 3406 first level weight. 3407 3408 XXX Note that this is not entirely correct. 3409 we would have to match multibyte sequences 3410 but this is not possible with the current 3411 implementation. */ 3412 for (ch = 1; ch < 256; ++ch) 3413 /* XXX This test would have to be changed if we 3414 would allow matching multibyte sequences. */ 3415 if (table[ch] > 0) 3416 { 3417 int32_t idx2 = table[ch]; 3418 size_t len = weights[idx2]; 3419 3420 /* Test whether the lenghts match. */ 3421 if (weights[idx] == len) 3422 { 3423 /* They do. New compare the bytes of 3424 the weight. */ 3425 size_t cnt = 0; 3426 3427 while (cnt < len 3428 && (weights[idx + 1 + cnt] 3429 == weights[idx2 + 1 + cnt])) 3430 ++cnt; 3431 3432 if (cnt == len) 3433 /* They match. Mark the character as 3434 acceptable. */ 3435 SET_LIST_BIT (ch); 3436 } 3437 } 3438 } 3439 # endif 3440 had_char_class = true; 3441 } 3442 else 3443 { 3444 c1++; 3445 while (c1--) 3446 PATUNFETCH; 3447 SET_LIST_BIT ('['); 3448 SET_LIST_BIT ('='); 3449 range_start = '='; 3450 had_char_class = false; 3451 } 3452 } 3453 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == '.') 3454 { 3455 unsigned char str[128]; /* Should be large enough. */ 3456 # ifdef _LIBC 3457 uint32_t nrules = 3458 _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 3459 # endif 3460 3461 PATFETCH (c); 3462 c1 = 0; 3463 3464 /* If pattern is `[[.'. */ 3465 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 3466 3467 for (;;) 3468 { 3469 PATFETCH (c); 3470 if ((c == '.' && *p == ']') || p == pend) 3471 break; 3472 if (c1 < sizeof (str)) 3473 str[c1++] = c; 3474 else 3475 /* This is in any case an invalid class name. */ 3476 str[0] = '\0'; 3477 } 3478 str[c1] = '\0'; 3479 3480 if (c == '.' && *p == ']' && str[0] != '\0') 3481 { 3482 /* If we have no collation data we use the default 3483 collation in which each character is the name 3484 for its own class which contains only the one 3485 character. It also means that ASCII is the 3486 character set and therefore we cannot have character 3487 with more than one byte in the multibyte 3488 representation. */ 3489 # ifdef _LIBC 3490 if (nrules == 0) 3491 # endif 3492 { 3493 if (c1 != 1) 3494 FREE_STACK_RETURN (REG_ECOLLATE); 3495 3496 /* Throw away the ] at the end of the equivalence 3497 class. */ 3498 PATFETCH (c); 3499 3500 /* Set the bit for the character. */ 3501 SET_LIST_BIT (str[0]); 3502 range_start = ((const unsigned char *) str)[0]; 3503 } 3504 # ifdef _LIBC 3505 else 3506 { 3507 /* Try to match the byte sequence in `str' against 3508 those known to the collate implementation. 3509 First find out whether the bytes in `str' are 3510 actually from exactly one character. */ 3511 int32_t table_size; 3512 const int32_t *symb_table; 3513 const unsigned char *extra; 3514 int32_t idx; 3515 int32_t elem; 3516 int32_t second; 3517 int32_t hash; 3518 3519 table_size = 3520 _NL_CURRENT_WORD (LC_COLLATE, 3521 _NL_COLLATE_SYMB_HASH_SIZEMB); 3522 symb_table = (const int32_t *) 3523 _NL_CURRENT (LC_COLLATE, 3524 _NL_COLLATE_SYMB_TABLEMB); 3525 extra = (const unsigned char *) 3526 _NL_CURRENT (LC_COLLATE, 3527 _NL_COLLATE_SYMB_EXTRAMB); 3528 3529 /* Locate the character in the hashing table. */ 3530 hash = elem_hash (str, c1); 3531 3532 idx = 0; 3533 elem = hash % table_size; 3534 second = hash % (table_size - 2); 3535 while (symb_table[2 * elem] != 0) 3536 { 3537 /* First compare the hashing value. */ 3538 if (symb_table[2 * elem] == hash 3539 && c1 == extra[symb_table[2 * elem + 1]] 3540 && memcmp (str, 3541 &extra[symb_table[2 * elem + 1] 3542 + 1], 3543 c1) == 0) 3544 { 3545 /* Yep, this is the entry. */ 3546 idx = symb_table[2 * elem + 1]; 3547 idx += 1 + extra[idx]; 3548 break; 3549 } 3550 3551 /* Next entry. */ 3552 elem += second; 3553 } 3554 3555 if (symb_table[2 * elem] == 0) 3556 /* This is no valid character. */ 3557 FREE_STACK_RETURN (REG_ECOLLATE); 3558 3559 /* Throw away the ] at the end of the equivalence 3560 class. */ 3561 PATFETCH (c); 3562 3563 /* Now add the multibyte character(s) we found 3564 to the accept list. 3565 3566 XXX Note that this is not entirely correct. 3567 we would have to match multibyte sequences 3568 but this is not possible with the current 3569 implementation. Also, we have to match 3570 collating symbols, which expand to more than 3571 one file, as a whole and not allow the 3572 individual bytes. */ 3573 c1 = extra[idx++]; 3574 if (c1 == 1) 3575 range_start = extra[idx]; 3576 while (c1-- > 0) 3577 { 3578 SET_LIST_BIT (extra[idx]); 3579 ++idx; 3580 } 3581 } 3582 # endif 3583 had_char_class = false; 3584 } 3585 else 3586 { 3587 c1++; 3588 while (c1--) 3589 PATUNFETCH; 3590 SET_LIST_BIT ('['); 3591 SET_LIST_BIT ('.'); 3592 range_start = '.'; 3593 had_char_class = false; 3594 } 3595 } 3596 else 3597 { 3598 had_char_class = false; 3599 SET_LIST_BIT (c); 3600 range_start = c; 3601 } 3602 } 3603 3604 /* Discard any (non)matching list bytes that are all 0 at the 3605 end of the map. Decrease the map-length byte too. */ 3606 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0) 3607 b[-1]--; 3608 b += b[-1]; 3609 #endif /* MBS_SUPPORT */ 3610 } 3611 break; 3612 3613 3614 case '(': 3615 if (syntax & RE_NO_BK_PARENS) 3616 goto handle_open; 3617 else 3618 goto normal_char; 3619 3620 3621 case ')': 3622 if (syntax & RE_NO_BK_PARENS) 3623 goto handle_close; 3624 else 3625 goto normal_char; 3626 3627 3628 case '\n': 3629 if (syntax & RE_NEWLINE_ALT) 3630 goto handle_alt; 3631 else 3632 goto normal_char; 3633 3634 3635 case '|': 3636 if (syntax & RE_NO_BK_VBAR) 3637 goto handle_alt; 3638 else 3639 goto normal_char; 3640 3641 3642 case '{': 3643 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES) 3644 goto handle_interval; 3645 else 3646 goto normal_char; 3647 3648 3649 case '\\': 3650 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE); 3651 3652 /* Do not translate the character after the \, so that we can 3653 distinguish, e.g., \B from \b, even if we normally would 3654 translate, e.g., B to b. */ 3655 PATFETCH_RAW (c); 3656 3657 switch (c) 3658 { 3659 case '(': 3660 if (syntax & RE_NO_BK_PARENS) 3661 goto normal_backslash; 3662 3663 handle_open: 3664 bufp->re_nsub++; 3665 regnum++; 3666 3667 if (COMPILE_STACK_FULL) 3668 { 3669 RETALLOC (compile_stack.stack, compile_stack.size << 1, 3670 compile_stack_elt_t); 3671 if (compile_stack.stack == NULL) return REG_ESPACE; 3672 3673 compile_stack.size <<= 1; 3674 } 3675 3676 /* These are the values to restore when we hit end of this 3677 group. They are all relative offsets, so that if the 3678 whole pattern moves because of realloc, they will still 3679 be valid. */ 3680 COMPILE_STACK_TOP.begalt_offset = begalt - COMPILED_BUFFER_VAR; 3681 COMPILE_STACK_TOP.fixup_alt_jump 3682 = fixup_alt_jump ? fixup_alt_jump - COMPILED_BUFFER_VAR + 1 : 0; 3683 COMPILE_STACK_TOP.laststart_offset = b - COMPILED_BUFFER_VAR; 3684 COMPILE_STACK_TOP.regnum = regnum; 3685 3686 /* We will eventually replace the 0 with the number of 3687 groups inner to this one. But do not push a 3688 start_memory for groups beyond the last one we can 3689 represent in the compiled pattern. */ 3690 if (regnum <= MAX_REGNUM) 3691 { 3692 COMPILE_STACK_TOP.inner_group_offset = b 3693 - COMPILED_BUFFER_VAR + 2; 3694 BUF_PUSH_3 (start_memory, regnum, 0); 3695 } 3696 3697 compile_stack.avail++; 3698 3699 fixup_alt_jump = 0; 3700 laststart = 0; 3701 begalt = b; 3702 /* If we've reached MAX_REGNUM groups, then this open 3703 won't actually generate any code, so we'll have to 3704 clear pending_exact explicitly. */ 3705 pending_exact = 0; 3706 break; 3707 3708 3709 case ')': 3710 if (syntax & RE_NO_BK_PARENS) goto normal_backslash; 3711 3712 if (COMPILE_STACK_EMPTY) 3713 { 3714 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD) 3715 goto normal_backslash; 3716 else 3717 FREE_STACK_RETURN (REG_ERPAREN); 3718 } 3719 3720 handle_close: 3721 if (fixup_alt_jump) 3722 { /* Push a dummy failure point at the end of the 3723 alternative for a possible future 3724 `pop_failure_jump' to pop. See comments at 3725 `push_dummy_failure' in `re_match_2'. */ 3726 BUF_PUSH (push_dummy_failure); 3727 3728 /* We allocated space for this jump when we assigned 3729 to `fixup_alt_jump', in the `handle_alt' case below. */ 3730 STORE_JUMP (jump_past_alt, fixup_alt_jump, b - 1); 3731 } 3732 3733 /* See similar code for backslashed left paren above. */ 3734 if (COMPILE_STACK_EMPTY) 3735 { 3736 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD) 3737 goto normal_char; 3738 else 3739 FREE_STACK_RETURN (REG_ERPAREN); 3740 } 3741 3742 /* Since we just checked for an empty stack above, this 3743 ``can't happen''. */ 3744 assert (compile_stack.avail != 0); 3745 { 3746 /* We don't just want to restore into `regnum', because 3747 later groups should continue to be numbered higher, 3748 as in `(ab)c(de)' -- the second group is #2. */ 3749 regnum_t this_group_regnum; 3750 3751 compile_stack.avail--; 3752 begalt = COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.begalt_offset; 3753 fixup_alt_jump 3754 = COMPILE_STACK_TOP.fixup_alt_jump 3755 ? COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.fixup_alt_jump - 1 3756 : 0; 3757 laststart = COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.laststart_offset; 3758 this_group_regnum = COMPILE_STACK_TOP.regnum; 3759 /* If we've reached MAX_REGNUM groups, then this open 3760 won't actually generate any code, so we'll have to 3761 clear pending_exact explicitly. */ 3762 pending_exact = 0; 3763 3764 /* We're at the end of the group, so now we know how many 3765 groups were inside this one. */ 3766 if (this_group_regnum <= MAX_REGNUM) 3767 { 3768 US_CHAR_TYPE *inner_group_loc 3769 = COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.inner_group_offset; 3770 3771 *inner_group_loc = regnum - this_group_regnum; 3772 BUF_PUSH_3 (stop_memory, this_group_regnum, 3773 regnum - this_group_regnum); 3774 } 3775 } 3776 break; 3777 3778 3779 case '|': /* `\|'. */ 3780 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR) 3781 goto normal_backslash; 3782 handle_alt: 3783 if (syntax & RE_LIMITED_OPS) 3784 goto normal_char; 3785 3786 /* Insert before the previous alternative a jump which 3787 jumps to this alternative if the former fails. */ 3788 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE); 3789 INSERT_JUMP (on_failure_jump, begalt, 3790 b + 2 + 2 * OFFSET_ADDRESS_SIZE); 3791 pending_exact = 0; 3792 b += 1 + OFFSET_ADDRESS_SIZE; 3793 3794 /* The alternative before this one has a jump after it 3795 which gets executed if it gets matched. Adjust that 3796 jump so it will jump to this alternative's analogous 3797 jump (put in below, which in turn will jump to the next 3798 (if any) alternative's such jump, etc.). The last such 3799 jump jumps to the correct final destination. A picture: 3800 _____ _____ 3801 | | | | 3802 | v | v 3803 a | b | c 3804 3805 If we are at `b', then fixup_alt_jump right now points to a 3806 three-byte space after `a'. We'll put in the jump, set 3807 fixup_alt_jump to right after `b', and leave behind three 3808 bytes which we'll fill in when we get to after `c'. */ 3809 3810 if (fixup_alt_jump) 3811 STORE_JUMP (jump_past_alt, fixup_alt_jump, b); 3812 3813 /* Mark and leave space for a jump after this alternative, 3814 to be filled in later either by next alternative or 3815 when know we're at the end of a series of alternatives. */ 3816 fixup_alt_jump = b; 3817 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE); 3818 b += 1 + OFFSET_ADDRESS_SIZE; 3819 3820 laststart = 0; 3821 begalt = b; 3822 break; 3823 3824 3825 case '{': 3826 /* If \{ is a literal. */ 3827 if (!(syntax & RE_INTERVALS) 3828 /* If we're at `\{' and it's not the open-interval 3829 operator. */ 3830 || (syntax & RE_NO_BK_BRACES)) 3831 goto normal_backslash; 3832 3833 handle_interval: 3834 { 3835 /* If got here, then the syntax allows intervals. */ 3836 3837 /* At least (most) this many matches must be made. */ 3838 int lower_bound = -1, upper_bound = -1; 3839 3840 /* Place in the uncompiled pattern (i.e., just after 3841 the '{') to go back to if the interval is invalid. */ 3842 const CHAR_TYPE *beg_interval = p; 3843 3844 if (p == pend) 3845 goto invalid_interval; 3846 3847 GET_UNSIGNED_NUMBER (lower_bound); 3848 3849 if (c == ',') 3850 { 3851 GET_UNSIGNED_NUMBER (upper_bound); 3852 if (upper_bound < 0) 3853 upper_bound = RE_DUP_MAX; 3854 } 3855 else 3856 /* Interval such as `{1}' => match exactly once. */ 3857 upper_bound = lower_bound; 3858 3859 if (! (0 <= lower_bound && lower_bound <= upper_bound)) 3860 goto invalid_interval; 3861 3862 if (!(syntax & RE_NO_BK_BRACES)) 3863 { 3864 if (c != '\\' || p == pend) 3865 goto invalid_interval; 3866 PATFETCH (c); 3867 } 3868 3869 if (c != '}') 3870 goto invalid_interval; 3871 3872 /* If it's invalid to have no preceding re. */ 3873 if (!laststart) 3874 { 3875 if (syntax & RE_CONTEXT_INVALID_OPS 3876 && !(syntax & RE_INVALID_INTERVAL_ORD)) 3877 FREE_STACK_RETURN (REG_BADRPT); 3878 else if (syntax & RE_CONTEXT_INDEP_OPS) 3879 laststart = b; 3880 else 3881 goto unfetch_interval; 3882 } 3883 3884 /* We just parsed a valid interval. */ 3885 3886 if (RE_DUP_MAX < upper_bound) 3887 FREE_STACK_RETURN (REG_BADBR); 3888 3889 /* If the upper bound is zero, don't want to succeed at 3890 all; jump from `laststart' to `b + 3', which will be 3891 the end of the buffer after we insert the jump. */ 3892 /* ifdef MBS_SUPPORT, 'b + 1 + OFFSET_ADDRESS_SIZE' 3893 instead of 'b + 3'. */ 3894 if (upper_bound == 0) 3895 { 3896 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE); 3897 INSERT_JUMP (jump, laststart, b + 1 3898 + OFFSET_ADDRESS_SIZE); 3899 b += 1 + OFFSET_ADDRESS_SIZE; 3900 } 3901 3902 /* Otherwise, we have a nontrivial interval. When 3903 we're all done, the pattern will look like: 3904 set_number_at <jump count> <upper bound> 3905 set_number_at <succeed_n count> <lower bound> 3906 succeed_n <after jump addr> <succeed_n count> 3907 <body of loop> 3908 jump_n <succeed_n addr> <jump count> 3909 (The upper bound and `jump_n' are omitted if 3910 `upper_bound' is 1, though.) */ 3911 else 3912 { /* If the upper bound is > 1, we need to insert 3913 more at the end of the loop. */ 3914 unsigned nbytes = 2 + 4 * OFFSET_ADDRESS_SIZE + 3915 (upper_bound > 1) * (2 + 4 * OFFSET_ADDRESS_SIZE); 3916 3917 GET_BUFFER_SPACE (nbytes); 3918 3919 /* Initialize lower bound of the `succeed_n', even 3920 though it will be set during matching by its 3921 attendant `set_number_at' (inserted next), 3922 because `re_compile_fastmap' needs to know. 3923 Jump to the `jump_n' we might insert below. */ 3924 INSERT_JUMP2 (succeed_n, laststart, 3925 b + 1 + 2 * OFFSET_ADDRESS_SIZE 3926 + (upper_bound > 1) * (1 + 2 * OFFSET_ADDRESS_SIZE) 3927 , lower_bound); 3928 b += 1 + 2 * OFFSET_ADDRESS_SIZE; 3929 3930 /* Code to initialize the lower bound. Insert 3931 before the `succeed_n'. The `5' is the last two 3932 bytes of this `set_number_at', plus 3 bytes of 3933 the following `succeed_n'. */ 3934 /* ifdef MBS_SUPPORT, The '1+2*OFFSET_ADDRESS_SIZE' 3935 is the 'set_number_at', plus '1+OFFSET_ADDRESS_SIZE' 3936 of the following `succeed_n'. */ 3937 insert_op2 (set_number_at, laststart, 1 3938 + 2 * OFFSET_ADDRESS_SIZE, lower_bound, b); 3939 b += 1 + 2 * OFFSET_ADDRESS_SIZE; 3940 3941 if (upper_bound > 1) 3942 { /* More than one repetition is allowed, so 3943 append a backward jump to the `succeed_n' 3944 that starts this interval. 3945 3946 When we've reached this during matching, 3947 we'll have matched the interval once, so 3948 jump back only `upper_bound - 1' times. */ 3949 STORE_JUMP2 (jump_n, b, laststart 3950 + 2 * OFFSET_ADDRESS_SIZE + 1, 3951 upper_bound - 1); 3952 b += 1 + 2 * OFFSET_ADDRESS_SIZE; 3953 3954 /* The location we want to set is the second 3955 parameter of the `jump_n'; that is `b-2' as 3956 an absolute address. `laststart' will be 3957 the `set_number_at' we're about to insert; 3958 `laststart+3' the number to set, the source 3959 for the relative address. But we are 3960 inserting into the middle of the pattern -- 3961 so everything is getting moved up by 5. 3962 Conclusion: (b - 2) - (laststart + 3) + 5, 3963 i.e., b - laststart. 3964 3965 We insert this at the beginning of the loop 3966 so that if we fail during matching, we'll 3967 reinitialize the bounds. */ 3968 insert_op2 (set_number_at, laststart, b - laststart, 3969 upper_bound - 1, b); 3970 b += 1 + 2 * OFFSET_ADDRESS_SIZE; 3971 } 3972 } 3973 pending_exact = 0; 3974 break; 3975 3976 invalid_interval: 3977 if (!(syntax & RE_INVALID_INTERVAL_ORD)) 3978 FREE_STACK_RETURN (p == pend ? REG_EBRACE : REG_BADBR); 3979 unfetch_interval: 3980 /* Match the characters as literals. */ 3981 p = beg_interval; 3982 c = '{'; 3983 if (syntax & RE_NO_BK_BRACES) 3984 goto normal_char; 3985 else 3986 goto normal_backslash; 3987 } 3988 3989 #ifdef emacs 3990 /* There is no way to specify the before_dot and after_dot 3991 operators. rms says this is ok. --karl */ 3992 case '=': 3993 BUF_PUSH (at_dot); 3994 break; 3995 3996 case 's': 3997 laststart = b; 3998 PATFETCH (c); 3999 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]); 4000 break; 4001 4002 case 'S': 4003 laststart = b; 4004 PATFETCH (c); 4005 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]); 4006 break; 4007 #endif /* emacs */ 4008 4009 4010 case 'w': 4011 if (syntax & RE_NO_GNU_OPS) 4012 goto normal_char; 4013 laststart = b; 4014 BUF_PUSH (wordchar); 4015 break; 4016 4017 4018 case 'W': 4019 if (syntax & RE_NO_GNU_OPS) 4020 goto normal_char; 4021 laststart = b; 4022 BUF_PUSH (notwordchar); 4023 break; 4024 4025 4026 case '<': 4027 if (syntax & RE_NO_GNU_OPS) 4028 goto normal_char; 4029 BUF_PUSH (wordbeg); 4030 break; 4031 4032 case '>': 4033 if (syntax & RE_NO_GNU_OPS) 4034 goto normal_char; 4035 BUF_PUSH (wordend); 4036 break; 4037 4038 case 'b': 4039 if (syntax & RE_NO_GNU_OPS) 4040 goto normal_char; 4041 BUF_PUSH (wordbound); 4042 break; 4043 4044 case 'B': 4045 if (syntax & RE_NO_GNU_OPS) 4046 goto normal_char; 4047 BUF_PUSH (notwordbound); 4048 break; 4049 4050 case '`': 4051 if (syntax & RE_NO_GNU_OPS) 4052 goto normal_char; 4053 BUF_PUSH (begbuf); 4054 break; 4055 4056 case '\'': 4057 if (syntax & RE_NO_GNU_OPS) 4058 goto normal_char; 4059 BUF_PUSH (endbuf); 4060 break; 4061 4062 case '1': case '2': case '3': case '4': case '5': 4063 case '6': case '7': case '8': case '9': 4064 if (syntax & RE_NO_BK_REFS) 4065 goto normal_char; 4066 4067 c1 = c - '0'; 4068 4069 if (c1 > regnum) 4070 FREE_STACK_RETURN (REG_ESUBREG); 4071 4072 /* Can't back reference to a subexpression if inside of it. */ 4073 if (group_in_compile_stack (compile_stack, (regnum_t) c1)) 4074 goto normal_char; 4075 4076 laststart = b; 4077 BUF_PUSH_2 (duplicate, c1); 4078 break; 4079 4080 4081 case '+': 4082 case '?': 4083 if (syntax & RE_BK_PLUS_QM) 4084 goto handle_plus; 4085 else 4086 goto normal_backslash; 4087 4088 default: 4089 normal_backslash: 4090 /* You might think it would be useful for \ to mean 4091 not to translate; but if we don't translate it 4092 it will never match anything. */ 4093 c = TRANSLATE (c); 4094 goto normal_char; 4095 } 4096 break; 4097 4098 4099 default: 4100 /* Expects the character in `c'. */ 4101 normal_char: 4102 /* If no exactn currently being built. */ 4103 if (!pending_exact 4104 #ifdef MBS_SUPPORT 4105 /* If last exactn handle binary(or character) and 4106 new exactn handle character(or binary). */ 4107 || is_exactn_bin != is_binary[p - 1 - pattern] 4108 #endif /* MBS_SUPPORT */ 4109 4110 /* If last exactn not at current position. */ 4111 || pending_exact + *pending_exact + 1 != b 4112 4113 /* We have only one byte following the exactn for the count. */ 4114 || *pending_exact == (1 << BYTEWIDTH) - 1 4115 4116 /* If followed by a repetition operator. */ 4117 || *p == '*' || *p == '^' 4118 || ((syntax & RE_BK_PLUS_QM) 4119 ? *p == '\\' && (p[1] == '+' || p[1] == '?') 4120 : (*p == '+' || *p == '?')) 4121 || ((syntax & RE_INTERVALS) 4122 && ((syntax & RE_NO_BK_BRACES) 4123 ? *p == '{' 4124 : (p[0] == '\\' && p[1] == '{')))) 4125 { 4126 /* Start building a new exactn. */ 4127 4128 laststart = b; 4129 4130 #ifdef MBS_SUPPORT 4131 /* Is this exactn binary data or character? */ 4132 is_exactn_bin = is_binary[p - 1 - pattern]; 4133 if (is_exactn_bin) 4134 BUF_PUSH_2 (exactn_bin, 0); 4135 else 4136 BUF_PUSH_2 (exactn, 0); 4137 #else 4138 BUF_PUSH_2 (exactn, 0); 4139 #endif /* MBS_SUPPORT */ 4140 pending_exact = b - 1; 4141 } 4142 4143 BUF_PUSH (c); 4144 (*pending_exact)++; 4145 break; 4146 } /* switch (c) */ 4147 } /* while p != pend */ 4148 4149 4150 /* Through the pattern now. */ 4151 4152 if (fixup_alt_jump) 4153 STORE_JUMP (jump_past_alt, fixup_alt_jump, b); 4154 4155 if (!COMPILE_STACK_EMPTY) 4156 FREE_STACK_RETURN (REG_EPAREN); 4157 4158 /* If we don't want backtracking, force success 4159 the first time we reach the end of the compiled pattern. */ 4160 if (syntax & RE_NO_POSIX_BACKTRACKING) 4161 BUF_PUSH (succeed); 4162 4163 #ifdef MBS_SUPPORT 4164 free (pattern); 4165 free (mbs_offset); 4166 free (is_binary); 4167 #endif 4168 free (compile_stack.stack); 4169 4170 /* We have succeeded; set the length of the buffer. */ 4171 #ifdef MBS_SUPPORT 4172 bufp->used = (uintptr_t) b - (uintptr_t) COMPILED_BUFFER_VAR; 4173 #else 4174 bufp->used = b - bufp->buffer; 4175 #endif 4176 4177 #ifdef DEBUG 4178 if (debug) 4179 { 4180 DEBUG_PRINT1 ("\nCompiled pattern: \n"); 4181 print_compiled_pattern (bufp); 4182 } 4183 #endif /* DEBUG */ 4184 4185 #ifndef MATCH_MAY_ALLOCATE 4186 /* Initialize the failure stack to the largest possible stack. This 4187 isn't necessary unless we're trying to avoid calling alloca in 4188 the search and match routines. */ 4189 { 4190 int num_regs = bufp->re_nsub + 1; 4191 4192 /* Since DOUBLE_FAIL_STACK refuses to double only if the current size 4193 is strictly greater than re_max_failures, the largest possible stack 4194 is 2 * re_max_failures failure points. */ 4195 if (fail_stack.size < (2 * re_max_failures * MAX_FAILURE_ITEMS)) 4196 { 4197 fail_stack.size = (2 * re_max_failures * MAX_FAILURE_ITEMS); 4198 4199 # ifdef emacs 4200 if (! fail_stack.stack) 4201 fail_stack.stack 4202 = (fail_stack_elt_t *) xmalloc (fail_stack.size 4203 * sizeof (fail_stack_elt_t)); 4204 else 4205 fail_stack.stack 4206 = (fail_stack_elt_t *) xrealloc (fail_stack.stack, 4207 (fail_stack.size 4208 * sizeof (fail_stack_elt_t))); 4209 # else /* not emacs */ 4210 if (! fail_stack.stack) 4211 fail_stack.stack 4212 = (fail_stack_elt_t *) malloc (fail_stack.size 4213 * sizeof (fail_stack_elt_t)); 4214 else 4215 fail_stack.stack 4216 = (fail_stack_elt_t *) realloc (fail_stack.stack, 4217 (fail_stack.size 4218 * sizeof (fail_stack_elt_t))); 4219 # endif /* not emacs */ 4220 } 4221 4222 regex_grow_registers (num_regs); 4223 } 4224 #endif /* not MATCH_MAY_ALLOCATE */ 4225 4226 return REG_NOERROR; 4227 } /* regex_compile */ 4228 4229 /* Subroutines for `regex_compile'. */ 4231 4232 /* Store OP at LOC followed by two-byte integer parameter ARG. */ 4233 /* ifdef MBS_SUPPORT, integer parameter is 1 wchar_t. */ 4234 4235 static void 4236 store_op1 (op, loc, arg) 4237 re_opcode_t op; 4238 US_CHAR_TYPE *loc; 4239 int arg; 4240 { 4241 *loc = (US_CHAR_TYPE) op; 4242 STORE_NUMBER (loc + 1, arg); 4243 } 4244 4245 4246 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */ 4247 /* ifdef MBS_SUPPORT, integer parameter is 1 wchar_t. */ 4248 4249 static void 4250 store_op2 (op, loc, arg1, arg2) 4251 re_opcode_t op; 4252 US_CHAR_TYPE *loc; 4253 int arg1, arg2; 4254 { 4255 *loc = (US_CHAR_TYPE) op; 4256 STORE_NUMBER (loc + 1, arg1); 4257 STORE_NUMBER (loc + 1 + OFFSET_ADDRESS_SIZE, arg2); 4258 } 4259 4260 4261 /* Copy the bytes from LOC to END to open up three bytes of space at LOC 4262 for OP followed by two-byte integer parameter ARG. */ 4263 /* ifdef MBS_SUPPORT, integer parameter is 1 wchar_t. */ 4264 4265 static void 4266 insert_op1 (op, loc, arg, end) 4267 re_opcode_t op; 4268 US_CHAR_TYPE *loc; 4269 int arg; 4270 US_CHAR_TYPE *end; 4271 { 4272 register US_CHAR_TYPE *pfrom = end; 4273 register US_CHAR_TYPE *pto = end + 1 + OFFSET_ADDRESS_SIZE; 4274 4275 while (pfrom != loc) 4276 *--pto = *--pfrom; 4277 4278 store_op1 (op, loc, arg); 4279 } 4280 4281 4282 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */ 4283 /* ifdef MBS_SUPPORT, integer parameter is 1 wchar_t. */ 4284 4285 static void 4286 insert_op2 (op, loc, arg1, arg2, end) 4287 re_opcode_t op; 4288 US_CHAR_TYPE *loc; 4289 int arg1, arg2; 4290 US_CHAR_TYPE *end; 4291 { 4292 register US_CHAR_TYPE *pfrom = end; 4293 register US_CHAR_TYPE *pto = end + 1 + 2 * OFFSET_ADDRESS_SIZE; 4294 4295 while (pfrom != loc) 4296 *--pto = *--pfrom; 4297 4298 store_op2 (op, loc, arg1, arg2); 4299 } 4300 4301 4302 /* P points to just after a ^ in PATTERN. Return true if that ^ comes 4303 after an alternative or a begin-subexpression. We assume there is at 4304 least one character before the ^. */ 4305 4306 static boolean 4307 at_begline_loc_p (pattern, p, syntax) 4308 const CHAR_TYPE *pattern, *p; 4309 reg_syntax_t syntax; 4310 { 4311 const CHAR_TYPE *prev = p - 2; 4312 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\'; 4313 4314 return 4315 /* After a subexpression? */ 4316 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash)) 4317 /* After an alternative? */ 4318 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash)); 4319 } 4320 4321 4322 /* The dual of at_begline_loc_p. This one is for $. We assume there is 4323 at least one character after the $, i.e., `P < PEND'. */ 4324 4325 static boolean 4326 at_endline_loc_p (p, pend, syntax) 4327 const CHAR_TYPE *p, *pend; 4328 reg_syntax_t syntax; 4329 { 4330 const CHAR_TYPE *next = p; 4331 boolean next_backslash = *next == '\\'; 4332 const CHAR_TYPE *next_next = p + 1 < pend ? p + 1 : 0; 4333 4334 return 4335 /* Before a subexpression? */ 4336 (syntax & RE_NO_BK_PARENS ? *next == ')' 4337 : next_backslash && next_next && *next_next == ')') 4338 /* Before an alternative? */ 4339 || (syntax & RE_NO_BK_VBAR ? *next == '|' 4340 : next_backslash && next_next && *next_next == '|'); 4341 } 4342 4343 4344 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and 4345 false if it's not. */ 4346 4347 static boolean 4348 group_in_compile_stack (compile_stack, regnum) 4349 compile_stack_type compile_stack; 4350 regnum_t regnum; 4351 { 4352 int this_element; 4353 4354 for (this_element = compile_stack.avail - 1; 4355 this_element >= 0; 4356 this_element--) 4357 if (compile_stack.stack[this_element].regnum == regnum) 4358 return true; 4359 4360 return false; 4361 } 4362 4363 #ifdef MBS_SUPPORT 4364 /* This insert space, which size is "num", into the pattern at "loc". 4365 "end" must point the end of the allocated buffer. */ 4366 static void 4367 insert_space (num, loc, end) 4368 int num; 4369 CHAR_TYPE *loc; 4370 CHAR_TYPE *end; 4371 { 4372 register CHAR_TYPE *pto = end; 4373 register CHAR_TYPE *pfrom = end - num; 4374 4375 while (pfrom >= loc) 4376 *pto-- = *pfrom--; 4377 } 4378 #endif /* MBS_SUPPORT */ 4379 4380 #ifdef MBS_SUPPORT 4381 static reg_errcode_t 4382 compile_range (range_start_char, p_ptr, pend, translate, syntax, b, 4383 char_set) 4384 CHAR_TYPE range_start_char; 4385 const CHAR_TYPE **p_ptr, *pend; 4386 CHAR_TYPE *char_set, *b; 4387 RE_TRANSLATE_TYPE translate; 4388 reg_syntax_t syntax; 4389 { 4390 const CHAR_TYPE *p = *p_ptr; 4391 CHAR_TYPE range_start, range_end; 4392 reg_errcode_t ret; 4393 # ifdef _LIBC 4394 uint32_t nrules; 4395 uint32_t start_val, end_val; 4396 # endif 4397 if (p == pend) 4398 return REG_ERANGE; 4399 4400 # ifdef _LIBC 4401 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 4402 if (nrules != 0) 4403 { 4404 const char *collseq = (const char *) _NL_CURRENT(LC_COLLATE, 4405 _NL_COLLATE_COLLSEQWC); 4406 const unsigned char *extra = (const unsigned char *) 4407 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB); 4408 4409 if (range_start_char < -1) 4410 { 4411 /* range_start is a collating symbol. */ 4412 int32_t *wextra; 4413 /* Retreive the index and get collation sequence value. */ 4414 wextra = (int32_t*)(extra + char_set[-range_start_char]); 4415 start_val = wextra[1 + *wextra]; 4416 } 4417 else 4418 start_val = collseq_table_lookup(collseq, TRANSLATE(range_start_char)); 4419 4420 end_val = collseq_table_lookup (collseq, TRANSLATE (p[0])); 4421 4422 /* Report an error if the range is empty and the syntax prohibits 4423 this. */ 4424 ret = ((syntax & RE_NO_EMPTY_RANGES) 4425 && (start_val > end_val))? REG_ERANGE : REG_NOERROR; 4426 4427 /* Insert space to the end of the char_ranges. */ 4428 insert_space(2, b - char_set[5] - 2, b - 1); 4429 *(b - char_set[5] - 2) = (wchar_t)start_val; 4430 *(b - char_set[5] - 1) = (wchar_t)end_val; 4431 char_set[4]++; /* ranges_index */ 4432 } 4433 else 4434 # endif 4435 { 4436 range_start = (range_start_char >= 0)? TRANSLATE (range_start_char): 4437 range_start_char; 4438 range_end = TRANSLATE (p[0]); 4439 /* Report an error if the range is empty and the syntax prohibits 4440 this. */ 4441 ret = ((syntax & RE_NO_EMPTY_RANGES) 4442 && (range_start > range_end))? REG_ERANGE : REG_NOERROR; 4443 4444 /* Insert space to the end of the char_ranges. */ 4445 insert_space(2, b - char_set[5] - 2, b - 1); 4446 *(b - char_set[5] - 2) = range_start; 4447 *(b - char_set[5] - 1) = range_end; 4448 char_set[4]++; /* ranges_index */ 4449 } 4450 /* Have to increment the pointer into the pattern string, so the 4451 caller isn't still at the ending character. */ 4452 (*p_ptr)++; 4453 4454 return ret; 4455 } 4456 #else 4457 /* Read the ending character of a range (in a bracket expression) from the 4458 uncompiled pattern *P_PTR (which ends at PEND). We assume the 4459 starting character is in `P[-2]'. (`P[-1]' is the character `-'.) 4460 Then we set the translation of all bits between the starting and 4461 ending characters (inclusive) in the compiled pattern B. 4462 4463 Return an error code. 4464 4465 We use these short variable names so we can use the same macros as 4466 `regex_compile' itself. */ 4467 4468 static reg_errcode_t 4469 compile_range (range_start_char, p_ptr, pend, translate, syntax, b) 4470 unsigned int range_start_char; 4471 const char **p_ptr, *pend; 4472 RE_TRANSLATE_TYPE translate; 4473 reg_syntax_t syntax; 4474 unsigned char *b; 4475 { 4476 unsigned this_char; 4477 const char *p = *p_ptr; 4478 reg_errcode_t ret; 4479 # if _LIBC 4480 const unsigned char *collseq; 4481 unsigned int start_colseq; 4482 unsigned int end_colseq; 4483 # else 4484 unsigned end_char; 4485 # endif 4486 4487 if (p == pend) 4488 return REG_ERANGE; 4489 4490 /* Have to increment the pointer into the pattern string, so the 4491 caller isn't still at the ending character. */ 4492 (*p_ptr)++; 4493 4494 /* Report an error if the range is empty and the syntax prohibits this. */ 4495 ret = syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR; 4496 4497 # if _LIBC 4498 collseq = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 4499 _NL_COLLATE_COLLSEQMB); 4500 4501 start_colseq = collseq[(unsigned char) TRANSLATE (range_start_char)]; 4502 end_colseq = collseq[(unsigned char) TRANSLATE (p[0])]; 4503 for (this_char = 0; this_char <= (unsigned char) -1; ++this_char) 4504 { 4505 unsigned int this_colseq = collseq[(unsigned char) TRANSLATE (this_char)]; 4506 4507 if (start_colseq <= this_colseq && this_colseq <= end_colseq) 4508 { 4509 SET_LIST_BIT (TRANSLATE (this_char)); 4510 ret = REG_NOERROR; 4511 } 4512 } 4513 # else 4514 /* Here we see why `this_char' has to be larger than an `unsigned 4515 char' -- we would otherwise go into an infinite loop, since all 4516 characters <= 0xff. */ 4517 range_start_char = TRANSLATE (range_start_char); 4518 /* TRANSLATE(p[0]) is casted to char (not unsigned char) in TRANSLATE, 4519 and some compilers cast it to int implicitly, so following for_loop 4520 may fall to (almost) infinite loop. 4521 e.g. If translate[p[0]] = 0xff, end_char may equals to 0xffffffff. 4522 To avoid this, we cast p[0] to unsigned int and truncate it. */ 4523 end_char = ((unsigned)TRANSLATE(p[0]) & ((1 << BYTEWIDTH) - 1)); 4524 4525 for (this_char = range_start_char; this_char <= end_char; ++this_char) 4526 { 4527 SET_LIST_BIT (TRANSLATE (this_char)); 4528 ret = REG_NOERROR; 4529 } 4530 # endif 4531 4532 return ret; 4533 } 4534 #endif /* MBS_SUPPORT */ 4535 4536 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in 4538 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible 4539 characters can start a string that matches the pattern. This fastmap 4540 is used by re_search to skip quickly over impossible starting points. 4541 4542 The caller must supply the address of a (1 << BYTEWIDTH)-byte data 4543 area as BUFP->fastmap. 4544 4545 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in 4546 the pattern buffer. 4547 4548 Returns 0 if we succeed, -2 if an internal error. */ 4549 4550 #ifdef MBS_SUPPORT 4551 /* local function for re_compile_fastmap. 4552 truncate wchar_t character to char. */ 4553 static unsigned char truncate_wchar (CHAR_TYPE c); 4554 4555 static unsigned char 4556 truncate_wchar (c) 4557 CHAR_TYPE c; 4558 { 4559 unsigned char buf[MB_LEN_MAX]; 4560 int retval = wctomb(buf, c); 4561 return retval > 0 ? buf[0] : (unsigned char)c; 4562 } 4563 #endif /* MBS_SUPPORT */ 4564 4565 int 4566 re_compile_fastmap (bufp) 4567 struct re_pattern_buffer *bufp; 4568 { 4569 int j, k; 4570 #ifdef MATCH_MAY_ALLOCATE 4571 fail_stack_type fail_stack; 4572 #endif 4573 #ifndef REGEX_MALLOC 4574 char *destination; 4575 #endif 4576 4577 register char *fastmap = bufp->fastmap; 4578 4579 #ifdef MBS_SUPPORT 4580 /* We need to cast pattern to (wchar_t*), because we casted this compiled 4581 pattern to (char*) in regex_compile. */ 4582 US_CHAR_TYPE *pattern = (US_CHAR_TYPE*)bufp->buffer; 4583 register US_CHAR_TYPE *pend = (US_CHAR_TYPE*) (bufp->buffer + bufp->used); 4584 #else 4585 US_CHAR_TYPE *pattern = bufp->buffer; 4586 register US_CHAR_TYPE *pend = pattern + bufp->used; 4587 #endif /* MBS_SUPPORT */ 4588 US_CHAR_TYPE *p = pattern; 4589 4590 #ifdef REL_ALLOC 4591 /* This holds the pointer to the failure stack, when 4592 it is allocated relocatably. */ 4593 fail_stack_elt_t *failure_stack_ptr; 4594 #endif 4595 4596 /* Assume that each path through the pattern can be null until 4597 proven otherwise. We set this false at the bottom of switch 4598 statement, to which we get only if a particular path doesn't 4599 match the empty string. */ 4600 boolean path_can_be_null = true; 4601 4602 /* We aren't doing a `succeed_n' to begin with. */ 4603 boolean succeed_n_p = false; 4604 4605 assert (fastmap != NULL && p != NULL); 4606 4607 INIT_FAIL_STACK (); 4608 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */ 4609 bufp->fastmap_accurate = 1; /* It will be when we're done. */ 4610 bufp->can_be_null = 0; 4611 4612 while (1) 4613 { 4614 if (p == pend || *p == succeed) 4615 { 4616 /* We have reached the (effective) end of pattern. */ 4617 if (!FAIL_STACK_EMPTY ()) 4618 { 4619 bufp->can_be_null |= path_can_be_null; 4620 4621 /* Reset for next path. */ 4622 path_can_be_null = true; 4623 4624 p = fail_stack.stack[--fail_stack.avail].pointer; 4625 4626 continue; 4627 } 4628 else 4629 break; 4630 } 4631 4632 /* We should never be about to go beyond the end of the pattern. */ 4633 assert (p < pend); 4634 4635 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++)) 4636 { 4637 4638 /* I guess the idea here is to simply not bother with a fastmap 4639 if a backreference is used, since it's too hard to figure out 4640 the fastmap for the corresponding group. Setting 4641 `can_be_null' stops `re_search_2' from using the fastmap, so 4642 that is all we do. */ 4643 case duplicate: 4644 bufp->can_be_null = 1; 4645 goto done; 4646 4647 4648 /* Following are the cases which match a character. These end 4649 with `break'. */ 4650 4651 #ifdef MBS_SUPPORT 4652 case exactn: 4653 fastmap[truncate_wchar(p[1])] = 1; 4654 break; 4655 case exactn_bin: 4656 fastmap[p[1]] = 1; 4657 break; 4658 #else 4659 case exactn: 4660 fastmap[p[1]] = 1; 4661 break; 4662 #endif /* MBS_SUPPORT */ 4663 4664 4665 #ifdef MBS_SUPPORT 4666 /* It is hard to distinguish fastmap from (multi byte) characters 4667 which depends on current locale. */ 4668 case charset: 4669 case charset_not: 4670 case wordchar: 4671 case notwordchar: 4672 bufp->can_be_null = 1; 4673 goto done; 4674 #else 4675 case charset: 4676 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) 4677 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) 4678 fastmap[j] = 1; 4679 break; 4680 4681 4682 case charset_not: 4683 /* Chars beyond end of map must be allowed. */ 4684 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++) 4685 fastmap[j] = 1; 4686 4687 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) 4688 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))) 4689 fastmap[j] = 1; 4690 break; 4691 4692 4693 case wordchar: 4694 for (j = 0; j < (1 << BYTEWIDTH); j++) 4695 if (SYNTAX (j) == Sword) 4696 fastmap[j] = 1; 4697 break; 4698 4699 4700 case notwordchar: 4701 for (j = 0; j < (1 << BYTEWIDTH); j++) 4702 if (SYNTAX (j) != Sword) 4703 fastmap[j] = 1; 4704 break; 4705 #endif 4706 4707 case anychar: 4708 { 4709 int fastmap_newline = fastmap['\n']; 4710 4711 /* `.' matches anything ... */ 4712 for (j = 0; j < (1 << BYTEWIDTH); j++) 4713 fastmap[j] = 1; 4714 4715 /* ... except perhaps newline. */ 4716 if (!(bufp->syntax & RE_DOT_NEWLINE)) 4717 fastmap['\n'] = fastmap_newline; 4718 4719 /* Return if we have already set `can_be_null'; if we have, 4720 then the fastmap is irrelevant. Something's wrong here. */ 4721 else if (bufp->can_be_null) 4722 goto done; 4723 4724 /* Otherwise, have to check alternative paths. */ 4725 break; 4726 } 4727 4728 #ifdef emacs 4729 case syntaxspec: 4730 k = *p++; 4731 for (j = 0; j < (1 << BYTEWIDTH); j++) 4732 if (SYNTAX (j) == (enum syntaxcode) k) 4733 fastmap[j] = 1; 4734 break; 4735 4736 4737 case notsyntaxspec: 4738 k = *p++; 4739 for (j = 0; j < (1 << BYTEWIDTH); j++) 4740 if (SYNTAX (j) != (enum syntaxcode) k) 4741 fastmap[j] = 1; 4742 break; 4743 4744 4745 /* All cases after this match the empty string. These end with 4746 `continue'. */ 4747 4748 4749 case before_dot: 4750 case at_dot: 4751 case after_dot: 4752 continue; 4753 #endif /* emacs */ 4754 4755 4756 case no_op: 4757 case begline: 4758 case endline: 4759 case begbuf: 4760 case endbuf: 4761 case wordbound: 4762 case notwordbound: 4763 case wordbeg: 4764 case wordend: 4765 case push_dummy_failure: 4766 continue; 4767 4768 4769 case jump_n: 4770 case pop_failure_jump: 4771 case maybe_pop_jump: 4772 case jump: 4773 case jump_past_alt: 4774 case dummy_failure_jump: 4775 EXTRACT_NUMBER_AND_INCR (j, p); 4776 p += j; 4777 if (j > 0) 4778 continue; 4779 4780 /* Jump backward implies we just went through the body of a 4781 loop and matched nothing. Opcode jumped to should be 4782 `on_failure_jump' or `succeed_n'. Just treat it like an 4783 ordinary jump. For a * loop, it has pushed its failure 4784 point already; if so, discard that as redundant. */ 4785 if ((re_opcode_t) *p != on_failure_jump 4786 && (re_opcode_t) *p != succeed_n) 4787 continue; 4788 4789 p++; 4790 EXTRACT_NUMBER_AND_INCR (j, p); 4791 p += j; 4792 4793 /* If what's on the stack is where we are now, pop it. */ 4794 if (!FAIL_STACK_EMPTY () 4795 && fail_stack.stack[fail_stack.avail - 1].pointer == p) 4796 fail_stack.avail--; 4797 4798 continue; 4799 4800 4801 case on_failure_jump: 4802 case on_failure_keep_string_jump: 4803 handle_on_failure_jump: 4804 EXTRACT_NUMBER_AND_INCR (j, p); 4805 4806 /* For some patterns, e.g., `(a?)?', `p+j' here points to the 4807 end of the pattern. We don't want to push such a point, 4808 since when we restore it above, entering the switch will 4809 increment `p' past the end of the pattern. We don't need 4810 to push such a point since we obviously won't find any more 4811 fastmap entries beyond `pend'. Such a pattern can match 4812 the null string, though. */ 4813 if (p + j < pend) 4814 { 4815 if (!PUSH_PATTERN_OP (p + j, fail_stack)) 4816 { 4817 RESET_FAIL_STACK (); 4818 return -2; 4819 } 4820 } 4821 else 4822 bufp->can_be_null = 1; 4823 4824 if (succeed_n_p) 4825 { 4826 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */ 4827 succeed_n_p = false; 4828 } 4829 4830 continue; 4831 4832 4833 case succeed_n: 4834 /* Get to the number of times to succeed. */ 4835 p += OFFSET_ADDRESS_SIZE; 4836 4837 /* Increment p past the n for when k != 0. */ 4838 EXTRACT_NUMBER_AND_INCR (k, p); 4839 if (k == 0) 4840 { 4841 p -= 2 * OFFSET_ADDRESS_SIZE; 4842 succeed_n_p = true; /* Spaghetti code alert. */ 4843 goto handle_on_failure_jump; 4844 } 4845 continue; 4846 4847 4848 case set_number_at: 4849 p += 2 * OFFSET_ADDRESS_SIZE; 4850 continue; 4851 4852 4853 case start_memory: 4854 case stop_memory: 4855 p += 2; 4856 continue; 4857 4858 4859 default: 4860 abort (); /* We have listed all the cases. */ 4861 } /* switch *p++ */ 4862 4863 /* Getting here means we have found the possible starting 4864 characters for one path of the pattern -- and that the empty 4865 string does not match. We need not follow this path further. 4866 Instead, look at the next alternative (remembered on the 4867 stack), or quit if no more. The test at the top of the loop 4868 does these things. */ 4869 path_can_be_null = false; 4870 p = pend; 4871 } /* while p */ 4872 4873 /* Set `can_be_null' for the last path (also the first path, if the 4874 pattern is empty). */ 4875 bufp->can_be_null |= path_can_be_null; 4876 4877 done: 4878 RESET_FAIL_STACK (); 4879 return 0; 4880 } /* re_compile_fastmap */ 4881 #ifdef _LIBC 4882 weak_alias (__re_compile_fastmap, re_compile_fastmap) 4883 #endif 4884 4885 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and 4887 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use 4888 this memory for recording register information. STARTS and ENDS 4889 must be allocated using the malloc library routine, and must each 4890 be at least NUM_REGS * sizeof (regoff_t) bytes long. 4891 4892 If NUM_REGS == 0, then subsequent matches should allocate their own 4893 register data. 4894 4895 Unless this function is called, the first search or match using 4896 PATTERN_BUFFER will allocate its own register data, without 4897 freeing the old data. */ 4898 4899 void 4900 re_set_registers (bufp, regs, num_regs, starts, ends) 4901 struct re_pattern_buffer *bufp; 4902 struct re_registers *regs; 4903 unsigned num_regs; 4904 regoff_t *starts, *ends; 4905 { 4906 if (num_regs) 4907 { 4908 bufp->regs_allocated = REGS_REALLOCATE; 4909 regs->num_regs = num_regs; 4910 regs->start = starts; 4911 regs->end = ends; 4912 } 4913 else 4914 { 4915 bufp->regs_allocated = REGS_UNALLOCATED; 4916 regs->num_regs = 0; 4917 regs->start = regs->end = (regoff_t *) 0; 4918 } 4919 } 4920 #ifdef _LIBC 4921 weak_alias (__re_set_registers, re_set_registers) 4922 #endif 4923 4924 /* Searching routines. */ 4926 4927 /* Like re_search_2, below, but only one string is specified, and 4928 doesn't let you say where to stop matching. */ 4929 4930 int 4931 re_search (bufp, string, size, startpos, range, regs) 4932 struct re_pattern_buffer *bufp; 4933 const char *string; 4934 int size, startpos, range; 4935 struct re_registers *regs; 4936 { 4937 return re_search_2 (bufp, NULL, 0, string, size, startpos, range, 4938 regs, size); 4939 } 4940 #ifdef _LIBC 4941 weak_alias (__re_search, re_search) 4942 #endif 4943 4944 4945 /* Using the compiled pattern in BUFP->buffer, first tries to match the 4946 virtual concatenation of STRING1 and STRING2, starting first at index 4947 STARTPOS, then at STARTPOS + 1, and so on. 4948 4949 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively. 4950 4951 RANGE is how far to scan while trying to match. RANGE = 0 means try 4952 only at STARTPOS; in general, the last start tried is STARTPOS + 4953 RANGE. 4954 4955 In REGS, return the indices of the virtual concatenation of STRING1 4956 and STRING2 that matched the entire BUFP->buffer and its contained 4957 subexpressions. 4958 4959 Do not consider matching one past the index STOP in the virtual 4960 concatenation of STRING1 and STRING2. 4961 4962 We return either the position in the strings at which the match was 4963 found, -1 if no match, or -2 if error (such as failure 4964 stack overflow). */ 4965 4966 int 4967 re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop) 4968 struct re_pattern_buffer *bufp; 4969 const char *string1, *string2; 4970 int size1, size2; 4971 int startpos; 4972 int range; 4973 struct re_registers *regs; 4974 int stop; 4975 { 4976 int val; 4977 register char *fastmap = bufp->fastmap; 4978 register RE_TRANSLATE_TYPE translate = bufp->translate; 4979 int total_size = size1 + size2; 4980 int endpos = startpos + range; 4981 4982 /* Check for out-of-range STARTPOS. */ 4983 if (startpos < 0 || startpos > total_size) 4984 return -1; 4985 4986 /* Fix up RANGE if it might eventually take us outside 4987 the virtual concatenation of STRING1 and STRING2. 4988 Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */ 4989 if (endpos < 0) 4990 range = 0 - startpos; 4991 else if (endpos > total_size) 4992 range = total_size - startpos; 4993 4994 /* If the search isn't to be a backwards one, don't waste time in a 4995 search for a pattern that must be anchored. */ 4996 if (bufp->used > 0 && range > 0 4997 && ((re_opcode_t) bufp->buffer[0] == begbuf 4998 /* `begline' is like `begbuf' if it cannot match at newlines. */ 4999 || ((re_opcode_t) bufp->buffer[0] == begline 5000 && !bufp->newline_anchor))) 5001 { 5002 if (startpos > 0) 5003 return -1; 5004 else 5005 range = 1; 5006 } 5007 5008 #ifdef emacs 5009 /* In a forward search for something that starts with \=. 5010 don't keep searching past point. */ 5011 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0) 5012 { 5013 range = PT - startpos; 5014 if (range <= 0) 5015 return -1; 5016 } 5017 #endif /* emacs */ 5018 5019 /* Update the fastmap now if not correct already. */ 5020 if (fastmap && !bufp->fastmap_accurate) 5021 if (re_compile_fastmap (bufp) == -2) 5022 return -2; 5023 5024 /* Loop through the string, looking for a place to start matching. */ 5025 for (;;) 5026 { 5027 /* If a fastmap is supplied, skip quickly over characters that 5028 cannot be the start of a match. If the pattern can match the 5029 null string, however, we don't need to skip characters; we want 5030 the first null string. */ 5031 if (fastmap && startpos < total_size && !bufp->can_be_null) 5032 { 5033 if (range > 0) /* Searching forwards. */ 5034 { 5035 register const char *d; 5036 register int lim = 0; 5037 int irange = range; 5038 5039 if (startpos < size1 && startpos + range >= size1) 5040 lim = range - (size1 - startpos); 5041 5042 d = (startpos >= size1 ? string2 - size1 : string1) + startpos; 5043 5044 /* Written out as an if-else to avoid testing `translate' 5045 inside the loop. */ 5046 if (translate) 5047 while (range > lim 5048 && !fastmap[(unsigned char) 5049 translate[(unsigned char) *d++]]) 5050 range--; 5051 else 5052 while (range > lim && !fastmap[(unsigned char) *d++]) 5053 range--; 5054 5055 startpos += irange - range; 5056 } 5057 else /* Searching backwards. */ 5058 { 5059 register CHAR_TYPE c = (size1 == 0 || startpos >= size1 5060 ? string2[startpos - size1] 5061 : string1[startpos]); 5062 5063 if (!fastmap[(unsigned char) TRANSLATE (c)]) 5064 goto advance; 5065 } 5066 } 5067 5068 /* If can't match the null string, and that's all we have left, fail. */ 5069 if (range >= 0 && startpos == total_size && fastmap 5070 && !bufp->can_be_null) 5071 return -1; 5072 5073 val = re_match_2_internal (bufp, string1, size1, string2, size2, 5074 startpos, regs, stop); 5075 #ifndef REGEX_MALLOC 5076 # ifdef C_ALLOCA 5077 alloca (0); 5078 # endif 5079 #endif 5080 5081 if (val >= 0) 5082 return startpos; 5083 5084 if (val == -2) 5085 return -2; 5086 5087 advance: 5088 if (!range) 5089 break; 5090 else if (range > 0) 5091 { 5092 range--; 5093 startpos++; 5094 } 5095 else 5096 { 5097 range++; 5098 startpos--; 5099 } 5100 } 5101 return -1; 5102 } /* re_search_2 */ 5103 #ifdef _LIBC 5104 weak_alias (__re_search_2, re_search_2) 5105 #endif 5106 5107 #ifdef MBS_SUPPORT 5109 /* This converts PTR, a pointer into one of the search wchar_t strings 5110 `string1' and `string2' into an multibyte string offset from the 5111 beginning of that string. We use mbs_offset to optimize. 5112 See convert_mbs_to_wcs. */ 5113 # define POINTER_TO_OFFSET(ptr) \ 5114 (FIRST_STRING_P (ptr) \ 5115 ? ((regoff_t)(mbs_offset1 != NULL? mbs_offset1[(ptr)-string1] : 0)) \ 5116 : ((regoff_t)((mbs_offset2 != NULL? mbs_offset2[(ptr)-string2] : 0) \ 5117 + csize1))) 5118 #else 5119 /* This converts PTR, a pointer into one of the search strings `string1' 5120 and `string2' into an offset from the beginning of that string. */ 5121 # define POINTER_TO_OFFSET(ptr) \ 5122 (FIRST_STRING_P (ptr) \ 5123 ? ((regoff_t) ((ptr) - string1)) \ 5124 : ((regoff_t) ((ptr) - string2 + size1))) 5125 #endif /* MBS_SUPPORT */ 5126 5127 /* Macros for dealing with the split strings in re_match_2. */ 5128 5129 #define MATCHING_IN_FIRST_STRING (dend == end_match_1) 5130 5131 /* Call before fetching a character with *d. This switches over to 5132 string2 if necessary. */ 5133 #define PREFETCH() \ 5134 while (d == dend) \ 5135 { \ 5136 /* End of string2 => fail. */ \ 5137 if (dend == end_match_2) \ 5138 goto fail; \ 5139 /* End of string1 => advance to string2. */ \ 5140 d = string2; \ 5141 dend = end_match_2; \ 5142 } 5143 5144 5145 /* Test if at very beginning or at very end of the virtual concatenation 5146 of `string1' and `string2'. If only one string, it's `string2'. */ 5147 #define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2) 5148 #define AT_STRINGS_END(d) ((d) == end2) 5149 5150 5151 /* Test if D points to a character which is word-constituent. We have 5152 two special cases to check for: if past the end of string1, look at 5153 the first character in string2; and if before the beginning of 5154 string2, look at the last character in string1. */ 5155 #ifdef MBS_SUPPORT 5156 /* Use internationalized API instead of SYNTAX. */ 5157 # define WORDCHAR_P(d) \ 5158 (iswalnum ((wint_t)((d) == end1 ? *string2 \ 5159 : (d) == string2 - 1 ? *(end1 - 1) : *(d))) != 0) 5160 #else 5161 # define WORDCHAR_P(d) \ 5162 (SYNTAX ((d) == end1 ? *string2 \ 5163 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \ 5164 == Sword) 5165 #endif /* MBS_SUPPORT */ 5166 5167 /* Disabled due to a compiler bug -- see comment at case wordbound */ 5168 #if 0 5169 /* Test if the character before D and the one at D differ with respect 5170 to being word-constituent. */ 5171 #define AT_WORD_BOUNDARY(d) \ 5172 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \ 5173 || WORDCHAR_P (d - 1) != WORDCHAR_P (d)) 5174 #endif 5175 5176 /* Free everything we malloc. */ 5177 #ifdef MATCH_MAY_ALLOCATE 5178 # define FREE_VAR(var) if (var) REGEX_FREE (var); var = NULL 5179 # ifdef MBS_SUPPORT 5180 # define FREE_VARIABLES() \ 5181 do { \ 5182 REGEX_FREE_STACK (fail_stack.stack); \ 5183 FREE_VAR (regstart); \ 5184 FREE_VAR (regend); \ 5185 FREE_VAR (old_regstart); \ 5186 FREE_VAR (old_regend); \ 5187 FREE_VAR (best_regstart); \ 5188 FREE_VAR (best_regend); \ 5189 FREE_VAR (reg_info); \ 5190 FREE_VAR (reg_dummy); \ 5191 FREE_VAR (reg_info_dummy); \ 5192 FREE_VAR (string1); \ 5193 FREE_VAR (string2); \ 5194 FREE_VAR (mbs_offset1); \ 5195 FREE_VAR (mbs_offset2); \ 5196 } while (0) 5197 # else /* not MBS_SUPPORT */ 5198 # define FREE_VARIABLES() \ 5199 do { \ 5200 REGEX_FREE_STACK (fail_stack.stack); \ 5201 FREE_VAR (regstart); \ 5202 FREE_VAR (regend); \ 5203 FREE_VAR (old_regstart); \ 5204 FREE_VAR (old_regend); \ 5205 FREE_VAR (best_regstart); \ 5206 FREE_VAR (best_regend); \ 5207 FREE_VAR (reg_info); \ 5208 FREE_VAR (reg_dummy); \ 5209 FREE_VAR (reg_info_dummy); \ 5210 } while (0) 5211 # endif /* MBS_SUPPORT */ 5212 #else 5213 # define FREE_VAR(var) if (var) free (var); var = NULL 5214 # ifdef MBS_SUPPORT 5215 # define FREE_VARIABLES() \ 5216 do { \ 5217 FREE_VAR (string1); \ 5218 FREE_VAR (string2); \ 5219 FREE_VAR (mbs_offset1); \ 5220 FREE_VAR (mbs_offset2); \ 5221 } while (0) 5222 # else 5223 # define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */ 5224 # endif /* MBS_SUPPORT */ 5225 #endif /* not MATCH_MAY_ALLOCATE */ 5226 5227 /* These values must meet several constraints. They must not be valid 5228 register values; since we have a limit of 255 registers (because 5229 we use only one byte in the pattern for the register number), we can 5230 use numbers larger than 255. They must differ by 1, because of 5231 NUM_FAILURE_ITEMS above. And the value for the lowest register must 5232 be larger than the value for the highest register, so we do not try 5233 to actually save any registers when none are active. */ 5234 #define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH) 5235 #define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1) 5236 5237 /* Matching routines. */ 5239 5240 #ifndef emacs /* Emacs never uses this. */ 5241 /* re_match is like re_match_2 except it takes only a single string. */ 5242 5243 int 5244 re_match (bufp, string, size, pos, regs) 5245 struct re_pattern_buffer *bufp; 5246 const char *string; 5247 int size, pos; 5248 struct re_registers *regs; 5249 { 5250 int result = re_match_2_internal (bufp, NULL, 0, string, size, 5251 pos, regs, size); 5252 # ifndef REGEX_MALLOC 5253 # ifdef C_ALLOCA 5254 alloca (0); 5255 # endif 5256 # endif 5257 return result; 5258 } 5259 # ifdef _LIBC 5260 weak_alias (__re_match, re_match) 5261 # endif 5262 #endif /* not emacs */ 5263 5264 static boolean group_match_null_string_p _RE_ARGS ((US_CHAR_TYPE **p, 5265 US_CHAR_TYPE *end, 5266 register_info_type *reg_info)); 5267 static boolean alt_match_null_string_p _RE_ARGS ((US_CHAR_TYPE *p, 5268 US_CHAR_TYPE *end, 5269 register_info_type *reg_info)); 5270 static boolean common_op_match_null_string_p _RE_ARGS ((US_CHAR_TYPE **p, 5271 US_CHAR_TYPE *end, 5272 register_info_type *reg_info)); 5273 static int bcmp_translate _RE_ARGS ((const CHAR_TYPE *s1, const CHAR_TYPE *s2, 5274 int len, char *translate)); 5275 5276 /* re_match_2 matches the compiled pattern in BUFP against the 5277 the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1 5278 and SIZE2, respectively). We start matching at POS, and stop 5279 matching at STOP. 5280 5281 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we 5282 store offsets for the substring each group matched in REGS. See the 5283 documentation for exactly how many groups we fill. 5284 5285 We return -1 if no match, -2 if an internal error (such as the 5286 failure stack overflowing). Otherwise, we return the length of the 5287 matched substring. */ 5288 5289 int 5290 re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop) 5291 struct re_pattern_buffer *bufp; 5292 const char *string1, *string2; 5293 int size1, size2; 5294 int pos; 5295 struct re_registers *regs; 5296 int stop; 5297 { 5298 int result = re_match_2_internal (bufp, string1, size1, string2, size2, 5299 pos, regs, stop); 5300 #ifndef REGEX_MALLOC 5301 # ifdef C_ALLOCA 5302 alloca (0); 5303 # endif 5304 #endif 5305 return result; 5306 } 5307 #ifdef _LIBC 5308 weak_alias (__re_match_2, re_match_2) 5309 #endif 5310 5311 #ifdef MBS_SUPPORT 5312 5313 static int count_mbs_length PARAMS ((int *, int)); 5314 5315 /* This check the substring (from 0, to length) of the multibyte string, 5316 to which offset_buffer correspond. And count how many wchar_t_characters 5317 the substring occupy. We use offset_buffer to optimization. 5318 See convert_mbs_to_wcs. */ 5319 5320 static int 5321 count_mbs_length(offset_buffer, length) 5322 int *offset_buffer; 5323 int length; 5324 { 5325 int wcs_size; 5326 5327 /* Check whether the size is valid. */ 5328 if (length < 0) 5329 return -1; 5330 5331 if (offset_buffer == NULL) 5332 return 0; 5333 5334 for (wcs_size = 0 ; offset_buffer[wcs_size] != -1 ; wcs_size++) 5335 { 5336 if (offset_buffer[wcs_size] == length) 5337 return wcs_size; 5338 if (offset_buffer[wcs_size] > length) 5339 /* It is a fragment of a wide character. */ 5340 return -1; 5341 } 5342 5343 /* We reached at the sentinel. */ 5344 return -1; 5345 } 5346 #endif /* MBS_SUPPORT */ 5347 5348 /* This is a separate function so that we can force an alloca cleanup 5349 afterwards. */ 5350 static int 5351 #ifdef MBS_SUPPORT 5352 re_match_2_internal (bufp, cstring1, csize1, cstring2, csize2, pos, regs, stop) 5353 struct re_pattern_buffer *bufp; 5354 const char *cstring1, *cstring2; 5355 int csize1, csize2; 5356 #else 5357 re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) 5358 struct re_pattern_buffer *bufp; 5359 const char *string1, *string2; 5360 int size1, size2; 5361 #endif 5362 int pos; 5363 struct re_registers *regs; 5364 int stop; 5365 { 5366 /* General temporaries. */ 5367 int mcnt; 5368 US_CHAR_TYPE *p1; 5369 #ifdef MBS_SUPPORT 5370 /* We need wchar_t* buffers correspond to string1, string2. */ 5371 CHAR_TYPE *string1 = NULL, *string2 = NULL; 5372 /* We need the size of wchar_t buffers correspond to csize1, csize2. */ 5373 int size1 = 0, size2 = 0; 5374 /* offset buffer for optimizatoin. See convert_mbs_to_wc. */ 5375 int *mbs_offset1 = NULL, *mbs_offset2 = NULL; 5376 /* They hold whether each wchar_t is binary data or not. */ 5377 char *is_binary = NULL; 5378 #endif /* MBS_SUPPORT */ 5379 5380 /* Just past the end of the corresponding string. */ 5381 const CHAR_TYPE *end1, *end2; 5382 5383 /* Pointers into string1 and string2, just past the last characters in 5384 each to consider matching. */ 5385 const CHAR_TYPE *end_match_1, *end_match_2; 5386 5387 /* Where we are in the data, and the end of the current string. */ 5388 const CHAR_TYPE *d, *dend; 5389 5390 /* Where we are in the pattern, and the end of the pattern. */ 5391 #ifdef MBS_SUPPORT 5392 US_CHAR_TYPE *pattern, *p; 5393 register US_CHAR_TYPE *pend; 5394 #else 5395 US_CHAR_TYPE *p = bufp->buffer; 5396 register US_CHAR_TYPE *pend = p + bufp->used; 5397 #endif /* MBS_SUPPORT */ 5398 5399 /* Mark the opcode just after a start_memory, so we can test for an 5400 empty subpattern when we get to the stop_memory. */ 5401 US_CHAR_TYPE *just_past_start_mem = 0; 5402 5403 /* We use this to map every character in the string. */ 5404 RE_TRANSLATE_TYPE translate = bufp->translate; 5405 5406 /* Failure point stack. Each place that can handle a failure further 5407 down the line pushes a failure point on this stack. It consists of 5408 restart, regend, and reg_info for all registers corresponding to 5409 the subexpressions we're currently inside, plus the number of such 5410 registers, and, finally, two char *'s. The first char * is where 5411 to resume scanning the pattern; the second one is where to resume 5412 scanning the strings. If the latter is zero, the failure point is 5413 a ``dummy''; if a failure happens and the failure point is a dummy, 5414 it gets discarded and the next next one is tried. */ 5415 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */ 5416 fail_stack_type fail_stack; 5417 #endif 5418 #ifdef DEBUG 5419 static unsigned failure_id; 5420 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0; 5421 #endif 5422 5423 #ifdef REL_ALLOC 5424 /* This holds the pointer to the failure stack, when 5425 it is allocated relocatably. */ 5426 fail_stack_elt_t *failure_stack_ptr; 5427 #endif 5428 5429 /* We fill all the registers internally, independent of what we 5430 return, for use in backreferences. The number here includes 5431 an element for register zero. */ 5432 size_t num_regs = bufp->re_nsub + 1; 5433 5434 /* The currently active registers. */ 5435 active_reg_t lowest_active_reg = NO_LOWEST_ACTIVE_REG; 5436 active_reg_t highest_active_reg = NO_HIGHEST_ACTIVE_REG; 5437 5438 /* Information on the contents of registers. These are pointers into 5439 the input strings; they record just what was matched (on this 5440 attempt) by a subexpression part of the pattern, that is, the 5441 regnum-th regstart pointer points to where in the pattern we began 5442 matching and the regnum-th regend points to right after where we 5443 stopped matching the regnum-th subexpression. (The zeroth register 5444 keeps track of what the whole pattern matches.) */ 5445 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */ 5446 const CHAR_TYPE **regstart, **regend; 5447 #endif 5448 5449 /* If a group that's operated upon by a repetition operator fails to 5450 match anything, then the register for its start will need to be 5451 restored because it will have been set to wherever in the string we 5452 are when we last see its open-group operator. Similarly for a 5453 register's end. */ 5454 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */ 5455 const CHAR_TYPE **old_regstart, **old_regend; 5456 #endif 5457 5458 /* The is_active field of reg_info helps us keep track of which (possibly 5459 nested) subexpressions we are currently in. The matched_something 5460 field of reg_info[reg_num] helps us tell whether or not we have 5461 matched any of the pattern so far this time through the reg_num-th 5462 subexpression. These two fields get reset each time through any 5463 loop their register is in. */ 5464 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */ 5465 register_info_type *reg_info; 5466 #endif 5467 5468 /* The following record the register info as found in the above 5469 variables when we find a match better than any we've seen before. 5470 This happens as we backtrack through the failure points, which in 5471 turn happens only if we have not yet matched the entire string. */ 5472 unsigned best_regs_set = false; 5473 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */ 5474 const CHAR_TYPE **best_regstart, **best_regend; 5475 #endif 5476 5477 /* Logically, this is `best_regend[0]'. But we don't want to have to 5478 allocate space for that if we're not allocating space for anything 5479 else (see below). Also, we never need info about register 0 for 5480 any of the other register vectors, and it seems rather a kludge to 5481 treat `best_regend' differently than the rest. So we keep track of 5482 the end of the best match so far in a separate variable. We 5483 initialize this to NULL so that when we backtrack the first time 5484 and need to test it, it's not garbage. */ 5485 const CHAR_TYPE *match_end = NULL; 5486 5487 /* This helps SET_REGS_MATCHED avoid doing redundant work. */ 5488 int set_regs_matched_done = 0; 5489 5490 /* Used when we pop values we don't care about. */ 5491 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */ 5492 const CHAR_TYPE **reg_dummy; 5493 register_info_type *reg_info_dummy; 5494 #endif 5495 5496 #ifdef DEBUG 5497 /* Counts the total number of registers pushed. */ 5498 unsigned num_regs_pushed = 0; 5499 #endif 5500 5501 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n"); 5502 5503 INIT_FAIL_STACK (); 5504 5505 #ifdef MATCH_MAY_ALLOCATE 5506 /* Do not bother to initialize all the register variables if there are 5507 no groups in the pattern, as it takes a fair amount of time. If 5508 there are groups, we include space for register 0 (the whole 5509 pattern), even though we never use it, since it simplifies the 5510 array indexing. We should fix this. */ 5511 if (bufp->re_nsub) 5512 { 5513 regstart = REGEX_TALLOC (num_regs, const CHAR_TYPE *); 5514 regend = REGEX_TALLOC (num_regs, const CHAR_TYPE *); 5515 old_regstart = REGEX_TALLOC (num_regs, const CHAR_TYPE *); 5516 old_regend = REGEX_TALLOC (num_regs, const CHAR_TYPE *); 5517 best_regstart = REGEX_TALLOC (num_regs, const CHAR_TYPE *); 5518 best_regend = REGEX_TALLOC (num_regs, const CHAR_TYPE *); 5519 reg_info = REGEX_TALLOC (num_regs, register_info_type); 5520 reg_dummy = REGEX_TALLOC (num_regs, const CHAR_TYPE *); 5521 reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type); 5522 5523 if (!(regstart && regend && old_regstart && old_regend && reg_info 5524 && best_regstart && best_regend && reg_dummy && reg_info_dummy)) 5525 { 5526 FREE_VARIABLES (); 5527 return -2; 5528 } 5529 } 5530 else 5531 { 5532 /* We must initialize all our variables to NULL, so that 5533 `FREE_VARIABLES' doesn't try to free them. */ 5534 regstart = regend = old_regstart = old_regend = best_regstart 5535 = best_regend = reg_dummy = NULL; 5536 reg_info = reg_info_dummy = (register_info_type *) NULL; 5537 } 5538 #endif /* MATCH_MAY_ALLOCATE */ 5539 5540 /* The starting position is bogus. */ 5541 #ifdef MBS_SUPPORT 5542 if (pos < 0 || pos > csize1 + csize2) 5543 #else 5544 if (pos < 0 || pos > size1 + size2) 5545 #endif 5546 { 5547 FREE_VARIABLES (); 5548 return -1; 5549 } 5550 5551 #ifdef MBS_SUPPORT 5552 /* Allocate wchar_t array for string1 and string2 and 5553 fill them with converted string. */ 5554 if (csize1 != 0) 5555 { 5556 string1 = REGEX_TALLOC (csize1 + 1, CHAR_TYPE); 5557 mbs_offset1 = REGEX_TALLOC (csize1 + 1, int); 5558 is_binary = REGEX_TALLOC (csize1 + 1, char); 5559 if (!string1 || !mbs_offset1 || !is_binary) 5560 { 5561 FREE_VAR (string1); 5562 FREE_VAR (mbs_offset1); 5563 FREE_VAR (is_binary); 5564 return -2; 5565 } 5566 size1 = convert_mbs_to_wcs(string1, cstring1, csize1, 5567 mbs_offset1, is_binary); 5568 string1[size1] = L'\0'; /* for a sentinel */ 5569 FREE_VAR (is_binary); 5570 } 5571 if (csize2 != 0) 5572 { 5573 string2 = REGEX_TALLOC (csize2 + 1, CHAR_TYPE); 5574 mbs_offset2 = REGEX_TALLOC (csize2 + 1, int); 5575 is_binary = REGEX_TALLOC (csize2 + 1, char); 5576 if (!string2 || !mbs_offset2 || !is_binary) 5577 { 5578 FREE_VAR (string1); 5579 FREE_VAR (mbs_offset1); 5580 FREE_VAR (string2); 5581 FREE_VAR (mbs_offset2); 5582 FREE_VAR (is_binary); 5583 return -2; 5584 } 5585 size2 = convert_mbs_to_wcs(string2, cstring2, csize2, 5586 mbs_offset2, is_binary); 5587 string2[size2] = L'\0'; /* for a sentinel */ 5588 FREE_VAR (is_binary); 5589 } 5590 5591 /* We need to cast pattern to (wchar_t*), because we casted this compiled 5592 pattern to (char*) in regex_compile. */ 5593 p = pattern = (CHAR_TYPE*)bufp->buffer; 5594 pend = (CHAR_TYPE*)(bufp->buffer + bufp->used); 5595 5596 #endif /* MBS_SUPPORT */ 5597 5598 /* Initialize subexpression text positions to -1 to mark ones that no 5599 start_memory/stop_memory has been seen for. Also initialize the 5600 register information struct. */ 5601 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++) 5602 { 5603 regstart[mcnt] = regend[mcnt] 5604 = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE; 5605 5606 REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE; 5607 IS_ACTIVE (reg_info[mcnt]) = 0; 5608 MATCHED_SOMETHING (reg_info[mcnt]) = 0; 5609 EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0; 5610 } 5611 5612 /* We move `string1' into `string2' if the latter's empty -- but not if 5613 `string1' is null. */ 5614 if (size2 == 0 && string1 != NULL) 5615 { 5616 string2 = string1; 5617 size2 = size1; 5618 string1 = 0; 5619 size1 = 0; 5620 } 5621 end1 = string1 + size1; 5622 end2 = string2 + size2; 5623 5624 /* Compute where to stop matching, within the two strings. */ 5625 #ifdef MBS_SUPPORT 5626 if (stop <= csize1) 5627 { 5628 mcnt = count_mbs_length(mbs_offset1, stop); 5629 end_match_1 = string1 + mcnt; 5630 end_match_2 = string2; 5631 } 5632 else 5633 { 5634 end_match_1 = end1; 5635 mcnt = count_mbs_length(mbs_offset2, stop-csize1); 5636 end_match_2 = string2 + mcnt; 5637 } 5638 if (mcnt < 0) 5639 { /* count_mbs_length return error. */ 5640 FREE_VARIABLES (); 5641 return -1; 5642 } 5643 #else 5644 if (stop <= size1) 5645 { 5646 end_match_1 = string1 + stop; 5647 end_match_2 = string2; 5648 } 5649 else 5650 { 5651 end_match_1 = end1; 5652 end_match_2 = string2 + stop - size1; 5653 } 5654 #endif /* MBS_SUPPORT */ 5655 5656 /* `p' scans through the pattern as `d' scans through the data. 5657 `dend' is the end of the input string that `d' points within. `d' 5658 is advanced into the following input string whenever necessary, but 5659 this happens before fetching; therefore, at the beginning of the 5660 loop, `d' can be pointing at the end of a string, but it cannot 5661 equal `string2'. */ 5662 #ifdef MBS_SUPPORT 5663 if (size1 > 0 && pos <= csize1) 5664 { 5665 mcnt = count_mbs_length(mbs_offset1, pos); 5666 d = string1 + mcnt; 5667 dend = end_match_1; 5668 } 5669 else 5670 { 5671 mcnt = count_mbs_length(mbs_offset2, pos-csize1); 5672 d = string2 + mcnt; 5673 dend = end_match_2; 5674 } 5675 5676 if (mcnt < 0) 5677 { /* count_mbs_length return error. */ 5678 FREE_VARIABLES (); 5679 return -1; 5680 } 5681 #else 5682 if (size1 > 0 && pos <= size1) 5683 { 5684 d = string1 + pos; 5685 dend = end_match_1; 5686 } 5687 else 5688 { 5689 d = string2 + pos - size1; 5690 dend = end_match_2; 5691 } 5692 #endif /* MBS_SUPPORT */ 5693 5694 DEBUG_PRINT1 ("The compiled pattern is:\n"); 5695 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend); 5696 DEBUG_PRINT1 ("The string to match is: `"); 5697 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2); 5698 DEBUG_PRINT1 ("'\n"); 5699 5700 /* This loops over pattern commands. It exits by returning from the 5701 function if the match is complete, or it drops through if the match 5702 fails at this starting point in the input data. */ 5703 for (;;) 5704 { 5705 #ifdef _LIBC 5706 DEBUG_PRINT2 ("\n%p: ", p); 5707 #else 5708 DEBUG_PRINT2 ("\n0x%x: ", p); 5709 #endif 5710 5711 if (p == pend) 5712 { /* End of pattern means we might have succeeded. */ 5713 DEBUG_PRINT1 ("end of pattern ... "); 5714 5715 /* If we haven't matched the entire string, and we want the 5716 longest match, try backtracking. */ 5717 if (d != end_match_2) 5718 { 5719 /* 1 if this match ends in the same string (string1 or string2) 5720 as the best previous match. */ 5721 boolean same_str_p = (FIRST_STRING_P (match_end) 5722 == MATCHING_IN_FIRST_STRING); 5723 /* 1 if this match is the best seen so far. */ 5724 boolean best_match_p; 5725 5726 /* AIX compiler got confused when this was combined 5727 with the previous declaration. */ 5728 if (same_str_p) 5729 best_match_p = d > match_end; 5730 else 5731 best_match_p = !MATCHING_IN_FIRST_STRING; 5732 5733 DEBUG_PRINT1 ("backtracking.\n"); 5734 5735 if (!FAIL_STACK_EMPTY ()) 5736 { /* More failure points to try. */ 5737 5738 /* If exceeds best match so far, save it. */ 5739 if (!best_regs_set || best_match_p) 5740 { 5741 best_regs_set = true; 5742 match_end = d; 5743 5744 DEBUG_PRINT1 ("\nSAVING match as best so far.\n"); 5745 5746 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++) 5747 { 5748 best_regstart[mcnt] = regstart[mcnt]; 5749 best_regend[mcnt] = regend[mcnt]; 5750 } 5751 } 5752 goto fail; 5753 } 5754 5755 /* If no failure points, don't restore garbage. And if 5756 last match is real best match, don't restore second 5757 best one. */ 5758 else if (best_regs_set && !best_match_p) 5759 { 5760 restore_best_regs: 5761 /* Restore best match. It may happen that `dend == 5762 end_match_1' while the restored d is in string2. 5763 For example, the pattern `x.*y.*z' against the 5764 strings `x-' and `y-z-', if the two strings are 5765 not consecutive in memory. */ 5766 DEBUG_PRINT1 ("Restoring best registers.\n"); 5767 5768 d = match_end; 5769 dend = ((d >= string1 && d <= end1) 5770 ? end_match_1 : end_match_2); 5771 5772 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++) 5773 { 5774 regstart[mcnt] = best_regstart[mcnt]; 5775 regend[mcnt] = best_regend[mcnt]; 5776 } 5777 } 5778 } /* d != end_match_2 */ 5779 5780 succeed_label: 5781 DEBUG_PRINT1 ("Accepting match.\n"); 5782 /* If caller wants register contents data back, do it. */ 5783 if (regs && !bufp->no_sub) 5784 { 5785 /* Have the register data arrays been allocated? */ 5786 if (bufp->regs_allocated == REGS_UNALLOCATED) 5787 { /* No. So allocate them with malloc. We need one 5788 extra element beyond `num_regs' for the `-1' marker 5789 GNU code uses. */ 5790 regs->num_regs = MAX (RE_NREGS, num_regs + 1); 5791 regs->start = TALLOC (regs->num_regs, regoff_t); 5792 regs->end = TALLOC (regs->num_regs, regoff_t); 5793 if (regs->start == NULL || regs->end == NULL) 5794 { 5795 FREE_VARIABLES (); 5796 return -2; 5797 } 5798 bufp->regs_allocated = REGS_REALLOCATE; 5799 } 5800 else if (bufp->regs_allocated == REGS_REALLOCATE) 5801 { /* Yes. If we need more elements than were already 5802 allocated, reallocate them. If we need fewer, just 5803 leave it alone. */ 5804 if (regs->num_regs < num_regs + 1) 5805 { 5806 regs->num_regs = num_regs + 1; 5807 RETALLOC (regs->start, regs->num_regs, regoff_t); 5808 RETALLOC (regs->end, regs->num_regs, regoff_t); 5809 if (regs->start == NULL || regs->end == NULL) 5810 { 5811 FREE_VARIABLES (); 5812 return -2; 5813 } 5814 } 5815 } 5816 else 5817 { 5818 /* These braces fend off a "empty body in an else-statement" 5819 warning under GCC when assert expands to nothing. */ 5820 assert (bufp->regs_allocated == REGS_FIXED); 5821 } 5822 5823 /* Convert the pointer data in `regstart' and `regend' to 5824 indices. Register zero has to be set differently, 5825 since we haven't kept track of any info for it. */ 5826 if (regs->num_regs > 0) 5827 { 5828 regs->start[0] = pos; 5829 #ifdef MBS_SUPPORT 5830 if (MATCHING_IN_FIRST_STRING) 5831 regs->end[0] = mbs_offset1 != NULL ? 5832 mbs_offset1[d-string1] : 0; 5833 else 5834 regs->end[0] = csize1 + (mbs_offset2 != NULL ? 5835 mbs_offset2[d-string2] : 0); 5836 #else 5837 regs->end[0] = (MATCHING_IN_FIRST_STRING 5838 ? ((regoff_t) (d - string1)) 5839 : ((regoff_t) (d - string2 + size1))); 5840 #endif /* MBS_SUPPORT */ 5841 } 5842 5843 /* Go through the first `min (num_regs, regs->num_regs)' 5844 registers, since that is all we initialized. */ 5845 for (mcnt = 1; (unsigned) mcnt < MIN (num_regs, regs->num_regs); 5846 mcnt++) 5847 { 5848 if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt])) 5849 regs->start[mcnt] = regs->end[mcnt] = -1; 5850 else 5851 { 5852 regs->start[mcnt] 5853 = (regoff_t) POINTER_TO_OFFSET (regstart[mcnt]); 5854 regs->end[mcnt] 5855 = (regoff_t) POINTER_TO_OFFSET (regend[mcnt]); 5856 } 5857 } 5858 5859 /* If the regs structure we return has more elements than 5860 were in the pattern, set the extra elements to -1. If 5861 we (re)allocated the registers, this is the case, 5862 because we always allocate enough to have at least one 5863 -1 at the end. */ 5864 for (mcnt = num_regs; (unsigned) mcnt < regs->num_regs; mcnt++) 5865 regs->start[mcnt] = regs->end[mcnt] = -1; 5866 } /* regs && !bufp->no_sub */ 5867 5868 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n", 5869 nfailure_points_pushed, nfailure_points_popped, 5870 nfailure_points_pushed - nfailure_points_popped); 5871 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed); 5872 5873 #ifdef MBS_SUPPORT 5874 if (MATCHING_IN_FIRST_STRING) 5875 mcnt = mbs_offset1 != NULL ? mbs_offset1[d-string1] : 0; 5876 else 5877 mcnt = (mbs_offset2 != NULL ? mbs_offset2[d-string2] : 0) + 5878 csize1; 5879 mcnt -= pos; 5880 #else 5881 mcnt = d - pos - (MATCHING_IN_FIRST_STRING 5882 ? string1 5883 : string2 - size1); 5884 #endif /* MBS_SUPPORT */ 5885 5886 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt); 5887 5888 FREE_VARIABLES (); 5889 return mcnt; 5890 } 5891 5892 /* Otherwise match next pattern command. */ 5893 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++)) 5894 { 5895 /* Ignore these. Used to ignore the n of succeed_n's which 5896 currently have n == 0. */ 5897 case no_op: 5898 DEBUG_PRINT1 ("EXECUTING no_op.\n"); 5899 break; 5900 5901 case succeed: 5902 DEBUG_PRINT1 ("EXECUTING succeed.\n"); 5903 goto succeed_label; 5904 5905 /* Match the next n pattern characters exactly. The following 5906 byte in the pattern defines n, and the n bytes after that 5907 are the characters to match. */ 5908 case exactn: 5909 #ifdef MBS_SUPPORT 5910 case exactn_bin: 5911 #endif 5912 mcnt = *p++; 5913 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt); 5914 5915 /* This is written out as an if-else so we don't waste time 5916 testing `translate' inside the loop. */ 5917 if (translate) 5918 { 5919 do 5920 { 5921 PREFETCH (); 5922 #ifdef MBS_SUPPORT 5923 if (*d <= 0xff) 5924 { 5925 if ((US_CHAR_TYPE) translate[(unsigned char) *d++] 5926 != (US_CHAR_TYPE) *p++) 5927 goto fail; 5928 } 5929 else 5930 { 5931 if (*d++ != (CHAR_TYPE) *p++) 5932 goto fail; 5933 } 5934 #else 5935 if ((US_CHAR_TYPE) translate[(unsigned char) *d++] 5936 != (US_CHAR_TYPE) *p++) 5937 goto fail; 5938 #endif /* MBS_SUPPORT */ 5939 } 5940 while (--mcnt); 5941 } 5942 else 5943 { 5944 do 5945 { 5946 PREFETCH (); 5947 if (*d++ != (CHAR_TYPE) *p++) goto fail; 5948 } 5949 while (--mcnt); 5950 } 5951 SET_REGS_MATCHED (); 5952 break; 5953 5954 5955 /* Match any character except possibly a newline or a null. */ 5956 case anychar: 5957 DEBUG_PRINT1 ("EXECUTING anychar.\n"); 5958 5959 PREFETCH (); 5960 5961 if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n') 5962 || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000')) 5963 goto fail; 5964 5965 SET_REGS_MATCHED (); 5966 DEBUG_PRINT2 (" Matched `%ld'.\n", (long int) *d); 5967 d++; 5968 break; 5969 5970 5971 case charset: 5972 case charset_not: 5973 { 5974 register US_CHAR_TYPE c; 5975 #ifdef MBS_SUPPORT 5976 unsigned int i, char_class_length, coll_symbol_length, 5977 equiv_class_length, ranges_length, chars_length, length; 5978 CHAR_TYPE *workp, *workp2, *charset_top; 5979 #define WORK_BUFFER_SIZE 128 5980 CHAR_TYPE str_buf[WORK_BUFFER_SIZE]; 5981 # ifdef _LIBC 5982 uint32_t nrules; 5983 # endif /* _LIBC */ 5984 #endif /* MBS_SUPPORT */ 5985 boolean not = (re_opcode_t) *(p - 1) == charset_not; 5986 5987 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : ""); 5988 PREFETCH (); 5989 c = TRANSLATE (*d); /* The character to match. */ 5990 #ifdef MBS_SUPPORT 5991 # ifdef _LIBC 5992 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 5993 # endif /* _LIBC */ 5994 charset_top = p - 1; 5995 char_class_length = *p++; 5996 coll_symbol_length = *p++; 5997 equiv_class_length = *p++; 5998 ranges_length = *p++; 5999 chars_length = *p++; 6000 /* p points charset[6], so the address of the next instruction 6001 (charset[l+m+n+2o+k+p']) equals p[l+m+n+2*o+p'], 6002 where l=length of char_classes, m=length of collating_symbol, 6003 n=equivalence_class, o=length of char_range, 6004 p'=length of character. */ 6005 workp = p; 6006 /* Update p to indicate the next instruction. */ 6007 p += char_class_length + coll_symbol_length+ equiv_class_length + 6008 2*ranges_length + chars_length; 6009 6010 /* match with char_class? */ 6011 for (i = 0; i < char_class_length ; i += CHAR_CLASS_SIZE) 6012 { 6013 wctype_t wctype; 6014 uintptr_t alignedp = ((uintptr_t)workp 6015 + __alignof__(wctype_t) - 1) 6016 & ~(uintptr_t)(__alignof__(wctype_t) - 1); 6017 wctype = *((wctype_t*)alignedp); 6018 workp += CHAR_CLASS_SIZE; 6019 if (iswctype((wint_t)c, wctype)) 6020 goto char_set_matched; 6021 } 6022 6023 /* match with collating_symbol? */ 6024 # ifdef _LIBC 6025 if (nrules != 0) 6026 { 6027 const unsigned char *extra = (const unsigned char *) 6028 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB); 6029 6030 for (workp2 = workp + coll_symbol_length ; workp < workp2 ; 6031 workp++) 6032 { 6033 int32_t *wextra; 6034 wextra = (int32_t*)(extra + *workp++); 6035 for (i = 0; i < *wextra; ++i) 6036 if (TRANSLATE(d[i]) != wextra[1 + i]) 6037 break; 6038 6039 if (i == *wextra) 6040 { 6041 /* Update d, however d will be incremented at 6042 char_set_matched:, we decrement d here. */ 6043 d += i - 1; 6044 goto char_set_matched; 6045 } 6046 } 6047 } 6048 else /* (nrules == 0) */ 6049 # endif 6050 /* If we can't look up collation data, we use wcscoll 6051 instead. */ 6052 { 6053 for (workp2 = workp + coll_symbol_length ; workp < workp2 ;) 6054 { 6055 const CHAR_TYPE *backup_d = d, *backup_dend = dend; 6056 length = wcslen(workp); 6057 6058 /* If wcscoll(the collating symbol, whole string) > 0, 6059 any substring of the string never match with the 6060 collating symbol. */ 6061 if (wcscoll(workp, d) > 0) 6062 { 6063 workp += length + 1; 6064 continue; 6065 } 6066 6067 /* First, we compare the collating symbol with 6068 the first character of the string. 6069 If it don't match, we add the next character to 6070 the compare buffer in turn. */ 6071 for (i = 0 ; i < WORK_BUFFER_SIZE-1 ; i++, d++) 6072 { 6073 int match; 6074 if (d == dend) 6075 { 6076 if (dend == end_match_2) 6077 break; 6078 d = string2; 6079 dend = end_match_2; 6080 } 6081 6082 /* add next character to the compare buffer. */ 6083 str_buf[i] = TRANSLATE(*d); 6084 str_buf[i+1] = '\0'; 6085 6086 match = wcscoll(workp, str_buf); 6087 if (match == 0) 6088 goto char_set_matched; 6089 6090 if (match < 0) 6091 /* (str_buf > workp) indicate (str_buf + X > workp), 6092 because for all X (str_buf + X > str_buf). 6093 So we don't need continue this loop. */ 6094 break; 6095 6096 /* Otherwise(str_buf < workp), 6097 (str_buf+next_character) may equals (workp). 6098 So we continue this loop. */ 6099 } 6100 /* not matched */ 6101 d = backup_d; 6102 dend = backup_dend; 6103 workp += length + 1; 6104 } 6105 } 6106 /* match with equivalence_class? */ 6107 # ifdef _LIBC 6108 if (nrules != 0) 6109 { 6110 const CHAR_TYPE *backup_d = d, *backup_dend = dend; 6111 /* Try to match the equivalence class against 6112 those known to the collate implementation. */ 6113 const int32_t *table; 6114 const int32_t *weights; 6115 const int32_t *extra; 6116 const int32_t *indirect; 6117 int32_t idx, idx2; 6118 wint_t *cp; 6119 size_t len; 6120 6121 /* This #include defines a local function! */ 6122 # include <locale/weightwc.h> 6123 6124 table = (const int32_t *) 6125 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEWC); 6126 weights = (const wint_t *) 6127 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTWC); 6128 extra = (const wint_t *) 6129 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAWC); 6130 indirect = (const int32_t *) 6131 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTWC); 6132 6133 /* Write 1 collating element to str_buf, and 6134 get its index. */ 6135 idx2 = 0; 6136 6137 for (i = 0 ; idx2 == 0 && i < WORK_BUFFER_SIZE - 1; i++) 6138 { 6139 cp = (wint_t*)str_buf; 6140 if (d == dend) 6141 { 6142 if (dend == end_match_2) 6143 break; 6144 d = string2; 6145 dend = end_match_2; 6146 } 6147 str_buf[i] = TRANSLATE(*(d+i)); 6148 str_buf[i+1] = '\0'; /* sentinel */ 6149 idx2 = findidx ((const wint_t**)&cp); 6150 } 6151 6152 /* Update d, however d will be incremented at 6153 char_set_matched:, we decrement d here. */ 6154 d = backup_d + ((wchar_t*)cp - (wchar_t*)str_buf - 1); 6155 if (d >= dend) 6156 { 6157 if (dend == end_match_2) 6158 d = dend; 6159 else 6160 { 6161 d = string2; 6162 dend = end_match_2; 6163 } 6164 } 6165 6166 len = weights[idx2]; 6167 6168 for (workp2 = workp + equiv_class_length ; workp < workp2 ; 6169 workp++) 6170 { 6171 idx = (int32_t)*workp; 6172 /* We already checked idx != 0 in regex_compile. */ 6173 6174 if (idx2 != 0 && len == weights[idx]) 6175 { 6176 int cnt = 0; 6177 while (cnt < len && (weights[idx + 1 + cnt] 6178 == weights[idx2 + 1 + cnt])) 6179 ++cnt; 6180 6181 if (cnt == len) 6182 goto char_set_matched; 6183 } 6184 } 6185 /* not matched */ 6186 d = backup_d; 6187 dend = backup_dend; 6188 } 6189 else /* (nrules == 0) */ 6190 # endif 6191 /* If we can't look up collation data, we use wcscoll 6192 instead. */ 6193 { 6194 for (workp2 = workp + equiv_class_length ; workp < workp2 ;) 6195 { 6196 const CHAR_TYPE *backup_d = d, *backup_dend = dend; 6197 length = wcslen(workp); 6198 6199 /* If wcscoll(the collating symbol, whole string) > 0, 6200 any substring of the string never match with the 6201 collating symbol. */ 6202 if (wcscoll(workp, d) > 0) 6203 { 6204 workp += length + 1; 6205 break; 6206 } 6207 6208 /* First, we compare the equivalence class with 6209 the first character of the string. 6210 If it don't match, we add the next character to 6211 the compare buffer in turn. */ 6212 for (i = 0 ; i < WORK_BUFFER_SIZE - 1 ; i++, d++) 6213 { 6214 int match; 6215 if (d == dend) 6216 { 6217 if (dend == end_match_2) 6218 break; 6219 d = string2; 6220 dend = end_match_2; 6221 } 6222 6223 /* add next character to the compare buffer. */ 6224 str_buf[i] = TRANSLATE(*d); 6225 str_buf[i+1] = '\0'; 6226 6227 match = wcscoll(workp, str_buf); 6228 6229 if (match == 0) 6230 goto char_set_matched; 6231 6232 if (match < 0) 6233 /* (str_buf > workp) indicate (str_buf + X > workp), 6234 because for all X (str_buf + X > str_buf). 6235 So we don't need continue this loop. */ 6236 break; 6237 6238 /* Otherwise(str_buf < workp), 6239 (str_buf+next_character) may equals (workp). 6240 So we continue this loop. */ 6241 } 6242 /* not matched */ 6243 d = backup_d; 6244 dend = backup_dend; 6245 workp += length + 1; 6246 } 6247 } 6248 6249 /* match with char_range? */ 6250 #ifdef _LIBC 6251 if (nrules != 0) 6252 { 6253 uint32_t collseqval; 6254 const char *collseq = (const char *) 6255 _NL_CURRENT(LC_COLLATE, _NL_COLLATE_COLLSEQWC); 6256 6257 collseqval = collseq_table_lookup (collseq, c); 6258 6259 for (; workp < p - chars_length ;) 6260 { 6261 uint32_t start_val, end_val; 6262 6263 /* We already compute the collation sequence value 6264 of the characters (or collating symbols). */ 6265 start_val = (uint32_t) *workp++; /* range_start */ 6266 end_val = (uint32_t) *workp++; /* range_end */ 6267 6268 if (start_val <= collseqval && collseqval <= end_val) 6269 goto char_set_matched; 6270 } 6271 } 6272 else 6273 #endif 6274 { 6275 /* We set range_start_char at str_buf[0], range_end_char 6276 at str_buf[4], and compared char at str_buf[2]. */ 6277 str_buf[1] = 0; 6278 str_buf[2] = c; 6279 str_buf[3] = 0; 6280 str_buf[5] = 0; 6281 for (; workp < p - chars_length ;) 6282 { 6283 wchar_t *range_start_char, *range_end_char; 6284 6285 /* match if (range_start_char <= c <= range_end_char). */ 6286 6287 /* If range_start(or end) < 0, we assume -range_start(end) 6288 is the offset of the collating symbol which is specified 6289 as the character of the range start(end). */ 6290 6291 /* range_start */ 6292 if (*workp < 0) 6293 range_start_char = charset_top - (*workp++); 6294 else 6295 { 6296 str_buf[0] = *workp++; 6297 range_start_char = str_buf; 6298 } 6299 6300 /* range_end */ 6301 if (*workp < 0) 6302 range_end_char = charset_top - (*workp++); 6303 else 6304 { 6305 str_buf[4] = *workp++; 6306 range_end_char = str_buf + 4; 6307 } 6308 6309 if (wcscoll(range_start_char, str_buf+2) <= 0 && 6310 wcscoll(str_buf+2, range_end_char) <= 0) 6311 6312 goto char_set_matched; 6313 } 6314 } 6315 6316 /* match with char? */ 6317 for (; workp < p ; workp++) 6318 if (c == *workp) 6319 goto char_set_matched; 6320 6321 not = !not; 6322 6323 char_set_matched: 6324 if (not) goto fail; 6325 #else 6326 /* Cast to `unsigned' instead of `unsigned char' in case the 6327 bit list is a full 32 bytes long. */ 6328 if (c < (unsigned) (*p * BYTEWIDTH) 6329 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) 6330 not = !not; 6331 6332 p += 1 + *p; 6333 6334 if (!not) goto fail; 6335 #undef WORK_BUFFER_SIZE 6336 #endif /* MBS_SUPPORT */ 6337 SET_REGS_MATCHED (); 6338 d++; 6339 break; 6340 } 6341 6342 6343 /* The beginning of a group is represented by start_memory. 6344 The arguments are the register number in the next byte, and the 6345 number of groups inner to this one in the next. The text 6346 matched within the group is recorded (in the internal 6347 registers data structure) under the register number. */ 6348 case start_memory: 6349 DEBUG_PRINT3 ("EXECUTING start_memory %ld (%ld):\n", 6350 (long int) *p, (long int) p[1]); 6351 6352 /* Find out if this group can match the empty string. */ 6353 p1 = p; /* To send to group_match_null_string_p. */ 6354 6355 if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE) 6356 REG_MATCH_NULL_STRING_P (reg_info[*p]) 6357 = group_match_null_string_p (&p1, pend, reg_info); 6358 6359 /* Save the position in the string where we were the last time 6360 we were at this open-group operator in case the group is 6361 operated upon by a repetition operator, e.g., with `(a*)*b' 6362 against `ab'; then we want to ignore where we are now in 6363 the string in case this attempt to match fails. */ 6364 old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p]) 6365 ? REG_UNSET (regstart[*p]) ? d : regstart[*p] 6366 : regstart[*p]; 6367 DEBUG_PRINT2 (" old_regstart: %d\n", 6368 POINTER_TO_OFFSET (old_regstart[*p])); 6369 6370 regstart[*p] = d; 6371 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p])); 6372 6373 IS_ACTIVE (reg_info[*p]) = 1; 6374 MATCHED_SOMETHING (reg_info[*p]) = 0; 6375 6376 /* Clear this whenever we change the register activity status. */ 6377 set_regs_matched_done = 0; 6378 6379 /* This is the new highest active register. */ 6380 highest_active_reg = *p; 6381 6382 /* If nothing was active before, this is the new lowest active 6383 register. */ 6384 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG) 6385 lowest_active_reg = *p; 6386 6387 /* Move past the register number and inner group count. */ 6388 p += 2; 6389 just_past_start_mem = p; 6390 6391 break; 6392 6393 6394 /* The stop_memory opcode represents the end of a group. Its 6395 arguments are the same as start_memory's: the register 6396 number, and the number of inner groups. */ 6397 case stop_memory: 6398 DEBUG_PRINT3 ("EXECUTING stop_memory %ld (%ld):\n", 6399 (long int) *p, (long int) p[1]); 6400 6401 /* We need to save the string position the last time we were at 6402 this close-group operator in case the group is operated 6403 upon by a repetition operator, e.g., with `((a*)*(b*)*)*' 6404 against `aba'; then we want to ignore where we are now in 6405 the string in case this attempt to match fails. */ 6406 old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p]) 6407 ? REG_UNSET (regend[*p]) ? d : regend[*p] 6408 : regend[*p]; 6409 DEBUG_PRINT2 (" old_regend: %d\n", 6410 POINTER_TO_OFFSET (old_regend[*p])); 6411 6412 regend[*p] = d; 6413 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p])); 6414 6415 /* This register isn't active anymore. */ 6416 IS_ACTIVE (reg_info[*p]) = 0; 6417 6418 /* Clear this whenever we change the register activity status. */ 6419 set_regs_matched_done = 0; 6420 6421 /* If this was the only register active, nothing is active 6422 anymore. */ 6423 if (lowest_active_reg == highest_active_reg) 6424 { 6425 lowest_active_reg = NO_LOWEST_ACTIVE_REG; 6426 highest_active_reg = NO_HIGHEST_ACTIVE_REG; 6427 } 6428 else 6429 { /* We must scan for the new highest active register, since 6430 it isn't necessarily one less than now: consider 6431 (a(b)c(d(e)f)g). When group 3 ends, after the f), the 6432 new highest active register is 1. */ 6433 US_CHAR_TYPE r = *p - 1; 6434 while (r > 0 && !IS_ACTIVE (reg_info[r])) 6435 r--; 6436 6437 /* If we end up at register zero, that means that we saved 6438 the registers as the result of an `on_failure_jump', not 6439 a `start_memory', and we jumped to past the innermost 6440 `stop_memory'. For example, in ((.)*) we save 6441 registers 1 and 2 as a result of the *, but when we pop 6442 back to the second ), we are at the stop_memory 1. 6443 Thus, nothing is active. */ 6444 if (r == 0) 6445 { 6446 lowest_active_reg = NO_LOWEST_ACTIVE_REG; 6447 highest_active_reg = NO_HIGHEST_ACTIVE_REG; 6448 } 6449 else 6450 highest_active_reg = r; 6451 } 6452 6453 /* If just failed to match something this time around with a 6454 group that's operated on by a repetition operator, try to 6455 force exit from the ``loop'', and restore the register 6456 information for this group that we had before trying this 6457 last match. */ 6458 if ((!MATCHED_SOMETHING (reg_info[*p]) 6459 || just_past_start_mem == p - 1) 6460 && (p + 2) < pend) 6461 { 6462 boolean is_a_jump_n = false; 6463 6464 p1 = p + 2; 6465 mcnt = 0; 6466 switch ((re_opcode_t) *p1++) 6467 { 6468 case jump_n: 6469 is_a_jump_n = true; 6470 case pop_failure_jump: 6471 case maybe_pop_jump: 6472 case jump: 6473 case dummy_failure_jump: 6474 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 6475 if (is_a_jump_n) 6476 p1 += OFFSET_ADDRESS_SIZE; 6477 break; 6478 6479 default: 6480 /* do nothing */ ; 6481 } 6482 p1 += mcnt; 6483 6484 /* If the next operation is a jump backwards in the pattern 6485 to an on_failure_jump right before the start_memory 6486 corresponding to this stop_memory, exit from the loop 6487 by forcing a failure after pushing on the stack the 6488 on_failure_jump's jump in the pattern, and d. */ 6489 if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump 6490 && (re_opcode_t) p1[1+OFFSET_ADDRESS_SIZE] == start_memory 6491 && p1[2+OFFSET_ADDRESS_SIZE] == *p) 6492 { 6493 /* If this group ever matched anything, then restore 6494 what its registers were before trying this last 6495 failed match, e.g., with `(a*)*b' against `ab' for 6496 regstart[1], and, e.g., with `((a*)*(b*)*)*' 6497 against `aba' for regend[3]. 6498 6499 Also restore the registers for inner groups for, 6500 e.g., `((a*)(b*))*' against `aba' (register 3 would 6501 otherwise get trashed). */ 6502 6503 if (EVER_MATCHED_SOMETHING (reg_info[*p])) 6504 { 6505 unsigned r; 6506 6507 EVER_MATCHED_SOMETHING (reg_info[*p]) = 0; 6508 6509 /* Restore this and inner groups' (if any) registers. */ 6510 for (r = *p; r < (unsigned) *p + (unsigned) *(p + 1); 6511 r++) 6512 { 6513 regstart[r] = old_regstart[r]; 6514 6515 /* xx why this test? */ 6516 if (old_regend[r] >= regstart[r]) 6517 regend[r] = old_regend[r]; 6518 } 6519 } 6520 p1++; 6521 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 6522 PUSH_FAILURE_POINT (p1 + mcnt, d, -2); 6523 6524 goto fail; 6525 } 6526 } 6527 6528 /* Move past the register number and the inner group count. */ 6529 p += 2; 6530 break; 6531 6532 6533 /* \<digit> has been turned into a `duplicate' command which is 6534 followed by the numeric value of <digit> as the register number. */ 6535 case duplicate: 6536 { 6537 register const CHAR_TYPE *d2, *dend2; 6538 int regno = *p++; /* Get which register to match against. */ 6539 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno); 6540 6541 /* Can't back reference a group which we've never matched. */ 6542 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno])) 6543 goto fail; 6544 6545 /* Where in input to try to start matching. */ 6546 d2 = regstart[regno]; 6547 6548 /* Where to stop matching; if both the place to start and 6549 the place to stop matching are in the same string, then 6550 set to the place to stop, otherwise, for now have to use 6551 the end of the first string. */ 6552 6553 dend2 = ((FIRST_STRING_P (regstart[regno]) 6554 == FIRST_STRING_P (regend[regno])) 6555 ? regend[regno] : end_match_1); 6556 for (;;) 6557 { 6558 /* If necessary, advance to next segment in register 6559 contents. */ 6560 while (d2 == dend2) 6561 { 6562 if (dend2 == end_match_2) break; 6563 if (dend2 == regend[regno]) break; 6564 6565 /* End of string1 => advance to string2. */ 6566 d2 = string2; 6567 dend2 = regend[regno]; 6568 } 6569 /* At end of register contents => success */ 6570 if (d2 == dend2) break; 6571 6572 /* If necessary, advance to next segment in data. */ 6573 PREFETCH (); 6574 6575 /* How many characters left in this segment to match. */ 6576 mcnt = dend - d; 6577 6578 /* Want how many consecutive characters we can match in 6579 one shot, so, if necessary, adjust the count. */ 6580 if (mcnt > dend2 - d2) 6581 mcnt = dend2 - d2; 6582 6583 /* Compare that many; failure if mismatch, else move 6584 past them. */ 6585 if (translate 6586 ? bcmp_translate (d, d2, mcnt, translate) 6587 : memcmp (d, d2, mcnt*sizeof(US_CHAR_TYPE))) 6588 goto fail; 6589 d += mcnt, d2 += mcnt; 6590 6591 /* Do this because we've match some characters. */ 6592 SET_REGS_MATCHED (); 6593 } 6594 } 6595 break; 6596 6597 6598 /* begline matches the empty string at the beginning of the string 6599 (unless `not_bol' is set in `bufp'), and, if 6600 `newline_anchor' is set, after newlines. */ 6601 case begline: 6602 DEBUG_PRINT1 ("EXECUTING begline.\n"); 6603 6604 if (AT_STRINGS_BEG (d)) 6605 { 6606 if (!bufp->not_bol) break; 6607 } 6608 else if (d[-1] == '\n' && bufp->newline_anchor) 6609 { 6610 break; 6611 } 6612 /* In all other cases, we fail. */ 6613 goto fail; 6614 6615 6616 /* endline is the dual of begline. */ 6617 case endline: 6618 DEBUG_PRINT1 ("EXECUTING endline.\n"); 6619 6620 if (AT_STRINGS_END (d)) 6621 { 6622 if (!bufp->not_eol) break; 6623 } 6624 6625 /* We have to ``prefetch'' the next character. */ 6626 else if ((d == end1 ? *string2 : *d) == '\n' 6627 && bufp->newline_anchor) 6628 { 6629 break; 6630 } 6631 goto fail; 6632 6633 6634 /* Match at the very beginning of the data. */ 6635 case begbuf: 6636 DEBUG_PRINT1 ("EXECUTING begbuf.\n"); 6637 if (AT_STRINGS_BEG (d)) 6638 break; 6639 goto fail; 6640 6641 6642 /* Match at the very end of the data. */ 6643 case endbuf: 6644 DEBUG_PRINT1 ("EXECUTING endbuf.\n"); 6645 if (AT_STRINGS_END (d)) 6646 break; 6647 goto fail; 6648 6649 6650 /* on_failure_keep_string_jump is used to optimize `.*\n'. It 6651 pushes NULL as the value for the string on the stack. Then 6652 `pop_failure_point' will keep the current value for the 6653 string, instead of restoring it. To see why, consider 6654 matching `foo\nbar' against `.*\n'. The .* matches the foo; 6655 then the . fails against the \n. But the next thing we want 6656 to do is match the \n against the \n; if we restored the 6657 string value, we would be back at the foo. 6658 6659 Because this is used only in specific cases, we don't need to 6660 check all the things that `on_failure_jump' does, to make 6661 sure the right things get saved on the stack. Hence we don't 6662 share its code. The only reason to push anything on the 6663 stack at all is that otherwise we would have to change 6664 `anychar's code to do something besides goto fail in this 6665 case; that seems worse than this. */ 6666 case on_failure_keep_string_jump: 6667 DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump"); 6668 6669 EXTRACT_NUMBER_AND_INCR (mcnt, p); 6670 #ifdef _LIBC 6671 DEBUG_PRINT3 (" %d (to %p):\n", mcnt, p + mcnt); 6672 #else 6673 DEBUG_PRINT3 (" %d (to 0x%x):\n", mcnt, p + mcnt); 6674 #endif 6675 6676 PUSH_FAILURE_POINT (p + mcnt, NULL, -2); 6677 break; 6678 6679 6680 /* Uses of on_failure_jump: 6681 6682 Each alternative starts with an on_failure_jump that points 6683 to the beginning of the next alternative. Each alternative 6684 except the last ends with a jump that in effect jumps past 6685 the rest of the alternatives. (They really jump to the 6686 ending jump of the following alternative, because tensioning 6687 these jumps is a hassle.) 6688 6689 Repeats start with an on_failure_jump that points past both 6690 the repetition text and either the following jump or 6691 pop_failure_jump back to this on_failure_jump. */ 6692 case on_failure_jump: 6693 on_failure: 6694 DEBUG_PRINT1 ("EXECUTING on_failure_jump"); 6695 6696 EXTRACT_NUMBER_AND_INCR (mcnt, p); 6697 #ifdef _LIBC 6698 DEBUG_PRINT3 (" %d (to %p)", mcnt, p + mcnt); 6699 #else 6700 DEBUG_PRINT3 (" %d (to 0x%x)", mcnt, p + mcnt); 6701 #endif 6702 6703 /* If this on_failure_jump comes right before a group (i.e., 6704 the original * applied to a group), save the information 6705 for that group and all inner ones, so that if we fail back 6706 to this point, the group's information will be correct. 6707 For example, in \(a*\)*\1, we need the preceding group, 6708 and in \(zz\(a*\)b*\)\2, we need the inner group. */ 6709 6710 /* We can't use `p' to check ahead because we push 6711 a failure point to `p + mcnt' after we do this. */ 6712 p1 = p; 6713 6714 /* We need to skip no_op's before we look for the 6715 start_memory in case this on_failure_jump is happening as 6716 the result of a completed succeed_n, as in \(a\)\{1,3\}b\1 6717 against aba. */ 6718 while (p1 < pend && (re_opcode_t) *p1 == no_op) 6719 p1++; 6720 6721 if (p1 < pend && (re_opcode_t) *p1 == start_memory) 6722 { 6723 /* We have a new highest active register now. This will 6724 get reset at the start_memory we are about to get to, 6725 but we will have saved all the registers relevant to 6726 this repetition op, as described above. */ 6727 highest_active_reg = *(p1 + 1) + *(p1 + 2); 6728 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG) 6729 lowest_active_reg = *(p1 + 1); 6730 } 6731 6732 DEBUG_PRINT1 (":\n"); 6733 PUSH_FAILURE_POINT (p + mcnt, d, -2); 6734 break; 6735 6736 6737 /* A smart repeat ends with `maybe_pop_jump'. 6738 We change it to either `pop_failure_jump' or `jump'. */ 6739 case maybe_pop_jump: 6740 EXTRACT_NUMBER_AND_INCR (mcnt, p); 6741 DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt); 6742 { 6743 register US_CHAR_TYPE *p2 = p; 6744 6745 /* Compare the beginning of the repeat with what in the 6746 pattern follows its end. If we can establish that there 6747 is nothing that they would both match, i.e., that we 6748 would have to backtrack because of (as in, e.g., `a*a') 6749 then we can change to pop_failure_jump, because we'll 6750 never have to backtrack. 6751 6752 This is not true in the case of alternatives: in 6753 `(a|ab)*' we do need to backtrack to the `ab' alternative 6754 (e.g., if the string was `ab'). But instead of trying to 6755 detect that here, the alternative has put on a dummy 6756 failure point which is what we will end up popping. */ 6757 6758 /* Skip over open/close-group commands. 6759 If what follows this loop is a ...+ construct, 6760 look at what begins its body, since we will have to 6761 match at least one of that. */ 6762 while (1) 6763 { 6764 if (p2 + 2 < pend 6765 && ((re_opcode_t) *p2 == stop_memory 6766 || (re_opcode_t) *p2 == start_memory)) 6767 p2 += 3; 6768 else if (p2 + 2 + 2 * OFFSET_ADDRESS_SIZE < pend 6769 && (re_opcode_t) *p2 == dummy_failure_jump) 6770 p2 += 2 + 2 * OFFSET_ADDRESS_SIZE; 6771 else 6772 break; 6773 } 6774 6775 p1 = p + mcnt; 6776 /* p1[0] ... p1[2] are the `on_failure_jump' corresponding 6777 to the `maybe_finalize_jump' of this case. Examine what 6778 follows. */ 6779 6780 /* If we're at the end of the pattern, we can change. */ 6781 if (p2 == pend) 6782 { 6783 /* Consider what happens when matching ":\(.*\)" 6784 against ":/". I don't really understand this code 6785 yet. */ 6786 p[-(1+OFFSET_ADDRESS_SIZE)] = (US_CHAR_TYPE) 6787 pop_failure_jump; 6788 DEBUG_PRINT1 6789 (" End of pattern: change to `pop_failure_jump'.\n"); 6790 } 6791 6792 else if ((re_opcode_t) *p2 == exactn 6793 #ifdef MBS_SUPPORT 6794 || (re_opcode_t) *p2 == exactn_bin 6795 #endif 6796 || (bufp->newline_anchor && (re_opcode_t) *p2 == endline)) 6797 { 6798 register US_CHAR_TYPE c 6799 = *p2 == (US_CHAR_TYPE) endline ? '\n' : p2[2]; 6800 6801 if (((re_opcode_t) p1[1+OFFSET_ADDRESS_SIZE] == exactn 6802 #ifdef MBS_SUPPORT 6803 || (re_opcode_t) p1[1+OFFSET_ADDRESS_SIZE] == exactn_bin 6804 #endif 6805 ) && p1[3+OFFSET_ADDRESS_SIZE] != c) 6806 { 6807 p[-(1+OFFSET_ADDRESS_SIZE)] = (US_CHAR_TYPE) 6808 pop_failure_jump; 6809 #ifdef MBS_SUPPORT 6810 if (MB_CUR_MAX != 1) 6811 DEBUG_PRINT3 (" %C != %C => pop_failure_jump.\n", 6812 (wint_t) c, 6813 (wint_t) p1[3+OFFSET_ADDRESS_SIZE]); 6814 else 6815 #endif 6816 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n", 6817 (char) c, 6818 (char) p1[3+OFFSET_ADDRESS_SIZE]); 6819 } 6820 6821 #ifndef MBS_SUPPORT 6822 else if ((re_opcode_t) p1[3] == charset 6823 || (re_opcode_t) p1[3] == charset_not) 6824 { 6825 int not = (re_opcode_t) p1[3] == charset_not; 6826 6827 if (c < (unsigned) (p1[4] * BYTEWIDTH) 6828 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) 6829 not = !not; 6830 6831 /* `not' is equal to 1 if c would match, which means 6832 that we can't change to pop_failure_jump. */ 6833 if (!not) 6834 { 6835 p[-3] = (unsigned char) pop_failure_jump; 6836 DEBUG_PRINT1 (" No match => pop_failure_jump.\n"); 6837 } 6838 } 6839 #endif /* not MBS_SUPPORT */ 6840 } 6841 #ifndef MBS_SUPPORT 6842 else if ((re_opcode_t) *p2 == charset) 6843 { 6844 /* We win if the first character of the loop is not part 6845 of the charset. */ 6846 if ((re_opcode_t) p1[3] == exactn 6847 && ! ((int) p2[1] * BYTEWIDTH > (int) p1[5] 6848 && (p2[2 + p1[5] / BYTEWIDTH] 6849 & (1 << (p1[5] % BYTEWIDTH))))) 6850 { 6851 p[-3] = (unsigned char) pop_failure_jump; 6852 DEBUG_PRINT1 (" No match => pop_failure_jump.\n"); 6853 } 6854 6855 else if ((re_opcode_t) p1[3] == charset_not) 6856 { 6857 int idx; 6858 /* We win if the charset_not inside the loop 6859 lists every character listed in the charset after. */ 6860 for (idx = 0; idx < (int) p2[1]; idx++) 6861 if (! (p2[2 + idx] == 0 6862 || (idx < (int) p1[4] 6863 && ((p2[2 + idx] & ~ p1[5 + idx]) == 0)))) 6864 break; 6865 6866 if (idx == p2[1]) 6867 { 6868 p[-3] = (unsigned char) pop_failure_jump; 6869 DEBUG_PRINT1 (" No match => pop_failure_jump.\n"); 6870 } 6871 } 6872 else if ((re_opcode_t) p1[3] == charset) 6873 { 6874 int idx; 6875 /* We win if the charset inside the loop 6876 has no overlap with the one after the loop. */ 6877 for (idx = 0; 6878 idx < (int) p2[1] && idx < (int) p1[4]; 6879 idx++) 6880 if ((p2[2 + idx] & p1[5 + idx]) != 0) 6881 break; 6882 6883 if (idx == p2[1] || idx == p1[4]) 6884 { 6885 p[-3] = (unsigned char) pop_failure_jump; 6886 DEBUG_PRINT1 (" No match => pop_failure_jump.\n"); 6887 } 6888 } 6889 } 6890 #endif /* not MBS_SUPPORT */ 6891 } 6892 p -= OFFSET_ADDRESS_SIZE; /* Point at relative address again. */ 6893 if ((re_opcode_t) p[-1] != pop_failure_jump) 6894 { 6895 p[-1] = (US_CHAR_TYPE) jump; 6896 DEBUG_PRINT1 (" Match => jump.\n"); 6897 goto unconditional_jump; 6898 } 6899 /* Note fall through. */ 6900 6901 6902 /* The end of a simple repeat has a pop_failure_jump back to 6903 its matching on_failure_jump, where the latter will push a 6904 failure point. The pop_failure_jump takes off failure 6905 points put on by this pop_failure_jump's matching 6906 on_failure_jump; we got through the pattern to here from the 6907 matching on_failure_jump, so didn't fail. */ 6908 case pop_failure_jump: 6909 { 6910 /* We need to pass separate storage for the lowest and 6911 highest registers, even though we don't care about the 6912 actual values. Otherwise, we will restore only one 6913 register from the stack, since lowest will == highest in 6914 `pop_failure_point'. */ 6915 active_reg_t dummy_low_reg, dummy_high_reg; 6916 US_CHAR_TYPE *pdummy = NULL; 6917 const CHAR_TYPE *sdummy = NULL; 6918 6919 DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n"); 6920 POP_FAILURE_POINT (sdummy, pdummy, 6921 dummy_low_reg, dummy_high_reg, 6922 reg_dummy, reg_dummy, reg_info_dummy); 6923 } 6924 /* Note fall through. */ 6925 6926 unconditional_jump: 6927 #ifdef _LIBC 6928 DEBUG_PRINT2 ("\n%p: ", p); 6929 #else 6930 DEBUG_PRINT2 ("\n0x%x: ", p); 6931 #endif 6932 /* Note fall through. */ 6933 6934 /* Unconditionally jump (without popping any failure points). */ 6935 case jump: 6936 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */ 6937 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt); 6938 p += mcnt; /* Do the jump. */ 6939 #ifdef _LIBC 6940 DEBUG_PRINT2 ("(to %p).\n", p); 6941 #else 6942 DEBUG_PRINT2 ("(to 0x%x).\n", p); 6943 #endif 6944 break; 6945 6946 6947 /* We need this opcode so we can detect where alternatives end 6948 in `group_match_null_string_p' et al. */ 6949 case jump_past_alt: 6950 DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n"); 6951 goto unconditional_jump; 6952 6953 6954 /* Normally, the on_failure_jump pushes a failure point, which 6955 then gets popped at pop_failure_jump. We will end up at 6956 pop_failure_jump, also, and with a pattern of, say, `a+', we 6957 are skipping over the on_failure_jump, so we have to push 6958 something meaningless for pop_failure_jump to pop. */ 6959 case dummy_failure_jump: 6960 DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n"); 6961 /* It doesn't matter what we push for the string here. What 6962 the code at `fail' tests is the value for the pattern. */ 6963 PUSH_FAILURE_POINT (NULL, NULL, -2); 6964 goto unconditional_jump; 6965 6966 6967 /* At the end of an alternative, we need to push a dummy failure 6968 point in case we are followed by a `pop_failure_jump', because 6969 we don't want the failure point for the alternative to be 6970 popped. For example, matching `(a|ab)*' against `aab' 6971 requires that we match the `ab' alternative. */ 6972 case push_dummy_failure: 6973 DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n"); 6974 /* See comments just above at `dummy_failure_jump' about the 6975 two zeroes. */ 6976 PUSH_FAILURE_POINT (NULL, NULL, -2); 6977 break; 6978 6979 /* Have to succeed matching what follows at least n times. 6980 After that, handle like `on_failure_jump'. */ 6981 case succeed_n: 6982 EXTRACT_NUMBER (mcnt, p + OFFSET_ADDRESS_SIZE); 6983 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt); 6984 6985 assert (mcnt >= 0); 6986 /* Originally, this is how many times we HAVE to succeed. */ 6987 if (mcnt > 0) 6988 { 6989 mcnt--; 6990 p += OFFSET_ADDRESS_SIZE; 6991 STORE_NUMBER_AND_INCR (p, mcnt); 6992 #ifdef _LIBC 6993 DEBUG_PRINT3 (" Setting %p to %d.\n", p - OFFSET_ADDRESS_SIZE 6994 , mcnt); 6995 #else 6996 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p - OFFSET_ADDRESS_SIZE 6997 , mcnt); 6998 #endif 6999 } 7000 else if (mcnt == 0) 7001 { 7002 #ifdef _LIBC 7003 DEBUG_PRINT2 (" Setting two bytes from %p to no_op.\n", 7004 p + OFFSET_ADDRESS_SIZE); 7005 #else 7006 DEBUG_PRINT2 (" Setting two bytes from 0x%x to no_op.\n", 7007 p + OFFSET_ADDRESS_SIZE); 7008 #endif /* _LIBC */ 7009 7010 #ifdef MBS_SUPPORT 7011 p[1] = (US_CHAR_TYPE) no_op; 7012 #else 7013 p[2] = (US_CHAR_TYPE) no_op; 7014 p[3] = (US_CHAR_TYPE) no_op; 7015 #endif /* MBS_SUPPORT */ 7016 goto on_failure; 7017 } 7018 break; 7019 7020 case jump_n: 7021 EXTRACT_NUMBER (mcnt, p + OFFSET_ADDRESS_SIZE); 7022 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt); 7023 7024 /* Originally, this is how many times we CAN jump. */ 7025 if (mcnt) 7026 { 7027 mcnt--; 7028 STORE_NUMBER (p + OFFSET_ADDRESS_SIZE, mcnt); 7029 7030 #ifdef _LIBC 7031 DEBUG_PRINT3 (" Setting %p to %d.\n", p + OFFSET_ADDRESS_SIZE, 7032 mcnt); 7033 #else 7034 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p + OFFSET_ADDRESS_SIZE, 7035 mcnt); 7036 #endif /* _LIBC */ 7037 goto unconditional_jump; 7038 } 7039 /* If don't have to jump any more, skip over the rest of command. */ 7040 else 7041 p += 2 * OFFSET_ADDRESS_SIZE; 7042 break; 7043 7044 case set_number_at: 7045 { 7046 DEBUG_PRINT1 ("EXECUTING set_number_at.\n"); 7047 7048 EXTRACT_NUMBER_AND_INCR (mcnt, p); 7049 p1 = p + mcnt; 7050 EXTRACT_NUMBER_AND_INCR (mcnt, p); 7051 #ifdef _LIBC 7052 DEBUG_PRINT3 (" Setting %p to %d.\n", p1, mcnt); 7053 #else 7054 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p1, mcnt); 7055 #endif 7056 STORE_NUMBER (p1, mcnt); 7057 break; 7058 } 7059 7060 #if 0 7061 /* The DEC Alpha C compiler 3.x generates incorrect code for the 7062 test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of 7063 AT_WORD_BOUNDARY, so this code is disabled. Expanding the 7064 macro and introducing temporary variables works around the bug. */ 7065 7066 case wordbound: 7067 DEBUG_PRINT1 ("EXECUTING wordbound.\n"); 7068 if (AT_WORD_BOUNDARY (d)) 7069 break; 7070 goto fail; 7071 7072 case notwordbound: 7073 DEBUG_PRINT1 ("EXECUTING notwordbound.\n"); 7074 if (AT_WORD_BOUNDARY (d)) 7075 goto fail; 7076 break; 7077 #else 7078 case wordbound: 7079 { 7080 boolean prevchar, thischar; 7081 7082 DEBUG_PRINT1 ("EXECUTING wordbound.\n"); 7083 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d)) 7084 break; 7085 7086 prevchar = WORDCHAR_P (d - 1); 7087 thischar = WORDCHAR_P (d); 7088 if (prevchar != thischar) 7089 break; 7090 goto fail; 7091 } 7092 7093 case notwordbound: 7094 { 7095 boolean prevchar, thischar; 7096 7097 DEBUG_PRINT1 ("EXECUTING notwordbound.\n"); 7098 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d)) 7099 goto fail; 7100 7101 prevchar = WORDCHAR_P (d - 1); 7102 thischar = WORDCHAR_P (d); 7103 if (prevchar != thischar) 7104 goto fail; 7105 break; 7106 } 7107 #endif 7108 7109 case wordbeg: 7110 DEBUG_PRINT1 ("EXECUTING wordbeg.\n"); 7111 if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1))) 7112 break; 7113 goto fail; 7114 7115 case wordend: 7116 DEBUG_PRINT1 ("EXECUTING wordend.\n"); 7117 if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1) 7118 && (!WORDCHAR_P (d) || AT_STRINGS_END (d))) 7119 break; 7120 goto fail; 7121 7122 #ifdef emacs 7123 case before_dot: 7124 DEBUG_PRINT1 ("EXECUTING before_dot.\n"); 7125 if (PTR_CHAR_POS ((unsigned char *) d) >= point) 7126 goto fail; 7127 break; 7128 7129 case at_dot: 7130 DEBUG_PRINT1 ("EXECUTING at_dot.\n"); 7131 if (PTR_CHAR_POS ((unsigned char *) d) != point) 7132 goto fail; 7133 break; 7134 7135 case after_dot: 7136 DEBUG_PRINT1 ("EXECUTING after_dot.\n"); 7137 if (PTR_CHAR_POS ((unsigned char *) d) <= point) 7138 goto fail; 7139 break; 7140 7141 case syntaxspec: 7142 DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt); 7143 mcnt = *p++; 7144 goto matchsyntax; 7145 7146 case wordchar: 7147 DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n"); 7148 mcnt = (int) Sword; 7149 matchsyntax: 7150 PREFETCH (); 7151 /* Can't use *d++ here; SYNTAX may be an unsafe macro. */ 7152 d++; 7153 if (SYNTAX (d[-1]) != (enum syntaxcode) mcnt) 7154 goto fail; 7155 SET_REGS_MATCHED (); 7156 break; 7157 7158 case notsyntaxspec: 7159 DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt); 7160 mcnt = *p++; 7161 goto matchnotsyntax; 7162 7163 case notwordchar: 7164 DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n"); 7165 mcnt = (int) Sword; 7166 matchnotsyntax: 7167 PREFETCH (); 7168 /* Can't use *d++ here; SYNTAX may be an unsafe macro. */ 7169 d++; 7170 if (SYNTAX (d[-1]) == (enum syntaxcode) mcnt) 7171 goto fail; 7172 SET_REGS_MATCHED (); 7173 break; 7174 7175 #else /* not emacs */ 7176 case wordchar: 7177 DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n"); 7178 PREFETCH (); 7179 if (!WORDCHAR_P (d)) 7180 goto fail; 7181 SET_REGS_MATCHED (); 7182 d++; 7183 break; 7184 7185 case notwordchar: 7186 DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n"); 7187 PREFETCH (); 7188 if (WORDCHAR_P (d)) 7189 goto fail; 7190 SET_REGS_MATCHED (); 7191 d++; 7192 break; 7193 #endif /* not emacs */ 7194 7195 default: 7196 abort (); 7197 } 7198 continue; /* Successfully executed one pattern command; keep going. */ 7199 7200 7201 /* We goto here if a matching operation fails. */ 7202 fail: 7203 if (!FAIL_STACK_EMPTY ()) 7204 { /* A restart point is known. Restore to that state. */ 7205 DEBUG_PRINT1 ("\nFAIL:\n"); 7206 POP_FAILURE_POINT (d, p, 7207 lowest_active_reg, highest_active_reg, 7208 regstart, regend, reg_info); 7209 7210 /* If this failure point is a dummy, try the next one. */ 7211 if (!p) 7212 goto fail; 7213 7214 /* If we failed to the end of the pattern, don't examine *p. */ 7215 assert (p <= pend); 7216 if (p < pend) 7217 { 7218 boolean is_a_jump_n = false; 7219 7220 /* If failed to a backwards jump that's part of a repetition 7221 loop, need to pop this failure point and use the next one. */ 7222 switch ((re_opcode_t) *p) 7223 { 7224 case jump_n: 7225 is_a_jump_n = true; 7226 case maybe_pop_jump: 7227 case pop_failure_jump: 7228 case jump: 7229 p1 = p + 1; 7230 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 7231 p1 += mcnt; 7232 7233 if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n) 7234 || (!is_a_jump_n 7235 && (re_opcode_t) *p1 == on_failure_jump)) 7236 goto fail; 7237 break; 7238 default: 7239 /* do nothing */ ; 7240 } 7241 } 7242 7243 if (d >= string1 && d <= end1) 7244 dend = end_match_1; 7245 } 7246 else 7247 break; /* Matching at this starting point really fails. */ 7248 } /* for (;;) */ 7249 7250 if (best_regs_set) 7251 goto restore_best_regs; 7252 7253 FREE_VARIABLES (); 7254 7255 return -1; /* Failure to match. */ 7256 } /* re_match_2 */ 7257 7258 /* Subroutine definitions for re_match_2. */ 7260 7261 7262 /* We are passed P pointing to a register number after a start_memory. 7263 7264 Return true if the pattern up to the corresponding stop_memory can 7265 match the empty string, and false otherwise. 7266 7267 If we find the matching stop_memory, sets P to point to one past its number. 7268 Otherwise, sets P to an undefined byte less than or equal to END. 7269 7270 We don't handle duplicates properly (yet). */ 7271 7272 static boolean 7273 group_match_null_string_p (p, end, reg_info) 7274 US_CHAR_TYPE **p, *end; 7275 register_info_type *reg_info; 7276 { 7277 int mcnt; 7278 /* Point to after the args to the start_memory. */ 7279 US_CHAR_TYPE *p1 = *p + 2; 7280 7281 while (p1 < end) 7282 { 7283 /* Skip over opcodes that can match nothing, and return true or 7284 false, as appropriate, when we get to one that can't, or to the 7285 matching stop_memory. */ 7286 7287 switch ((re_opcode_t) *p1) 7288 { 7289 /* Could be either a loop or a series of alternatives. */ 7290 case on_failure_jump: 7291 p1++; 7292 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 7293 7294 /* If the next operation is not a jump backwards in the 7295 pattern. */ 7296 7297 if (mcnt >= 0) 7298 { 7299 /* Go through the on_failure_jumps of the alternatives, 7300 seeing if any of the alternatives cannot match nothing. 7301 The last alternative starts with only a jump, 7302 whereas the rest start with on_failure_jump and end 7303 with a jump, e.g., here is the pattern for `a|b|c': 7304 7305 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6 7306 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3 7307 /exactn/1/c 7308 7309 So, we have to first go through the first (n-1) 7310 alternatives and then deal with the last one separately. */ 7311 7312 7313 /* Deal with the first (n-1) alternatives, which start 7314 with an on_failure_jump (see above) that jumps to right 7315 past a jump_past_alt. */ 7316 7317 while ((re_opcode_t) p1[mcnt-(1+OFFSET_ADDRESS_SIZE)] == 7318 jump_past_alt) 7319 { 7320 /* `mcnt' holds how many bytes long the alternative 7321 is, including the ending `jump_past_alt' and 7322 its number. */ 7323 7324 if (!alt_match_null_string_p (p1, p1 + mcnt - 7325 (1 + OFFSET_ADDRESS_SIZE), 7326 reg_info)) 7327 return false; 7328 7329 /* Move to right after this alternative, including the 7330 jump_past_alt. */ 7331 p1 += mcnt; 7332 7333 /* Break if it's the beginning of an n-th alternative 7334 that doesn't begin with an on_failure_jump. */ 7335 if ((re_opcode_t) *p1 != on_failure_jump) 7336 break; 7337 7338 /* Still have to check that it's not an n-th 7339 alternative that starts with an on_failure_jump. */ 7340 p1++; 7341 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 7342 if ((re_opcode_t) p1[mcnt-(1+OFFSET_ADDRESS_SIZE)] != 7343 jump_past_alt) 7344 { 7345 /* Get to the beginning of the n-th alternative. */ 7346 p1 -= 1 + OFFSET_ADDRESS_SIZE; 7347 break; 7348 } 7349 } 7350 7351 /* Deal with the last alternative: go back and get number 7352 of the `jump_past_alt' just before it. `mcnt' contains 7353 the length of the alternative. */ 7354 EXTRACT_NUMBER (mcnt, p1 - OFFSET_ADDRESS_SIZE); 7355 7356 if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info)) 7357 return false; 7358 7359 p1 += mcnt; /* Get past the n-th alternative. */ 7360 } /* if mcnt > 0 */ 7361 break; 7362 7363 7364 case stop_memory: 7365 assert (p1[1] == **p); 7366 *p = p1 + 2; 7367 return true; 7368 7369 7370 default: 7371 if (!common_op_match_null_string_p (&p1, end, reg_info)) 7372 return false; 7373 } 7374 } /* while p1 < end */ 7375 7376 return false; 7377 } /* group_match_null_string_p */ 7378 7379 7380 /* Similar to group_match_null_string_p, but doesn't deal with alternatives: 7381 It expects P to be the first byte of a single alternative and END one 7382 byte past the last. The alternative can contain groups. */ 7383 7384 static boolean 7385 alt_match_null_string_p (p, end, reg_info) 7386 US_CHAR_TYPE *p, *end; 7387 register_info_type *reg_info; 7388 { 7389 int mcnt; 7390 US_CHAR_TYPE *p1 = p; 7391 7392 while (p1 < end) 7393 { 7394 /* Skip over opcodes that can match nothing, and break when we get 7395 to one that can't. */ 7396 7397 switch ((re_opcode_t) *p1) 7398 { 7399 /* It's a loop. */ 7400 case on_failure_jump: 7401 p1++; 7402 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 7403 p1 += mcnt; 7404 break; 7405 7406 default: 7407 if (!common_op_match_null_string_p (&p1, end, reg_info)) 7408 return false; 7409 } 7410 } /* while p1 < end */ 7411 7412 return true; 7413 } /* alt_match_null_string_p */ 7414 7415 7416 /* Deals with the ops common to group_match_null_string_p and 7417 alt_match_null_string_p. 7418 7419 Sets P to one after the op and its arguments, if any. */ 7420 7421 static boolean 7422 common_op_match_null_string_p (p, end, reg_info) 7423 US_CHAR_TYPE **p, *end; 7424 register_info_type *reg_info; 7425 { 7426 int mcnt; 7427 boolean ret; 7428 int reg_no; 7429 US_CHAR_TYPE *p1 = *p; 7430 7431 switch ((re_opcode_t) *p1++) 7432 { 7433 case no_op: 7434 case begline: 7435 case endline: 7436 case begbuf: 7437 case endbuf: 7438 case wordbeg: 7439 case wordend: 7440 case wordbound: 7441 case notwordbound: 7442 #ifdef emacs 7443 case before_dot: 7444 case at_dot: 7445 case after_dot: 7446 #endif 7447 break; 7448 7449 case start_memory: 7450 reg_no = *p1; 7451 assert (reg_no > 0 && reg_no <= MAX_REGNUM); 7452 ret = group_match_null_string_p (&p1, end, reg_info); 7453 7454 /* Have to set this here in case we're checking a group which 7455 contains a group and a back reference to it. */ 7456 7457 if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE) 7458 REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret; 7459 7460 if (!ret) 7461 return false; 7462 break; 7463 7464 /* If this is an optimized succeed_n for zero times, make the jump. */ 7465 case jump: 7466 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 7467 if (mcnt >= 0) 7468 p1 += mcnt; 7469 else 7470 return false; 7471 break; 7472 7473 case succeed_n: 7474 /* Get to the number of times to succeed. */ 7475 p1 += OFFSET_ADDRESS_SIZE; 7476 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 7477 7478 if (mcnt == 0) 7479 { 7480 p1 -= 2 * OFFSET_ADDRESS_SIZE; 7481 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 7482 p1 += mcnt; 7483 } 7484 else 7485 return false; 7486 break; 7487 7488 case duplicate: 7489 if (!REG_MATCH_NULL_STRING_P (reg_info[*p1])) 7490 return false; 7491 break; 7492 7493 case set_number_at: 7494 p1 += 2 * OFFSET_ADDRESS_SIZE; 7495 7496 default: 7497 /* All other opcodes mean we cannot match the empty string. */ 7498 return false; 7499 } 7500 7501 *p = p1; 7502 return true; 7503 } /* common_op_match_null_string_p */ 7504 7505 7506 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN 7507 bytes; nonzero otherwise. */ 7508 7509 static int 7510 bcmp_translate (s1, s2, len, translate) 7511 const CHAR_TYPE *s1, *s2; 7512 register int len; 7513 RE_TRANSLATE_TYPE translate; 7514 { 7515 register const US_CHAR_TYPE *p1 = (const US_CHAR_TYPE *) s1; 7516 register const US_CHAR_TYPE *p2 = (const US_CHAR_TYPE *) s2; 7517 while (len) 7518 { 7519 #ifdef MBS_SUPPORT 7520 if (((*p1<=0xff)?translate[*p1++]:*p1++) 7521 != ((*p2<=0xff)?translate[*p2++]:*p2++)) 7522 return 1; 7523 #else 7524 if (translate[*p1++] != translate[*p2++]) return 1; 7525 #endif /* MBS_SUPPORT */ 7526 len--; 7527 } 7528 return 0; 7529 } 7530 7531 /* Entry points for GNU code. */ 7533 7534 /* re_compile_pattern is the GNU regular expression compiler: it 7535 compiles PATTERN (of length SIZE) and puts the result in BUFP. 7536 Returns 0 if the pattern was valid, otherwise an error string. 7537 7538 Assumes the `allocated' (and perhaps `buffer') and `translate' fields 7539 are set in BUFP on entry. 7540 7541 We call regex_compile to do the actual compilation. */ 7542 7543 const char * 7544 re_compile_pattern (pattern, length, bufp) 7545 const char *pattern; 7546 size_t length; 7547 struct re_pattern_buffer *bufp; 7548 { 7549 reg_errcode_t ret; 7550 7551 /* GNU code is written to assume at least RE_NREGS registers will be set 7552 (and at least one extra will be -1). */ 7553 bufp->regs_allocated = REGS_UNALLOCATED; 7554 7555 /* And GNU code determines whether or not to get register information 7556 by passing null for the REGS argument to re_match, etc., not by 7557 setting no_sub. */ 7558 bufp->no_sub = 0; 7559 7560 /* Match anchors at newline. */ 7561 bufp->newline_anchor = 1; 7562 7563 ret = regex_compile (pattern, length, re_syntax_options, bufp); 7564 7565 if (!ret) 7566 return NULL; 7567 return gettext (re_error_msgid + re_error_msgid_idx[(int) ret]); 7568 } 7569 #ifdef _LIBC 7570 weak_alias (__re_compile_pattern, re_compile_pattern) 7571 #endif 7572 7573 /* Entry points compatible with 4.2 BSD regex library. We don't define 7575 them unless specifically requested. */ 7576 7577 #if defined _REGEX_RE_COMP || defined _LIBC 7578 7579 /* BSD has one and only one pattern buffer. */ 7580 static struct re_pattern_buffer re_comp_buf; 7581 7582 char * 7583 #ifdef _LIBC 7584 /* Make these definitions weak in libc, so POSIX programs can redefine 7585 these names if they don't use our functions, and still use 7586 regcomp/regexec below without link errors. */ 7587 weak_function 7588 #endif 7589 re_comp (s) 7590 const char *s; 7591 { 7592 reg_errcode_t ret; 7593 7594 if (!s) 7595 { 7596 if (!re_comp_buf.buffer) 7597 return gettext ("No previous regular expression"); 7598 return 0; 7599 } 7600 7601 if (!re_comp_buf.buffer) 7602 { 7603 re_comp_buf.buffer = (unsigned char *) malloc (200); 7604 if (re_comp_buf.buffer == NULL) 7605 return (char *) gettext (re_error_msgid 7606 + re_error_msgid_idx[(int) REG_ESPACE]); 7607 re_comp_buf.allocated = 200; 7608 7609 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH); 7610 if (re_comp_buf.fastmap == NULL) 7611 return (char *) gettext (re_error_msgid 7612 + re_error_msgid_idx[(int) REG_ESPACE]); 7613 } 7614 7615 /* Since `re_exec' always passes NULL for the `regs' argument, we 7616 don't need to initialize the pattern buffer fields which affect it. */ 7617 7618 /* Match anchors at newlines. */ 7619 re_comp_buf.newline_anchor = 1; 7620 7621 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf); 7622 7623 if (!ret) 7624 return NULL; 7625 7626 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */ 7627 return (char *) gettext (re_error_msgid + re_error_msgid_idx[(int) ret]); 7628 } 7629 7630 7631 int 7632 #ifdef _LIBC 7633 weak_function 7634 #endif 7635 re_exec (s) 7636 const char *s; 7637 { 7638 const int len = strlen (s); 7639 return 7640 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0); 7641 } 7642 7643 #endif /* _REGEX_RE_COMP */ 7644 7645 /* POSIX.2 functions. Don't define these for Emacs. */ 7647 7648 #ifndef emacs 7649 7650 /* regcomp takes a regular expression as a string and compiles it. 7651 7652 PREG is a regex_t *. We do not expect any fields to be initialized, 7653 since POSIX says we shouldn't. Thus, we set 7654 7655 `buffer' to the compiled pattern; 7656 `used' to the length of the compiled pattern; 7657 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the 7658 REG_EXTENDED bit in CFLAGS is set; otherwise, to 7659 RE_SYNTAX_POSIX_BASIC; 7660 `newline_anchor' to REG_NEWLINE being set in CFLAGS; 7661 `fastmap' to an allocated space for the fastmap; 7662 `fastmap_accurate' to zero; 7663 `re_nsub' to the number of subexpressions in PATTERN. 7664 7665 PATTERN is the address of the pattern string. 7666 7667 CFLAGS is a series of bits which affect compilation. 7668 7669 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we 7670 use POSIX basic syntax. 7671 7672 If REG_NEWLINE is set, then . and [^...] don't match newline. 7673 Also, regexec will try a match beginning after every newline. 7674 7675 If REG_ICASE is set, then we considers upper- and lowercase 7676 versions of letters to be equivalent when matching. 7677 7678 If REG_NOSUB is set, then when PREG is passed to regexec, that 7679 routine will report only success or failure, and nothing about the 7680 registers. 7681 7682 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for 7683 the return codes and their meanings.) */ 7684 7685 int 7686 regcomp (preg, pattern, cflags) 7687 regex_t *preg; 7688 const char *pattern; 7689 int cflags; 7690 { 7691 reg_errcode_t ret; 7692 reg_syntax_t syntax 7693 = (cflags & REG_EXTENDED) ? 7694 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC; 7695 7696 /* regex_compile will allocate the space for the compiled pattern. */ 7697 preg->buffer = 0; 7698 preg->allocated = 0; 7699 preg->used = 0; 7700 7701 /* Try to allocate space for the fastmap. */ 7702 preg->fastmap = (char *) malloc (1 << BYTEWIDTH); 7703 7704 if (cflags & REG_ICASE) 7705 { 7706 unsigned i; 7707 7708 preg->translate 7709 = (RE_TRANSLATE_TYPE) malloc (CHAR_SET_SIZE 7710 * sizeof (*(RE_TRANSLATE_TYPE)0)); 7711 if (preg->translate == NULL) 7712 return (int) REG_ESPACE; 7713 7714 /* Map uppercase characters to corresponding lowercase ones. */ 7715 for (i = 0; i < CHAR_SET_SIZE; i++) 7716 preg->translate[i] = ISUPPER (i) ? TOLOWER (i) : i; 7717 } 7718 else 7719 preg->translate = NULL; 7720 7721 /* If REG_NEWLINE is set, newlines are treated differently. */ 7722 if (cflags & REG_NEWLINE) 7723 { /* REG_NEWLINE implies neither . nor [^...] match newline. */ 7724 syntax &= ~RE_DOT_NEWLINE; 7725 syntax |= RE_HAT_LISTS_NOT_NEWLINE; 7726 /* It also changes the matching behavior. */ 7727 preg->newline_anchor = 1; 7728 } 7729 else 7730 preg->newline_anchor = 0; 7731 7732 preg->no_sub = !!(cflags & REG_NOSUB); 7733 7734 /* POSIX says a null character in the pattern terminates it, so we 7735 can use strlen here in compiling the pattern. */ 7736 ret = regex_compile (pattern, strlen (pattern), syntax, preg); 7737 7738 /* POSIX doesn't distinguish between an unmatched open-group and an 7739 unmatched close-group: both are REG_EPAREN. */ 7740 if (ret == REG_ERPAREN) ret = REG_EPAREN; 7741 7742 if (ret == REG_NOERROR && preg->fastmap) 7743 { 7744 /* Compute the fastmap now, since regexec cannot modify the pattern 7745 buffer. */ 7746 if (re_compile_fastmap (preg) == -2) 7747 { 7748 /* Some error occurred while computing the fastmap, just forget 7749 about it. */ 7750 free (preg->fastmap); 7751 preg->fastmap = NULL; 7752 } 7753 } 7754 7755 return (int) ret; 7756 } 7757 #ifdef _LIBC 7758 weak_alias (__regcomp, regcomp) 7759 #endif 7760 7761 7762 /* regexec searches for a given pattern, specified by PREG, in the 7763 string STRING. 7764 7765 If NMATCH is zero or REG_NOSUB was set in the cflags argument to 7766 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at 7767 least NMATCH elements, and we set them to the offsets of the 7768 corresponding matched substrings. 7769 7770 EFLAGS specifies `execution flags' which affect matching: if 7771 REG_NOTBOL is set, then ^ does not match at the beginning of the 7772 string; if REG_NOTEOL is set, then $ does not match at the end. 7773 7774 We return 0 if we find a match and REG_NOMATCH if not. */ 7775 7776 int 7777 regexec (preg, string, nmatch, pmatch, eflags) 7778 const regex_t *preg; 7779 const char *string; 7780 size_t nmatch; 7781 regmatch_t pmatch[]; 7782 int eflags; 7783 { 7784 int ret; 7785 struct re_registers regs; 7786 regex_t private_preg; 7787 int len = strlen (string); 7788 boolean want_reg_info = !preg->no_sub && nmatch > 0; 7789 7790 private_preg = *preg; 7791 7792 private_preg.not_bol = !!(eflags & REG_NOTBOL); 7793 private_preg.not_eol = !!(eflags & REG_NOTEOL); 7794 7795 /* The user has told us exactly how many registers to return 7796 information about, via `nmatch'. We have to pass that on to the 7797 matching routines. */ 7798 private_preg.regs_allocated = REGS_FIXED; 7799 7800 if (want_reg_info) 7801 { 7802 regs.num_regs = nmatch; 7803 regs.start = TALLOC (nmatch * 2, regoff_t); 7804 if (regs.start == NULL) 7805 return (int) REG_NOMATCH; 7806 regs.end = regs.start + nmatch; 7807 } 7808 7809 /* Perform the searching operation. */ 7810 ret = re_search (&private_preg, string, len, 7811 /* start: */ 0, /* range: */ len, 7812 want_reg_info ? ®s : (struct re_registers *) 0); 7813 7814 /* Copy the register information to the POSIX structure. */ 7815 if (want_reg_info) 7816 { 7817 if (ret >= 0) 7818 { 7819 unsigned r; 7820 7821 for (r = 0; r < nmatch; r++) 7822 { 7823 pmatch[r].rm_so = regs.start[r]; 7824 pmatch[r].rm_eo = regs.end[r]; 7825 } 7826 } 7827 7828 /* If we needed the temporary register info, free the space now. */ 7829 free (regs.start); 7830 } 7831 7832 /* We want zero return to mean success, unlike `re_search'. */ 7833 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH; 7834 } 7835 #ifdef _LIBC 7836 weak_alias (__regexec, regexec) 7837 #endif 7838 7839 7840 /* Returns a message corresponding to an error code, ERRCODE, returned 7841 from either regcomp or regexec. We don't use PREG here. */ 7842 7843 size_t 7844 regerror (errcode, preg, errbuf, errbuf_size) 7845 int errcode; 7846 const regex_t *preg; 7847 char *errbuf; 7848 size_t errbuf_size; 7849 { 7850 const char *msg; 7851 size_t msg_size; 7852 7853 if (errcode < 0 7854 || errcode >= (int) (sizeof (re_error_msgid_idx) 7855 / sizeof (re_error_msgid_idx[0]))) 7856 /* Only error codes returned by the rest of the code should be passed 7857 to this routine. If we are given anything else, or if other regex 7858 code generates an invalid error code, then the program has a bug. 7859 Dump core so we can fix it. */ 7860 abort (); 7861 7862 msg = gettext (re_error_msgid + re_error_msgid_idx[errcode]); 7863 7864 msg_size = strlen (msg) + 1; /* Includes the null. */ 7865 7866 if (errbuf_size != 0) 7867 { 7868 if (msg_size > errbuf_size) 7869 { 7870 #if defined HAVE_MEMPCPY || defined _LIBC 7871 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0'; 7872 #else 7873 memcpy (errbuf, msg, errbuf_size - 1); 7874 errbuf[errbuf_size - 1] = 0; 7875 #endif 7876 } 7877 else 7878 memcpy (errbuf, msg, msg_size); 7879 } 7880 7881 return msg_size; 7882 } 7883 #ifdef _LIBC 7884 weak_alias (__regerror, regerror) 7885 #endif 7886 7887 7888 /* Free dynamically allocated space used by PREG. */ 7889 7890 void 7891 regfree (preg) 7892 regex_t *preg; 7893 { 7894 if (preg->buffer != NULL) 7895 free (preg->buffer); 7896 preg->buffer = NULL; 7897 7898 preg->allocated = 0; 7899 preg->used = 0; 7900 7901 if (preg->fastmap != NULL) 7902 free (preg->fastmap); 7903 preg->fastmap = NULL; 7904 preg->fastmap_accurate = 0; 7905 7906 if (preg->translate != NULL) 7907 free (preg->translate); 7908 preg->translate = NULL; 7909 } 7910 #ifdef _LIBC 7911 weak_alias (__regfree, regfree) 7912 #endif 7913 7914 #endif /* not emacs */ 7915