regex.c revision 1.1.1.3 1 /* Extended regular expression matching and search library,
2 version 0.12.
3 (Implements POSIX draft P1003.2/D11.2, except for some of the
4 internationalization features.)
5
6 Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
7 2002, 2005, 2010, 2013 Free Software Foundation, Inc.
8 This file is part of the GNU C Library.
9
10 The GNU C Library is free software; you can redistribute it and/or
11 modify it under the terms of the GNU Lesser General Public
12 License as published by the Free Software Foundation; either
13 version 2.1 of the License, or (at your option) any later version.
14
15 The GNU C Library is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 Lesser General Public License for more details.
19
20 You should have received a copy of the GNU Lesser General Public
21 License along with the GNU C Library; if not, write to the Free
22 Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
23 02110-1301 USA. */
24
25 /* This file has been modified for usage in libiberty. It includes "xregex.h"
26 instead of <regex.h>. The "xregex.h" header file renames all external
27 routines with an "x" prefix so they do not collide with the native regex
28 routines or with other components regex routines. */
29 /* AIX requires this to be the first thing in the file. */
30 #if defined _AIX && !defined __GNUC__ && !defined REGEX_MALLOC
31 #pragma alloca
32 #endif
33
34 #undef _GNU_SOURCE
35 #define _GNU_SOURCE
36
37 #ifndef INSIDE_RECURSION
38 # ifdef HAVE_CONFIG_H
39 # include <config.h>
40 # endif
41 #endif
42
43 #include <ansidecl.h>
44
45 #ifndef INSIDE_RECURSION
46
47 # if defined STDC_HEADERS && !defined emacs
48 # include <stddef.h>
49 # define PTR_INT_TYPE ptrdiff_t
50 # else
51 /* We need this for `regex.h', and perhaps for the Emacs include files. */
52 # include <sys/types.h>
53 # define PTR_INT_TYPE long
54 # endif
55
56 # define WIDE_CHAR_SUPPORT (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC)
57
/* For platforms which support the ISO C Amendment 1 functionality, we
   support user-defined character classes.  */
60 # if defined _LIBC || WIDE_CHAR_SUPPORT
61 /* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */
62 # include <wchar.h>
63 # include <wctype.h>
64 # endif
65
66 # ifdef _LIBC
67 /* We have to keep the namespace clean. */
68 # define regfree(preg) __regfree (preg)
69 # define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef)
70 # define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags)
71 # define regerror(errcode, preg, errbuf, errbuf_size) \
72 __regerror(errcode, preg, errbuf, errbuf_size)
73 # define re_set_registers(bu, re, nu, st, en) \
74 __re_set_registers (bu, re, nu, st, en)
75 # define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \
76 __re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
77 # define re_match(bufp, string, size, pos, regs) \
78 __re_match (bufp, string, size, pos, regs)
79 # define re_search(bufp, string, size, startpos, range, regs) \
80 __re_search (bufp, string, size, startpos, range, regs)
81 # define re_compile_pattern(pattern, length, bufp) \
82 __re_compile_pattern (pattern, length, bufp)
83 # define re_set_syntax(syntax) __re_set_syntax (syntax)
84 # define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \
85 __re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
86 # define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)
87
88 # define btowc __btowc
89
90 /* We are also using some library internals. */
91 # include <locale/localeinfo.h>
92 # include <locale/elem-hash.h>
93 # include <langinfo.h>
94 # include <locale/coll-lookup.h>
95 # endif
96
97 /* This is for other GNU distributions with internationalized messages. */
98 # if (HAVE_LIBINTL_H && ENABLE_NLS) || defined _LIBC
99 # include <libintl.h>
100 # ifdef _LIBC
101 # undef gettext
102 # define gettext(msgid) __dcgettext ("libc", msgid, LC_MESSAGES)
103 # endif
104 # else
105 # define gettext(msgid) (msgid)
106 # endif
107
108 # ifndef gettext_noop
109 /* This define is so xgettext can find the internationalizable
110 strings. */
111 # define gettext_noop(String) String
112 # endif
113
114 /* The `emacs' switch turns on certain matching commands
115 that make sense only in Emacs. */
116 # ifdef emacs
117
118 # include "lisp.h"
119 # include "buffer.h"
120 # include "syntax.h"
121
122 # else /* not emacs */
123
124 /* If we are not linking with Emacs proper,
125 we can't use the relocating allocator
126 even if config.h says that we can. */
127 # undef REL_ALLOC
128
129 # if defined STDC_HEADERS || defined _LIBC
130 # include <stdlib.h>
131 # else
132 char *malloc ();
133 char *realloc ();
134 # endif
135
136 /* When used in Emacs's lib-src, we need to get bzero and bcopy somehow.
137 If nothing else has been done, use the method below. */
138 # ifdef INHIBIT_STRING_HEADER
139 # if !(defined HAVE_BZERO && defined HAVE_BCOPY)
140 # if !defined bzero && !defined bcopy
141 # undef INHIBIT_STRING_HEADER
142 # endif
143 # endif
144 # endif
145
146 /* This is the normal way of making sure we have a bcopy and a bzero.
147 This is used in most programs--a few other programs avoid this
148 by defining INHIBIT_STRING_HEADER. */
149 # ifndef INHIBIT_STRING_HEADER
150 # if defined HAVE_STRING_H || defined STDC_HEADERS || defined _LIBC
151 # include <string.h>
152 # ifndef bzero
153 # ifndef _LIBC
154 # define bzero(s, n) (memset (s, '\0', n), (s))
155 # else
156 # define bzero(s, n) __bzero (s, n)
157 # endif
158 # endif
159 # else
160 # include <strings.h>
161 # ifndef memcmp
162 # define memcmp(s1, s2, n) bcmp (s1, s2, n)
163 # endif
164 # ifndef memcpy
165 # define memcpy(d, s, n) (bcopy (s, d, n), (d))
166 # endif
167 # endif
168 # endif
169
170 /* Define the syntax stuff for \<, \>, etc. */
171
172 /* This must be nonzero for the wordchar and notwordchar pattern
173 commands in re_match_2. */
174 # ifndef Sword
175 # define Sword 1
176 # endif
177
178 # ifdef SWITCH_ENUM_BUG
179 # define SWITCH_ENUM_CAST(x) ((int)(x))
180 # else
181 # define SWITCH_ENUM_CAST(x) (x)
182 # endif
183
184 # endif /* not emacs */
185
186 # if defined _LIBC || HAVE_LIMITS_H
187 # include <limits.h>
188 # endif
189
190 # ifndef MB_LEN_MAX
191 # define MB_LEN_MAX 1
192 # endif
193
194 /* Get the interface, including the syntax bits. */
196 # include "xregex.h" /* change for libiberty */
197
198 /* isalpha etc. are used for the character classes. */
199 # include <ctype.h>
200
201 /* Jim Meyering writes:
202
203 "... Some ctype macros are valid only for character codes that
204 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
205 using /bin/cc or gcc but without giving an ansi option). So, all
206 ctype uses should be through macros like ISPRINT... If
207 STDC_HEADERS is defined, then autoconf has verified that the ctype
208 macros don't need to be guarded with references to isascii. ...
209 Defining isascii to 1 should let any compiler worth its salt
210 eliminate the && through constant folding."
211 Solaris defines some of these symbols so we must undefine them first. */
212
213 # undef ISASCII
214 # if defined STDC_HEADERS || (!defined isascii && !defined HAVE_ISASCII)
215 # define ISASCII(c) 1
216 # else
217 # define ISASCII(c) isascii(c)
218 # endif
219
220 # ifdef isblank
221 # define ISBLANK(c) (ISASCII (c) && isblank (c))
222 # else
223 # define ISBLANK(c) ((c) == ' ' || (c) == '\t')
224 # endif
225 # ifdef isgraph
226 # define ISGRAPH(c) (ISASCII (c) && isgraph (c))
227 # else
228 # define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
229 # endif
230
231 # undef ISPRINT
232 # define ISPRINT(c) (ISASCII (c) && isprint (c))
233 # define ISDIGIT(c) (ISASCII (c) && isdigit (c))
234 # define ISALNUM(c) (ISASCII (c) && isalnum (c))
235 # define ISALPHA(c) (ISASCII (c) && isalpha (c))
236 # define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
237 # define ISLOWER(c) (ISASCII (c) && islower (c))
238 # define ISPUNCT(c) (ISASCII (c) && ispunct (c))
239 # define ISSPACE(c) (ISASCII (c) && isspace (c))
240 # define ISUPPER(c) (ISASCII (c) && isupper (c))
241 # define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
242
243 # ifdef _tolower
244 # define TOLOWER(c) _tolower(c)
245 # else
246 # define TOLOWER(c) tolower(c)
247 # endif
248
249 # ifndef NULL
250 # define NULL (void *)0
251 # endif
252
253 /* We remove any previous definition of `SIGN_EXTEND_CHAR',
254 since ours (we hope) works properly with all combinations of
255 machines, compilers, `char' and `unsigned char' argument types.
256 (Per Bothner suggested the basic approach.) */
257 # undef SIGN_EXTEND_CHAR
258 # if __STDC__
259 # define SIGN_EXTEND_CHAR(c) ((signed char) (c))
260 # else /* not __STDC__ */
261 /* As in Harbison and Steele. */
262 # define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
263 # endif
264
265 # ifndef emacs
267 /* How many characters in the character set. */
268 # define CHAR_SET_SIZE 256
269
270 # ifdef SYNTAX_TABLE
271
272 extern char *re_syntax_table;
273
274 # else /* not SYNTAX_TABLE */
275
276 static char re_syntax_table[CHAR_SET_SIZE];
277
278 static void init_syntax_once (void);
279
280 static void
281 init_syntax_once (void)
282 {
283 register int c;
284 static int done = 0;
285
286 if (done)
287 return;
288 bzero (re_syntax_table, sizeof re_syntax_table);
289
290 for (c = 0; c < CHAR_SET_SIZE; ++c)
291 if (ISALNUM (c))
292 re_syntax_table[c] = Sword;
293
294 re_syntax_table['_'] = Sword;
295
296 done = 1;
297 }
298
299 # endif /* not SYNTAX_TABLE */
300
301 # define SYNTAX(c) re_syntax_table[(unsigned char) (c)]
302
303 # endif /* emacs */
304
305 /* Integer type for pointers. */
307 # if !defined _LIBC && !defined HAVE_UINTPTR_T
308 typedef unsigned long int uintptr_t;
309 # endif
310
311 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
312 use `alloca' instead of `malloc'. This is because using malloc in
313 re_search* or re_match* could cause memory leaks when C-g is used in
314 Emacs; also, malloc is slower and causes storage fragmentation. On
315 the other hand, malloc is more portable, and easier to debug.
316
317 Because we sometimes use alloca, some routines have to be macros,
318 not functions -- `alloca'-allocated space disappears at the end of the
319 function it is called in. */
320
321 # ifdef REGEX_MALLOC
322
323 # define REGEX_ALLOCATE malloc
324 # define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
325 # define REGEX_FREE free
326
327 # else /* not REGEX_MALLOC */
328
329 /* Emacs already defines alloca, sometimes. */
330 # ifndef alloca
331
332 /* Make alloca work the best possible way. */
333 # ifdef __GNUC__
334 # define alloca __builtin_alloca
335 # else /* not __GNUC__ */
336 # if HAVE_ALLOCA_H
337 # include <alloca.h>
338 # endif /* HAVE_ALLOCA_H */
339 # endif /* not __GNUC__ */
340
341 # endif /* not alloca */
342
343 # define REGEX_ALLOCATE alloca
344
345 /* Assumes a `char *destination' variable. */
346 # define REGEX_REALLOCATE(source, osize, nsize) \
347 (destination = (char *) alloca (nsize), \
348 memcpy (destination, source, osize))
349
350 /* No need to do anything to free, after alloca. */
351 # define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
352
353 # endif /* not REGEX_MALLOC */
354
355 /* Define how to allocate the failure stack. */
356
357 # if defined REL_ALLOC && defined REGEX_MALLOC
358
359 # define REGEX_ALLOCATE_STACK(size) \
360 r_alloc (&failure_stack_ptr, (size))
361 # define REGEX_REALLOCATE_STACK(source, osize, nsize) \
362 r_re_alloc (&failure_stack_ptr, (nsize))
363 # define REGEX_FREE_STACK(ptr) \
364 r_alloc_free (&failure_stack_ptr)
365
366 # else /* not using relocating allocator */
367
368 # ifdef REGEX_MALLOC
369
370 # define REGEX_ALLOCATE_STACK malloc
371 # define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
372 # define REGEX_FREE_STACK free
373
374 # else /* not REGEX_MALLOC */
375
376 # define REGEX_ALLOCATE_STACK alloca
377
378 # define REGEX_REALLOCATE_STACK(source, osize, nsize) \
379 REGEX_REALLOCATE (source, osize, nsize)
380 /* No need to explicitly free anything. */
381 # define REGEX_FREE_STACK(arg)
382
383 # endif /* not REGEX_MALLOC */
384 # endif /* not using relocating allocator */
385
386
/* True if `size1' is nonzero and PTR is pointing anywhere inside
   `string1' or just past its end.  This works if PTR is NULL, which is
   a good thing.  */
390 # define FIRST_STRING_P(ptr) \
391 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
392
393 /* (Re)Allocate N items of type T using malloc, or fail. */
394 # define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
395 # define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
396 # define RETALLOC_IF(addr, n, t) \
397 if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
398 # define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
399
400 # define BYTEWIDTH 8 /* In bits. */
401
402 # define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
403
404 # undef MAX
405 # undef MIN
406 # define MAX(a, b) ((a) > (b) ? (a) : (b))
407 # define MIN(a, b) ((a) < (b) ? (a) : (b))
408
409 typedef char boolean;
410 # define false 0
411 # define true 1
412
413 static reg_errcode_t byte_regex_compile (const char *pattern, size_t size,
414 reg_syntax_t syntax,
415 struct re_pattern_buffer *bufp);
416
417 static int byte_re_match_2_internal (struct re_pattern_buffer *bufp,
418 const char *string1, int size1,
419 const char *string2, int size2,
420 int pos,
421 struct re_registers *regs,
422 int stop);
423 static int byte_re_search_2 (struct re_pattern_buffer *bufp,
424 const char *string1, int size1,
425 const char *string2, int size2,
426 int startpos, int range,
427 struct re_registers *regs, int stop);
428 static int byte_re_compile_fastmap (struct re_pattern_buffer *bufp);
429
430 #ifdef MBS_SUPPORT
431 static reg_errcode_t wcs_regex_compile (const char *pattern, size_t size,
432 reg_syntax_t syntax,
433 struct re_pattern_buffer *bufp);
434
435
436 static int wcs_re_match_2_internal (struct re_pattern_buffer *bufp,
437 const char *cstring1, int csize1,
438 const char *cstring2, int csize2,
439 int pos,
440 struct re_registers *regs,
441 int stop,
442 wchar_t *string1, int size1,
443 wchar_t *string2, int size2,
444 int *mbs_offset1, int *mbs_offset2);
445 static int wcs_re_search_2 (struct re_pattern_buffer *bufp,
446 const char *string1, int size1,
447 const char *string2, int size2,
448 int startpos, int range,
449 struct re_registers *regs, int stop);
450 static int wcs_re_compile_fastmap (struct re_pattern_buffer *bufp);
451 #endif
452
453 /* These are the command codes that appear in compiled regular
455 expressions. Some opcodes are followed by argument bytes. A
456 command code can specify any interpretation whatsoever for its
457 arguments. Zero bytes may appear in the compiled regular expression. */
458
/* Command codes of the compiled pattern.  The numeric values follow
   from the declaration order, so enumerators must not be reordered.  */
typedef enum
{
  no_op = 0,

  /* Succeed immediately; no further backtracking is done.  */
  succeed,

  /* One byte holding a count n follows, then n literal bytes.  */
  exactn,

# ifdef MBS_SUPPORT
  /* Like exactn, except that the payload is binary data.  */
  exactn_bin,
# endif

  /* Matches (more or less) any character.  */
  anychar,

  /* Matches one character that belongs to the given set.  The byte
     that follows is the number of bitmap bytes; the bitmap itself
     comes next and tells which characters are members.  Within each
     byte the bits are ordered low-bit-first, and a set bit means the
     character is a member.  A character too large to have a bit in
     the map is automatically not a member.  */
  /* With MBS_SUPPORT, what follows instead is the length of the
     character classes, of the collating symbols, of the equivalence
     classes, of the character ranges, and of the characters; after
     those lengths come the character class elements, collating symbol
     elements, equivalence class elements, range elements, and
     character elements.
     See the regex_compile function.  */
  charset,

  /* Takes the same parameters as charset, but matches any character
     that is NOT one of those specified.  */
  charset_not,

  /* Begin remembering the matched text so that it can be stored in a
     register.  One byte with the register number follows, in the
     range 0 to one less than the pattern buffer's re_nsub field;
     after it comes one byte with the number of groups inner to this
     one.  (The latter has to be part of start_memory only because it
     is needed in the on_failure_jump of re_match_2.)  */
  start_memory,

  /* Finish remembering the matched text and store it in a memory
     register.  One byte with the register number follows, in the
     range 0 to one less than `re_nsub' in the pattern buffer, and
     then one byte with the number of inner groups, exactly as for
     `start_memory'.  (The inner-group count is needed here because
     there is no easy way of finding the corresponding start_memory
     from a stop_memory.)  */
  stop_memory,

  /* Match a duplicate of something remembered.  One byte containing
     the register number follows.  */
  duplicate,

  /* Fail unless at the beginning of a line.  */
  begline,

  /* Fail unless at the end of a line.  */
  endline,

  /* Succeeds at the beginning of the buffer (if emacs) or at the
     beginning of the string to be matched (if not).  */
  begbuf,

  /* The analogue of begbuf for the end of the buffer/string.  */
  endbuf,

  /* A two-byte relative address to jump to follows.  */
  jump,

  /* Same as jump, but marks the end of an alternative.  */
  jump_past_alt,

  /* A two-byte relative address follows: the place to resume at in
     case of failure.  */
  /* With MBS_SUPPORT the address has size 1.  */
  on_failure_jump,

  /* Like on_failure_jump, except that a placeholder is pushed instead
     of the current string position when executed.  */
  on_failure_keep_string_jump,

  /* Discard the latest failure point, then jump to the two-byte
     relative address that follows.  */
  /* With MBS_SUPPORT the address has size 1.  */
  pop_failure_jump,

  /* Becomes pop_failure_jump when it is known that no backtracking
     will be needed to match; otherwise becomes jump.  It is used to
     jump back to the beginning of a repeat.  If what follows this
     jump clearly won't match what the repeat does, so that there is
     no use backtracking out of repetitions already matched, it is
     changed to a pop_failure_jump.  A two-byte address follows.  */
  /* With MBS_SUPPORT the address has size 1.  */
  maybe_pop_jump,

  /* Jump to the two-byte address that follows and push a dummy
     failure point.  That failure point is thrown away if an attempt
     is made to use it for a failure.  A `+' construct makes this
     before the first repeat.  It is also used as an intermediary kind
     of jump when compiling an alternative.  */
  /* With MBS_SUPPORT the address has size 1.  */
  dummy_failure_jump,

  /* Push a dummy failure point and continue.  Used at the end of
     alternatives.  */
  push_dummy_failure,

  /* A two-byte relative address and a two-byte number n follow.
     After matching N times, jump to the address upon failure.  */
  /* With MBS_SUPPORT the address has size 1.  */
  succeed_n,

  /* A two-byte relative address and a two-byte number n follow.
     Jump to the address N times, then fail.  */
  /* With MBS_SUPPORT the address has size 1.  */
  jump_n,

  /* Set the two-byte relative address that follows to the two-byte
     number after it.  The address *includes* the two bytes of
     number.  */
  /* With MBS_SUPPORT the address has size 1.  */
  set_number_at,

  /* Matches any word-constituent character.  */
  wordchar,

  /* Matches any character that is not a word-constituent.  */
  notwordchar,

  /* Succeeds if at word beginning.  */
  wordbeg,

  /* Succeeds if at word end.  */
  wordend,

  /* Succeeds if at a word boundary.  */
  wordbound,

  /* Succeeds if not at a word boundary.  */
  notwordbound

# ifdef emacs
  /* Succeeds if before point.  */
  , before_dot,

  /* Succeeds if at point.  */
  at_dot,

  /* Succeeds if after point.  */
  after_dot,

  /* Matches any character whose syntax is specified.  A byte holding
     a syntax code, e.g., Sword, follows.  */
  syntaxspec,

  /* Matches any character whose syntax is not that specified.  */
  notsyntaxspec
# endif /* emacs */
} re_opcode_t;
611 #endif /* not INSIDE_RECURSION */
612
613
615 #ifdef BYTE
616 # define CHAR_T char
617 # define UCHAR_T unsigned char
618 # define COMPILED_BUFFER_VAR bufp->buffer
619 # define OFFSET_ADDRESS_SIZE 2
620 # define PREFIX(name) byte_##name
621 # define ARG_PREFIX(name) name
622 # define PUT_CHAR(c) putchar (c)
623 #else
624 # ifdef WCHAR
625 # define CHAR_T wchar_t
626 # define UCHAR_T wchar_t
627 # define COMPILED_BUFFER_VAR wc_buffer
628 # define OFFSET_ADDRESS_SIZE 1 /* the size which STORE_NUMBER macro use */
629 # define CHAR_CLASS_SIZE ((__alignof__(wctype_t)+sizeof(wctype_t))/sizeof(CHAR_T)+1)
630 # define PREFIX(name) wcs_##name
631 # define ARG_PREFIX(name) c##name
632 /* Should we use wide stream?? */
633 # define PUT_CHAR(c) printf ("%C", c);
634 # define TRUE 1
635 # define FALSE 0
636 # else
637 # ifdef MBS_SUPPORT
638 # define WCHAR
639 # define INSIDE_RECURSION
640 # include "regex.c"
641 # undef INSIDE_RECURSION
642 # endif
643 # define BYTE
644 # define INSIDE_RECURSION
645 # include "regex.c"
646 # undef INSIDE_RECURSION
647 # endif
648 #endif
649
650 #ifdef INSIDE_RECURSION
651 /* Common operations on the compiled pattern. */
652
653 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */
654 /* ifdef MBS_SUPPORT, we store NUMBER in 1 element. */
655
656 # ifdef WCHAR
657 # define STORE_NUMBER(destination, number) \
658 do { \
659 *(destination) = (UCHAR_T)(number); \
660 } while (0)
661 # else /* BYTE */
662 # define STORE_NUMBER(destination, number) \
663 do { \
664 (destination)[0] = (number) & 0377; \
665 (destination)[1] = (number) >> 8; \
666 } while (0)
667 # endif /* WCHAR */
668
669 /* Same as STORE_NUMBER, except increment DESTINATION to
670 the byte after where the number is stored. Therefore, DESTINATION
671 must be an lvalue. */
672 /* ifdef MBS_SUPPORT, we store NUMBER in 1 element. */
673
674 # define STORE_NUMBER_AND_INCR(destination, number) \
675 do { \
676 STORE_NUMBER (destination, number); \
677 (destination) += OFFSET_ADDRESS_SIZE; \
678 } while (0)
679
/* Put into DESTINATION the number stored at SOURCE.  In the BYTE build
   the number occupies two contiguous bytes: the low byte first, then
   a high byte that is sign-extended.  */
/* ifdef MBS_SUPPORT, the NUMBER is stored in 1 element.  */

# ifdef WCHAR
#  define EXTRACT_NUMBER(destination, source)                           \
  do {                                                                  \
    (destination) = *(source);                                          \
  } while (0)
# else /* BYTE */
/* The sign-extended high byte is scaled by multiplication rather than
   by `<< 8': left-shifting a negative value is undefined behavior in
   C, and the high byte is negative whenever the stored number is.  */
#  define EXTRACT_NUMBER(destination, source)                           \
  do {                                                                  \
    (destination) = *(source) & 0377;                                   \
    (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) * 256;          \
  } while (0)
# endif
696
697 # ifdef DEBUG
698 static void PREFIX(extract_number) (int *dest, UCHAR_T *source);
699 static void
700 PREFIX(extract_number) (int *dest, UCHAR_T *source)
701 {
702 # ifdef WCHAR
703 *dest = *source;
704 # else /* BYTE */
705 int temp = SIGN_EXTEND_CHAR (*(source + 1));
706 *dest = *source & 0377;
707 *dest += temp << 8;
708 # endif
709 }
710
711 # ifndef EXTRACT_MACROS /* To debug the macros. */
712 # undef EXTRACT_NUMBER
713 # define EXTRACT_NUMBER(dest, src) PREFIX(extract_number) (&dest, src)
714 # endif /* not EXTRACT_MACROS */
715
716 # endif /* DEBUG */
717
718 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
719 SOURCE must be an lvalue. */
720
721 # define EXTRACT_NUMBER_AND_INCR(destination, source) \
722 do { \
723 EXTRACT_NUMBER (destination, source); \
724 (source) += OFFSET_ADDRESS_SIZE; \
725 } while (0)
726
727 # ifdef DEBUG
728 static void PREFIX(extract_number_and_incr) (int *destination,
729 UCHAR_T **source);
730 static void
731 PREFIX(extract_number_and_incr) (int *destination, UCHAR_T **source)
732 {
733 PREFIX(extract_number) (destination, *source);
734 *source += OFFSET_ADDRESS_SIZE;
735 }
736
737 # ifndef EXTRACT_MACROS
738 # undef EXTRACT_NUMBER_AND_INCR
739 # define EXTRACT_NUMBER_AND_INCR(dest, src) \
740 PREFIX(extract_number_and_incr) (&dest, &src)
741 # endif /* not EXTRACT_MACROS */
742
743 # endif /* DEBUG */
744
745
746
748 /* If DEBUG is defined, Regex prints many voluminous messages about what
749 it is doing (if the variable `debug' is nonzero). If linked with the
750 main program in `iregex.c', you can enter patterns and strings
751 interactively. And if linked with the main program in `main.c' and
752 the other test files, you can run the already-written tests. */
753
754 # ifdef DEBUG
755
756 # ifndef DEFINED_ONCE
757
758 /* We use standard I/O for debugging. */
759 # include <stdio.h>
760
761 /* It is useful to test things that ``must'' be true when debugging. */
762 # include <assert.h>
763
764 static int debug;
765
766 # define DEBUG_STATEMENT(e) e
767 # define DEBUG_PRINT1(x) if (debug) printf (x)
768 # define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2)
769 # define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3)
770 # define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4)
771 # endif /* not DEFINED_ONCE */
772
773 # define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
774 if (debug) PREFIX(print_partial_compiled_pattern) (s, e)
775 # define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
776 if (debug) PREFIX(print_double_string) (w, s1, sz1, s2, sz2)
777
778
779 /* Print the fastmap in human-readable form. */
780
781 # ifndef DEFINED_ONCE
782 void
783 print_fastmap (char *fastmap)
784 {
785 unsigned was_a_range = 0;
786 unsigned i = 0;
787
788 while (i < (1 << BYTEWIDTH))
789 {
790 if (fastmap[i++])
791 {
792 was_a_range = 0;
793 putchar (i - 1);
794 while (i < (1 << BYTEWIDTH) && fastmap[i])
795 {
796 was_a_range = 1;
797 i++;
798 }
799 if (was_a_range)
800 {
801 printf ("-");
802 putchar (i - 1);
803 }
804 }
805 }
806 putchar ('\n');
807 }
808 # endif /* not DEFINED_ONCE */
809
810
811 /* Print a compiled pattern string in human-readable form, starting at
812 the START pointer into it and ending just before the pointer END. */
813
814 void
815 PREFIX(print_partial_compiled_pattern) (UCHAR_T *start, UCHAR_T *end)
816 {
817 int mcnt, mcnt2;
818 UCHAR_T *p1;
819 UCHAR_T *p = start;
820 UCHAR_T *pend = end;
821
822 if (start == NULL)
823 {
824 printf ("(null)\n");
825 return;
826 }
827
828 /* Loop over pattern commands. */
829 while (p < pend)
830 {
831 # ifdef _LIBC
832 printf ("%td:\t", p - start);
833 # else
834 printf ("%ld:\t", (long int) (p - start));
835 # endif
836
837 switch ((re_opcode_t) *p++)
838 {
839 case no_op:
840 printf ("/no_op");
841 break;
842
843 case exactn:
844 mcnt = *p++;
845 printf ("/exactn/%d", mcnt);
846 do
847 {
848 putchar ('/');
849 PUT_CHAR (*p++);
850 }
851 while (--mcnt);
852 break;
853
854 # ifdef MBS_SUPPORT
855 case exactn_bin:
856 mcnt = *p++;
857 printf ("/exactn_bin/%d", mcnt);
858 do
859 {
860 printf("/%lx", (long int) *p++);
861 }
862 while (--mcnt);
863 break;
864 # endif /* MBS_SUPPORT */
865
866 case start_memory:
867 mcnt = *p++;
868 printf ("/start_memory/%d/%ld", mcnt, (long int) *p++);
869 break;
870
871 case stop_memory:
872 mcnt = *p++;
873 printf ("/stop_memory/%d/%ld", mcnt, (long int) *p++);
874 break;
875
876 case duplicate:
877 printf ("/duplicate/%ld", (long int) *p++);
878 break;
879
880 case anychar:
881 printf ("/anychar");
882 break;
883
884 case charset:
885 case charset_not:
886 {
887 # ifdef WCHAR
888 int i, length;
889 wchar_t *workp = p;
890 printf ("/charset [%s",
891 (re_opcode_t) *(workp - 1) == charset_not ? "^" : "");
892 p += 5;
893 length = *workp++; /* the length of char_classes */
894 for (i=0 ; i<length ; i++)
895 printf("[:%lx:]", (long int) *p++);
896 length = *workp++; /* the length of collating_symbol */
897 for (i=0 ; i<length ;)
898 {
899 printf("[.");
900 while(*p != 0)
901 PUT_CHAR((i++,*p++));
902 i++,p++;
903 printf(".]");
904 }
905 length = *workp++; /* the length of equivalence_class */
906 for (i=0 ; i<length ;)
907 {
908 printf("[=");
909 while(*p != 0)
910 PUT_CHAR((i++,*p++));
911 i++,p++;
912 printf("=]");
913 }
914 length = *workp++; /* the length of char_range */
915 for (i=0 ; i<length ; i++)
916 {
917 wchar_t range_start = *p++;
918 wchar_t range_end = *p++;
919 printf("%C-%C", range_start, range_end);
920 }
921 length = *workp++; /* the length of char */
922 for (i=0 ; i<length ; i++)
923 printf("%C", *p++);
924 putchar (']');
925 # else
926 register int c, last = -100;
927 register int in_range = 0;
928
929 printf ("/charset [%s",
930 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
931
932 assert (p + *p < pend);
933
934 for (c = 0; c < 256; c++)
935 if (c / 8 < *p
936 && (p[1 + (c/8)] & (1 << (c % 8))))
937 {
938 /* Are we starting a range? */
939 if (last + 1 == c && ! in_range)
940 {
941 putchar ('-');
942 in_range = 1;
943 }
944 /* Have we broken a range? */
945 else if (last + 1 != c && in_range)
946 {
947 putchar (last);
948 in_range = 0;
949 }
950
951 if (! in_range)
952 putchar (c);
953
954 last = c;
955 }
956
957 if (in_range)
958 putchar (last);
959
960 putchar (']');
961
962 p += 1 + *p;
963 # endif /* WCHAR */
964 }
965 break;
966
967 case begline:
968 printf ("/begline");
969 break;
970
971 case endline:
972 printf ("/endline");
973 break;
974
975 case on_failure_jump:
976 PREFIX(extract_number_and_incr) (&mcnt, &p);
977 # ifdef _LIBC
978 printf ("/on_failure_jump to %td", p + mcnt - start);
979 # else
980 printf ("/on_failure_jump to %ld", (long int) (p + mcnt - start));
981 # endif
982 break;
983
984 case on_failure_keep_string_jump:
985 PREFIX(extract_number_and_incr) (&mcnt, &p);
986 # ifdef _LIBC
987 printf ("/on_failure_keep_string_jump to %td", p + mcnt - start);
988 # else
989 printf ("/on_failure_keep_string_jump to %ld",
990 (long int) (p + mcnt - start));
991 # endif
992 break;
993
994 case dummy_failure_jump:
995 PREFIX(extract_number_and_incr) (&mcnt, &p);
996 # ifdef _LIBC
997 printf ("/dummy_failure_jump to %td", p + mcnt - start);
998 # else
999 printf ("/dummy_failure_jump to %ld", (long int) (p + mcnt - start));
1000 # endif
1001 break;
1002
1003 case push_dummy_failure:
1004 printf ("/push_dummy_failure");
1005 break;
1006
1007 case maybe_pop_jump:
1008 PREFIX(extract_number_and_incr) (&mcnt, &p);
1009 # ifdef _LIBC
1010 printf ("/maybe_pop_jump to %td", p + mcnt - start);
1011 # else
1012 printf ("/maybe_pop_jump to %ld", (long int) (p + mcnt - start));
1013 # endif
1014 break;
1015
1016 case pop_failure_jump:
1017 PREFIX(extract_number_and_incr) (&mcnt, &p);
1018 # ifdef _LIBC
1019 printf ("/pop_failure_jump to %td", p + mcnt - start);
1020 # else
1021 printf ("/pop_failure_jump to %ld", (long int) (p + mcnt - start));
1022 # endif
1023 break;
1024
1025 case jump_past_alt:
1026 PREFIX(extract_number_and_incr) (&mcnt, &p);
1027 # ifdef _LIBC
1028 printf ("/jump_past_alt to %td", p + mcnt - start);
1029 # else
1030 printf ("/jump_past_alt to %ld", (long int) (p + mcnt - start));
1031 # endif
1032 break;
1033
1034 case jump:
1035 PREFIX(extract_number_and_incr) (&mcnt, &p);
1036 # ifdef _LIBC
1037 printf ("/jump to %td", p + mcnt - start);
1038 # else
1039 printf ("/jump to %ld", (long int) (p + mcnt - start));
1040 # endif
1041 break;
1042
1043 case succeed_n:
1044 PREFIX(extract_number_and_incr) (&mcnt, &p);
1045 p1 = p + mcnt;
1046 PREFIX(extract_number_and_incr) (&mcnt2, &p);
1047 # ifdef _LIBC
1048 printf ("/succeed_n to %td, %d times", p1 - start, mcnt2);
1049 # else
1050 printf ("/succeed_n to %ld, %d times",
1051 (long int) (p1 - start), mcnt2);
1052 # endif
1053 break;
1054
1055 case jump_n:
1056 PREFIX(extract_number_and_incr) (&mcnt, &p);
1057 p1 = p + mcnt;
1058 PREFIX(extract_number_and_incr) (&mcnt2, &p);
1059 printf ("/jump_n to %d, %d times", p1 - start, mcnt2);
1060 break;
1061
1062 case set_number_at:
1063 PREFIX(extract_number_and_incr) (&mcnt, &p);
1064 p1 = p + mcnt;
1065 PREFIX(extract_number_and_incr) (&mcnt2, &p);
1066 # ifdef _LIBC
1067 printf ("/set_number_at location %td to %d", p1 - start, mcnt2);
1068 # else
1069 printf ("/set_number_at location %ld to %d",
1070 (long int) (p1 - start), mcnt2);
1071 # endif
1072 break;
1073
1074 case wordbound:
1075 printf ("/wordbound");
1076 break;
1077
1078 case notwordbound:
1079 printf ("/notwordbound");
1080 break;
1081
1082 case wordbeg:
1083 printf ("/wordbeg");
1084 break;
1085
1086 case wordend:
1087 printf ("/wordend");
1088 break;
1089
1090 # ifdef emacs
1091 case before_dot:
1092 printf ("/before_dot");
1093 break;
1094
1095 case at_dot:
1096 printf ("/at_dot");
1097 break;
1098
1099 case after_dot:
1100 printf ("/after_dot");
1101 break;
1102
1103 case syntaxspec:
1104 printf ("/syntaxspec");
1105 mcnt = *p++;
1106 printf ("/%d", mcnt);
1107 break;
1108
1109 case notsyntaxspec:
1110 printf ("/notsyntaxspec");
1111 mcnt = *p++;
1112 printf ("/%d", mcnt);
1113 break;
1114 # endif /* emacs */
1115
1116 case wordchar:
1117 printf ("/wordchar");
1118 break;
1119
1120 case notwordchar:
1121 printf ("/notwordchar");
1122 break;
1123
1124 case begbuf:
1125 printf ("/begbuf");
1126 break;
1127
1128 case endbuf:
1129 printf ("/endbuf");
1130 break;
1131
1132 default:
1133 printf ("?%ld", (long int) *(p-1));
1134 }
1135
1136 putchar ('\n');
1137 }
1138
1139 # ifdef _LIBC
1140 printf ("%td:\tend of pattern.\n", p - start);
1141 # else
1142 printf ("%ld:\tend of pattern.\n", (long int) (p - start));
1143 # endif
1144 }
1145
1146
1147 void
1148 PREFIX(print_compiled_pattern) (struct re_pattern_buffer *bufp)
1149 {
1150 UCHAR_T *buffer = (UCHAR_T*) bufp->buffer;
1151
1152 PREFIX(print_partial_compiled_pattern) (buffer, buffer
1153 + bufp->used / sizeof(UCHAR_T));
1154 printf ("%ld bytes used/%ld bytes allocated.\n",
1155 bufp->used, bufp->allocated);
1156
1157 if (bufp->fastmap_accurate && bufp->fastmap)
1158 {
1159 printf ("fastmap: ");
1160 print_fastmap (bufp->fastmap);
1161 }
1162
1163 # ifdef _LIBC
1164 printf ("re_nsub: %Zd\t", bufp->re_nsub);
1165 # else
1166 printf ("re_nsub: %ld\t", (long int) bufp->re_nsub);
1167 # endif
1168 printf ("regs_alloc: %d\t", bufp->regs_allocated);
1169 printf ("can_be_null: %d\t", bufp->can_be_null);
1170 printf ("newline_anchor: %d\n", bufp->newline_anchor);
1171 printf ("no_sub: %d\t", bufp->no_sub);
1172 printf ("not_bol: %d\t", bufp->not_bol);
1173 printf ("not_eol: %d\t", bufp->not_eol);
1174 printf ("syntax: %lx\n", bufp->syntax);
1175 /* Perhaps we should print the translate table? */
1176 }
1177
1178
1179 void
1180 PREFIX(print_double_string) (const CHAR_T *where, const CHAR_T *string1,
1181 int size1, const CHAR_T *string2, int size2)
1182 {
1183 int this_char;
1184
1185 if (where == NULL)
1186 printf ("(null)");
1187 else
1188 {
1189 int cnt;
1190
1191 if (FIRST_STRING_P (where))
1192 {
1193 for (this_char = where - string1; this_char < size1; this_char++)
1194 PUT_CHAR (string1[this_char]);
1195
1196 where = string2;
1197 }
1198
1199 cnt = 0;
1200 for (this_char = where - string2; this_char < size2; this_char++)
1201 {
1202 PUT_CHAR (string2[this_char]);
1203 if (++cnt > 100)
1204 {
1205 fputs ("...", stdout);
1206 break;
1207 }
1208 }
1209 }
1210 }
1211
1212 # ifndef DEFINED_ONCE
/* Write the single character C to stderr.  */
void
printchar (int c)
{
  fputc (c, stderr);
}
1218 # endif
1219
1220 # else /* not DEBUG */
1221
# ifndef DEFINED_ONCE
/* In a non-DEBUG build, `assert' and every debugging hook below expand
   to nothing, so code that uses them compiles without the debugging
   output.  */
#  undef assert
#  define assert(e)

#  define DEBUG_STATEMENT(e)
#  define DEBUG_PRINT1(x)
#  define DEBUG_PRINT2(x1, x2)
#  define DEBUG_PRINT3(x1, x2, x3)
#  define DEBUG_PRINT4(x1, x2, x3, x4)
# endif /* not DEFINED_ONCE */
/* These two stubs sit outside the DEFINED_ONCE guard, so they are
   (re)defined on every inclusion of this section.  */
# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
1234
1235 # endif /* not DEBUG */
1236
1237
1238
1240 # ifdef WCHAR
1241 /* This convert a multibyte string to a wide character string.
1242 And write their correspondances to offset_buffer(see below)
1243 and write whether each wchar_t is binary data to is_binary.
1244 This assume invalid multibyte sequences as binary data.
1245 We assume offset_buffer and is_binary is already allocated
1246 enough space. */
1247
1248 static size_t convert_mbs_to_wcs (CHAR_T *dest, const unsigned char* src,
1249 size_t len, int *offset_buffer,
1250 char *is_binary);
1251 static size_t
1252 convert_mbs_to_wcs (CHAR_T *dest, const unsigned char*src, size_t len,
1253 int *offset_buffer, char *is_binary)
1254 /* It hold correspondances between src(char string) and
1255 dest(wchar_t string) for optimization.
1256 e.g. src = "xxxyzz"
1257 dest = {'X', 'Y', 'Z'}
1258 (each "xxx", "y" and "zz" represent one multibyte character
1259 corresponding to 'X', 'Y' and 'Z'.)
1260 offset_buffer = {0, 0+3("xxx"), 0+3+1("y"), 0+3+1+2("zz")}
1261 = {0, 3, 4, 6}
1262 */
1263 {
1264 wchar_t *pdest = dest;
1265 const unsigned char *psrc = src;
1266 size_t wc_count = 0;
1267
1268 mbstate_t mbs;
1269 int i, consumed;
1270 size_t mb_remain = len;
1271 size_t mb_count = 0;
1272
1273 /* Initialize the conversion state. */
1274 memset (&mbs, 0, sizeof (mbstate_t));
1275
1276 offset_buffer[0] = 0;
1277 for( ; mb_remain > 0 ; ++wc_count, ++pdest, mb_remain -= consumed,
1278 psrc += consumed)
1279 {
1280 #ifdef _LIBC
1281 consumed = __mbrtowc (pdest, psrc, mb_remain, &mbs);
1282 #else
1283 consumed = mbrtowc (pdest, psrc, mb_remain, &mbs);
1284 #endif
1285
1286 if (consumed <= 0)
1287 /* failed to convert. maybe src contains binary data.
1288 So we consume 1 byte manualy. */
1289 {
1290 *pdest = *psrc;
1291 consumed = 1;
1292 is_binary[wc_count] = TRUE;
1293 }
1294 else
1295 is_binary[wc_count] = FALSE;
1296 /* In sjis encoding, we use yen sign as escape character in
1297 place of reverse solidus. So we convert 0x5c(yen sign in
1298 sjis) to not 0xa5(yen sign in UCS2) but 0x5c(reverse
1299 solidus in UCS2). */
1300 if (consumed == 1 && (int) *psrc == 0x5c && (int) *pdest == 0xa5)
1301 *pdest = (wchar_t) *psrc;
1302
1303 offset_buffer[wc_count + 1] = mb_count += consumed;
1304 }
1305
1306 /* Fill remain of the buffer with sentinel. */
1307 for (i = wc_count + 1 ; i <= len ; i++)
1308 offset_buffer[i] = mb_count + 1;
1309
1310 return wc_count;
1311 }
1312
1313 # endif /* WCHAR */
1314
1315 #else /* not INSIDE_RECURSION */
1316
/* The current regexp syntax to recognize.  Set by `re_set_syntax',
   which also hands back the previous value.  Can also be assigned to
   arbitrarily: each pattern buffer stores its own syntax, so it can be
   changed between regex compilations.  */
/* This has no initializer because initialized variables in Emacs
   become read-only after dumping.  */
reg_syntax_t re_syntax_options;
1323
1324
1325 /* Specify the precise syntax of regexps for compilation. This provides
1326 for compatibility for various utilities which historically have
1327 different, incompatible syntaxes.
1328
1329 The argument SYNTAX is a bit mask comprised of the various bits
1330 defined in regex.h. We return the old syntax. */
1331
1332 reg_syntax_t
1333 re_set_syntax (reg_syntax_t syntax)
1334 {
1335 reg_syntax_t ret = re_syntax_options;
1336
1337 re_syntax_options = syntax;
1338 # ifdef DEBUG
1339 if (syntax & RE_DEBUG)
1340 debug = 1;
1341 else if (debug) /* was on but now is not */
1342 debug = 0;
1343 # endif /* DEBUG */
1344 return ret;
1345 }
1346 # ifdef _LIBC
1347 weak_alias (__re_set_syntax, re_set_syntax)
1348 # endif
1349
/* This table gives an error message for each of the error codes listed
   in regex.h.  The table is positional, so the order of the entries
   here has to be the same as the order of the codes there; the comment
   on each entry names the code it belongs to.
   POSIX doesn't require that we do anything for REG_NOERROR,
   but why not be nice?  */

static const char *re_error_msgid[] =
  {
    gettext_noop ("Success"),	/* REG_NOERROR */
    gettext_noop ("No match"),	/* REG_NOMATCH */
    gettext_noop ("Invalid regular expression"), /* REG_BADPAT */
    gettext_noop ("Invalid collation character"), /* REG_ECOLLATE */
    gettext_noop ("Invalid character class name"), /* REG_ECTYPE */
    gettext_noop ("Trailing backslash"), /* REG_EESCAPE */
    gettext_noop ("Invalid back reference"), /* REG_ESUBREG */
    gettext_noop ("Unmatched [ or [^"),	/* REG_EBRACK */
    gettext_noop ("Unmatched ( or \\("), /* REG_EPAREN */
    gettext_noop ("Unmatched \\{"), /* REG_EBRACE */
    gettext_noop ("Invalid content of \\{\\}"), /* REG_BADBR */
    gettext_noop ("Invalid range end"),	/* REG_ERANGE */
    gettext_noop ("Memory exhausted"), /* REG_ESPACE */
    gettext_noop ("Invalid preceding regular expression"), /* REG_BADRPT */
    gettext_noop ("Premature end of regular expression"), /* REG_EEND */
    gettext_noop ("Regular expression too big"), /* REG_ESIZE */
    gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
  };
1376
1377 #endif /* INSIDE_RECURSION */
1379
1380 #ifndef DEFINED_ONCE
1381 /* Avoiding alloca during matching, to placate r_alloc. */
1382
1383 /* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1384 searching and matching functions should not call alloca. On some
1385 systems, alloca is implemented in terms of malloc, and if we're
1386 using the relocating allocator routines, then malloc could cause a
1387 relocation, which might (if the strings being searched are in the
1388 ralloc heap) shift the data out from underneath the regexp
1389 routines.
1390
1391 Here's another reason to avoid allocation: Emacs
1392 processes input from X in a signal handler; processing X input may
1393 call malloc; if input arrives while a matching routine is calling
1394 malloc, then we're scrod. But Emacs can't just block input while
1395 calling matching routines; then we don't notice interrupts when
1396 they come in. So, Emacs blocks input around all regexp calls
1397 except the matching calls, which it leaves unprotected, in the
1398 faith that they will not malloc. */
1399
/* Normally, this is fine: start with allocation during matching
   allowed, and withdraw the permission below if it turns out to be
   unsafe.  */
# define MATCH_MAY_ALLOCATE

/* When using GNU C, we are not REALLY using the C alloca, no matter
   what config.h may say.  So don't take precautions for it.  */
# ifdef __GNUC__
#  undef C_ALLOCA
# endif

/* The match routines may not allocate if (1) they would do it with malloc
   and (2) it's not safe for them to use malloc.  Condition (1) is tested
   as C_ALLOCA or REGEX_MALLOC being defined, condition (2) as building
   for Emacs.
   Note that if REL_ALLOC is defined, matching would not use malloc for the
   failure stack, but we would still use it for the register vectors;
   so REL_ALLOC should not affect this.  */
# if (defined C_ALLOCA || defined REGEX_MALLOC) && defined emacs
#  undef MATCH_MAY_ALLOCATE
# endif
1417 #endif /* not DEFINED_ONCE */
1418
1419 #ifdef INSIDE_RECURSION
1421 /* Failure stack declarations and macros; both re_compile_fastmap and
1422 re_match_2 use a failure stack. These have to be macros because of
1423 REGEX_ALLOCATE_STACK. */
1424
1425
/* Number of failure-stack elements for which to initially allocate
   space when matching (INIT_FAIL_STACK allocates this many elements and
   records it as the stack size).  If this number is exceeded, we
   allocate more space, so it is not a hard limit.  May be overridden by
   defining it before this point.  */
# ifndef INIT_FAILURE_ALLOC
#  define INIT_FAILURE_ALLOC 5
# endif
1432
/* Roughly the maximum number of failure points on the stack.  Would be
   exactly that if always used MAX_FAILURE_ITEMS items each time we failed.
   This is a variable only so users of regex can assign to it; we never
   change it ourselves.

   Two parallel sets of definitions follow.  With INT_IS_16BIT defined,
   `re_max_failures', the integer member of a stack element and the
   stack's size/avail counters use `long' types; otherwise they use
   plain `int' / `unsigned'.  */

# ifdef INT_IS_16BIT

#  ifndef DEFINED_ONCE
#   if defined MATCH_MAY_ALLOCATE
/* 4400 was enough to cause a crash on Alpha OSF/1,
   whose default stack limit is 2mb.  */
long int re_max_failures = 4000;
#   else
long int re_max_failures = 2000;
#   endif
#  endif

/* One slot of the failure stack: either a pointer or an integer.  */
union PREFIX(fail_stack_elt)
{
  UCHAR_T *pointer;
  long int integer;
};

typedef union PREFIX(fail_stack_elt) PREFIX(fail_stack_elt_t);

/* The failure stack itself: an array of SIZE slots of which the first
   AVAIL are in use.  */
typedef struct
{
  PREFIX(fail_stack_elt_t) *stack;
  unsigned long int size;
  unsigned long int avail;		/* Offset of next open position.  */
} PREFIX(fail_stack_type);

# else /* not INT_IS_16BIT */

#  ifndef DEFINED_ONCE
#   if defined MATCH_MAY_ALLOCATE
/* 4400 was enough to cause a crash on Alpha OSF/1,
   whose default stack limit is 2mb.  */
int re_max_failures = 4000;
#   else
int re_max_failures = 2000;
#   endif
#  endif

/* One slot of the failure stack: either a pointer or an integer.  */
union PREFIX(fail_stack_elt)
{
  UCHAR_T *pointer;
  int integer;
};

typedef union PREFIX(fail_stack_elt) PREFIX(fail_stack_elt_t);

/* The failure stack itself: an array of SIZE slots of which the first
   AVAIL are in use.  */
typedef struct
{
  PREFIX(fail_stack_elt_t) *stack;
  unsigned size;
  unsigned avail;			/* Offset of next open position.  */
} PREFIX(fail_stack_type);

# endif /* INT_IS_16BIT */
1493
# ifndef DEFINED_ONCE
/* Predicates on the failure stack.  FAIL_STACK_EMPTY and FAIL_STACK_FULL
   operate on a variable that must be named `fail_stack';
   FAIL_STACK_PTR_EMPTY operates on a pointer that must be named
   `fail_stack_ptr'.  */
#  define FAIL_STACK_EMPTY()     (fail_stack.avail == 0)
#  define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0)
#  define FAIL_STACK_FULL()      (fail_stack.avail == fail_stack.size)
# endif
1499
1500
/* Define macros to initialize and free the failure stack.  Both assume
   a variable named `fail_stack'.

   When MATCH_MAY_ALLOCATE is defined, INIT_FAIL_STACK allocates room
   for INIT_FAILURE_ALLOC elements with REGEX_ALLOCATE_STACK and does
   `return -2' from the enclosing function if the alloc fails;
   RESET_FAIL_STACK hands the storage to REGEX_FREE_STACK.

   Otherwise INIT_FAIL_STACK only marks the stack empty and
   RESET_FAIL_STACK does nothing; the storage must then come from
   somewhere outside these macros.  */

# ifdef MATCH_MAY_ALLOCATE
#  define INIT_FAIL_STACK()						\
  do {									\
    fail_stack.stack = (PREFIX(fail_stack_elt_t) *)			\
      REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * sizeof (PREFIX(fail_stack_elt_t))); \
									\
    if (fail_stack.stack == NULL)					\
      return -2;							\
									\
    fail_stack.size = INIT_FAILURE_ALLOC;				\
    fail_stack.avail = 0;						\
  } while (0)

#  define RESET_FAIL_STACK()  REGEX_FREE_STACK (fail_stack.stack)
# else
#  define INIT_FAIL_STACK()						\
  do {									\
    fail_stack.avail = 0;						\
  } while (0)

#  define RESET_FAIL_STACK()
# endif
1526
1527
/* Double the size of FAIL_STACK, up to approximately `re_max_failures' items.

   Return 1 if succeeds, and 0 if either ran out of memory
   allocating space for it or it was already too large.

   The limit is tested against the size BEFORE doubling, so a stack
   whose size does not yet exceed re_max_failures * MAX_FAILURE_ITEMS
   is still doubled once more; hence "approximately".

   The result of the reallocation is stored into FAIL_STACK.stack before
   it is checked, so after a failed reallocation that member is NULL.
   NOTE(review): whether the previous storage is still reachable in that
   case depends on how REGEX_REALLOCATE_STACK is defined -- confirm.

   REGEX_REALLOCATE_STACK requires `destination' be declared.  */

# define DOUBLE_FAIL_STACK(fail_stack)					\
  ((fail_stack).size > (unsigned) (re_max_failures * MAX_FAILURE_ITEMS)	\
   ? 0									\
   : ((fail_stack).stack = (PREFIX(fail_stack_elt_t) *)			\
        REGEX_REALLOCATE_STACK ((fail_stack).stack, 			\
          (fail_stack).size * sizeof (PREFIX(fail_stack_elt_t)),	\
          ((fail_stack).size << 1) * sizeof (PREFIX(fail_stack_elt_t))),\
									\
      (fail_stack).stack == NULL					\
      ? 0								\
      : ((fail_stack).size <<= 1, 					\
         1)))
1547
1548
/* Push pointer POINTER on FAIL_STACK.
   Return 1 if was able to do so and 0 if ran out of memory allocating
   space to do so.

   The fullness test is made on FAIL_STACK itself, i.e. on the same
   stack that is then doubled and pushed onto, rather than on whatever
   variable happens to be called `fail_stack' at the point of use.
   POINTER is parenthesized so that any expression may be passed.  */
# define PUSH_PATTERN_OP(POINTER, FAIL_STACK)				\
  ((((FAIL_STACK).avail == (FAIL_STACK).size)				\
    && !DOUBLE_FAIL_STACK (FAIL_STACK))					\
   ? 0									\
   : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = (POINTER),	\
      1))
1558
/* Push a pointer value onto the failure stack.
   Assumes the variable `fail_stack'.  Probably should only
   be called from within `PUSH_FAILURE_POINT'.
   No room check is made here (nor in the two macros below); the caller
   must already have made sure a free slot exists.  */
# define PUSH_FAILURE_POINTER(item)					\
  fail_stack.stack[fail_stack.avail++].pointer = (UCHAR_T *) (item)

/* This pushes an integer-valued item onto the failure stack.
   Assumes the variable `fail_stack'.  Probably should only
   be called from within `PUSH_FAILURE_POINT'.  */
# define PUSH_FAILURE_INT(item)					\
  fail_stack.stack[fail_stack.avail++].integer = (item)

/* Push a fail_stack_elt_t value onto the failure stack.
   Assumes the variable `fail_stack'.  Probably should only
   be called from within `PUSH_FAILURE_POINT'.  */
# define PUSH_FAILURE_ELT(item)					\
  fail_stack.stack[fail_stack.avail++] =  (item)

/* These three POP... operations complement the three PUSH... operations.
   All assume that `fail_stack' is nonempty.  Each one decrements
   `avail' and yields the member of the slot that was on top.  */
# define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
# define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
# define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
1582
/* Used to omit pushing failure point id's when we're not debugging.
   With DEBUG, DEBUG_PUSH pushes an integer and DEBUG_POP pops one into
   *ITEM_ADDR; without DEBUG both expand to nothing, so the id occupies
   no stack slot.  */
# ifdef DEBUG
#  define DEBUG_PUSH PUSH_FAILURE_INT
#  define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_INT ()
# else
#  define DEBUG_PUSH(item)
#  define DEBUG_POP(item_addr)
# endif
1591
1592
/* Push the information about the state we will need
   if we ever fail back to it.

   The items are pushed in this order:
     - for each register from lowest_active_reg up to highest_active_reg:
       its regstart pointer, its regend pointer and its reg_info word;
     - lowest_active_reg, then highest_active_reg;
     - PATTERN_PLACE, then STRING_PLACE;
     - the failure id (through DEBUG_PUSH, i.e. only when debugging).
   Before pushing, the stack is doubled as often as needed to hold
   NUM_FAILURE_ITEMS more slots.

   Requires variables fail_stack, regstart, regend, reg_info, and
   num_regs_pushed be declared.  The expansion also refers to
   lowest_active_reg and highest_active_reg and, in the debugging
   output, to failure_id, nfailure_points_pushed, bufp, pend, string1,
   size1, string2 and size2.  DOUBLE_FAIL_STACK requires `destination'
   be declared; it is declared locally here.

   Does `return FAILURE_CODE' if runs out of memory.  */

# define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code)	\
  do {									\
    char *destination;							\
    /* Must be int, so when we don't save any registers, the arithmetic	\
       of 0 + -1 isn't done as unsigned.  */				\
    /* Can't be int, since there is not a shred of a guarantee that int	\
       is wide enough to hold a value of something to which pointer can	\
       be assigned */							\
    active_reg_t this_reg;						\
    									\
    DEBUG_STATEMENT (failure_id++);					\
    DEBUG_STATEMENT (nfailure_points_pushed++);				\
    DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id);		\
    DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail);\
    DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\
									\
    DEBUG_PRINT2 (" slots needed: %ld\n", NUM_FAILURE_ITEMS);		\
    DEBUG_PRINT2 (" available: %d\n", REMAINING_AVAIL_SLOTS);	\
									\
    /* Ensure we have enough space allocated for what we will push.  */	\
    while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS)			\
      {									\
        if (!DOUBLE_FAIL_STACK (fail_stack))				\
          return failure_code;						\
									\
        DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n",		\
		       (fail_stack).size);				\
        DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\
      }									\
									\
    /* Push the info, starting with the registers.  */			\
    DEBUG_PRINT1 ("\n");						\
									\
    if (1)								\
      for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \
	   this_reg++)							\
	{								\
	  DEBUG_PRINT2 (" Pushing reg: %lu\n", this_reg);		\
	  DEBUG_STATEMENT (num_regs_pushed++);				\
									\
	  DEBUG_PRINT2 (" start: %p\n", regstart[this_reg]);		\
	  PUSH_FAILURE_POINTER (regstart[this_reg]);			\
									\
	  DEBUG_PRINT2 (" end: %p\n", regend[this_reg]);		\
	  PUSH_FAILURE_POINTER (regend[this_reg]);			\
									\
	  DEBUG_PRINT2 (" info: %p\n ",				\
			reg_info[this_reg].word.pointer);		\
	  DEBUG_PRINT2 (" match_null=%d",				\
			REG_MATCH_NULL_STRING_P (reg_info[this_reg]));	\
	  DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg]));	\
	  DEBUG_PRINT2 (" matched_something=%d",			\
			MATCHED_SOMETHING (reg_info[this_reg]));	\
	  DEBUG_PRINT2 (" ever_matched=%d",				\
			EVER_MATCHED_SOMETHING (reg_info[this_reg]));	\
	  DEBUG_PRINT1 ("\n");						\
	  PUSH_FAILURE_ELT (reg_info[this_reg].word);			\
	}								\
									\
    DEBUG_PRINT2 (" Pushing low active reg: %ld\n", lowest_active_reg);\
    PUSH_FAILURE_INT (lowest_active_reg);				\
									\
    DEBUG_PRINT2 (" Pushing high active reg: %ld\n", highest_active_reg);\
    PUSH_FAILURE_INT (highest_active_reg);				\
									\
    DEBUG_PRINT2 (" Pushing pattern %p:\n", pattern_place);		\
    DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend);		\
    PUSH_FAILURE_POINTER (pattern_place);				\
									\
    DEBUG_PRINT2 (" Pushing string %p: `", string_place);		\
    DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2,   \
				 size2);				\
    DEBUG_PRINT1 ("'\n");						\
    PUSH_FAILURE_POINTER (string_place);				\
									\
    DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id);		\
    DEBUG_PUSH (failure_id);						\
  } while (0)
1680
# ifndef DEFINED_ONCE
/* This is the number of items that are pushed and popped on the stack
   for each register.  */
#  define NUM_REG_ITEMS  3

/* Individual items aside from the registers.  */
#  ifdef DEBUG
#   define NUM_NONREG_ITEMS 5 /* Includes failure point id.  */
#  else
#   define NUM_NONREG_ITEMS 4
#  endif

/* We push at most this many items on the stack.  */
/* We used to use (num_regs - 1), which is the number of registers
   this regexp will save; but that was changed to 5
   to avoid stack overflow for a regexp with lots of parens.  */
#  define MAX_FAILURE_ITEMS (5 * NUM_REG_ITEMS + NUM_NONREG_ITEMS)

/* We actually push this many items.  The `0 ? 0 :' always selects the
   second operand, so the count is the number of active registers times
   NUM_REG_ITEMS, plus NUM_NONREG_ITEMS.  Assumes the variables
   `highest_active_reg' and `lowest_active_reg'.  */
#  define NUM_FAILURE_ITEMS				\
  (((0							\
     ? 0 : highest_active_reg - lowest_active_reg + 1)	\
    * NUM_REG_ITEMS)					\
   + NUM_NONREG_ITEMS)

/* How many items can still be added to the stack without overflowing it.
   Assumes the variable `fail_stack'.  */
#  define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
# endif /* not DEFINED_ONCE */
1709
1710
/* Pops what PUSH_FAILURE_POINT pushes, in the reverse order: the
   failure id (only when debugging), the string position, the pattern
   position, the highest and lowest active register numbers, and then
   for each register from HIGH_REG down to LOW_REG its info word, end
   pointer and start pointer.

   We restore into the parameters, all of which should be lvalues:
     STR -- the saved data position.  Left unchanged when the saved
            value is NULL.
     PAT -- the saved pattern position.
     LOW_REG, HIGH_REG -- the lowest and highest active registers.
     REGSTART, REGEND -- arrays of string positions.
     REG_INFO -- array of information about each subexpression.

   Also resets `set_regs_matched_done' to 0.

   Note that the `else' arm near the end belongs to `if (1)', so it is
   never executed.

   Also assumes the variables `fail_stack' and (if debugging), `bufp',
   `pend', `string1', `size1', `string2', and `size2'.  */
# define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info)\
{									\
  DEBUG_STATEMENT (unsigned failure_id;)				\
  active_reg_t this_reg;						\
  const UCHAR_T *string_temp;						\
									\
  assert (!FAIL_STACK_EMPTY ());					\
									\
  /* Remove failure points and point to how many regs pushed.  */	\
  DEBUG_PRINT1 ("POP_FAILURE_POINT:\n");				\
  DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail);	\
  DEBUG_PRINT2 (" size: %d\n", fail_stack.size);	\
									\
  assert (fail_stack.avail >= NUM_NONREG_ITEMS);			\
									\
  DEBUG_POP (&failure_id);						\
  DEBUG_PRINT2 (" Popping failure id: %u\n", failure_id);		\
									\
  /* If the saved string location is NULL, it came from an		\
     on_failure_keep_string_jump opcode, and we want to throw away the	\
     saved NULL, thus retaining our current position in the string.  */	\
  string_temp = POP_FAILURE_POINTER ();					\
  if (string_temp != NULL)						\
    str = (const CHAR_T *) string_temp;					\
									\
  DEBUG_PRINT2 (" Popping string %p: `", str);				\
  DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2);	\
  DEBUG_PRINT1 ("'\n");							\
									\
  pat = (UCHAR_T *) POP_FAILURE_POINTER ();				\
  DEBUG_PRINT2 (" Popping pattern %p:\n", pat);			\
  DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend);			\
									\
  /* Restore register info.  */						\
  high_reg = (active_reg_t) POP_FAILURE_INT ();				\
  DEBUG_PRINT2 (" Popping high active reg: %ld\n", high_reg);		\
									\
  low_reg = (active_reg_t) POP_FAILURE_INT ();				\
  DEBUG_PRINT2 (" Popping low active reg: %ld\n", low_reg);		\
									\
  if (1)								\
    for (this_reg = high_reg; this_reg >= low_reg; this_reg--)		\
      {									\
	DEBUG_PRINT2 (" Popping reg: %ld\n", this_reg);		\
									\
	reg_info[this_reg].word = POP_FAILURE_ELT ();			\
	DEBUG_PRINT2 (" info: %p\n",					\
		      reg_info[this_reg].word.pointer);			\
									\
	regend[this_reg] = (const CHAR_T *) POP_FAILURE_POINTER ();	\
	DEBUG_PRINT2 (" end: %p\n", regend[this_reg]);		\
									\
	regstart[this_reg] = (const CHAR_T *) POP_FAILURE_POINTER ();	\
	DEBUG_PRINT2 (" start: %p\n", regstart[this_reg]);		\
      }									\
  else									\
    {									\
      for (this_reg = highest_active_reg; this_reg > high_reg; this_reg--) \
	{								\
	  reg_info[this_reg].word.integer = 0;				\
	  regend[this_reg] = 0;						\
	  regstart[this_reg] = 0;					\
	}								\
      highest_active_reg = high_reg;					\
    }									\
									\
  set_regs_matched_done = 0;						\
  DEBUG_STATEMENT (nfailure_points_popped++);				\
} /* POP_FAILURE_POINT */
1791
/* Structure for per-register (a.k.a. per-group) information.
   Other register information, such as the
   starting and ending positions (which are addresses), and the list of
   inner groups (which is a bits list) are maintained in separate
   variables.

   The flags are overlaid, through a union, on a failure-stack element
   so that the whole record can be pushed and popped as one stack item.

   We are making a (strictly speaking) nonportable assumption here: that
   the compiler will pack our bit fields into something that fits into
   the type of `word', i.e., is something that fits into one item on the
   failure stack.  */


/* Declarations and macros for re_match_2.  */

typedef union
{
  PREFIX(fail_stack_elt_t) word;
  struct
  {
      /* This field is one if this group can match the empty string,
         zero if not.  If not yet determined,  `MATCH_NULL_UNSET_VALUE'.  */
# define MATCH_NULL_UNSET_VALUE 3
    unsigned match_null_string_p : 2;
    unsigned is_active : 1;
    unsigned matched_something : 1;
    unsigned ever_matched_something : 1;
  } bits;
} PREFIX(register_info_type);
1821
# ifndef DEFINED_ONCE
/* Accessors for the flag bits of a register_info_type R.  */
#  define REG_MATCH_NULL_STRING_P(R)  ((R).bits.match_null_string_p)
#  define IS_ACTIVE(R)  ((R).bits.is_active)
#  define MATCHED_SOMETHING(R)  ((R).bits.matched_something)
#  define EVER_MATCHED_SOMETHING(R)  ((R).bits.ever_matched_something)


/* Call this when have matched a real character; it sets `matched' flags
   for the subexpressions which we are currently inside.  Also records
   that those subexprs have matched.

   The work is done only while `set_regs_matched_done' is zero; the
   macro then sets it to 1, so repeated calls do nothing until something
   clears the flag again.  Assumes the variables `set_regs_matched_done',
   `lowest_active_reg', `highest_active_reg' and `reg_info'.  */
#  define SET_REGS_MATCHED()						\
  do									\
    {									\
      if (!set_regs_matched_done)					\
	{								\
	  active_reg_t r;						\
	  set_regs_matched_done = 1;					\
	  for (r = lowest_active_reg; r <= highest_active_reg; r++)	\
	    {								\
	      MATCHED_SOMETHING (reg_info[r])				\
		= EVER_MATCHED_SOMETHING (reg_info[r])			\
		= 1;							\
	    }								\
	}								\
    }									\
  while (0)
# endif /* not DEFINED_ONCE */
1849
/* Registers are set to a sentinel when they haven't yet matched.  The
   sentinel is the address of a dummy object; REG_UNSET tests a register
   pointer E against that address.  */
static CHAR_T PREFIX(reg_unset_dummy);
# define REG_UNSET_VALUE (&PREFIX(reg_unset_dummy))
# define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
1854
/* Subroutine declarations and macros for regex_compile.  */

/* Helpers that write or insert an opcode with one or two integer
   arguments at LOC; the STORE_JUMP and INSERT_JUMP families of macros
   are defined in terms of them.  */
static void PREFIX(store_op1) (re_opcode_t op, UCHAR_T *loc, int arg);
static void PREFIX(store_op2) (re_opcode_t op, UCHAR_T *loc,
                               int arg1, int arg2);
static void PREFIX(insert_op1) (re_opcode_t op, UCHAR_T *loc,
                                int arg, UCHAR_T *end);
static void PREFIX(insert_op2) (re_opcode_t op, UCHAR_T *loc,
                                int arg1, int arg2, UCHAR_T *end);
/* Predicates on a position in the uncompiled pattern.  */
static boolean PREFIX(at_begline_loc_p) (const CHAR_T *pattern,
                                         const CHAR_T *p,
                                         reg_syntax_t syntax);
static boolean PREFIX(at_endline_loc_p) (const CHAR_T *p,
                                         const CHAR_T *pend,
                                         reg_syntax_t syntax);
/* Range compilation has a separate routine, with a different signature,
   for the wide-character and the byte builds.  */
# ifdef WCHAR
static reg_errcode_t wcs_compile_range (CHAR_T range_start,
                                        const CHAR_T **p_ptr,
                                        const CHAR_T *pend,
                                        char *translate,
                                        reg_syntax_t syntax,
                                        UCHAR_T *b,
                                        CHAR_T *char_set);
static void insert_space (int num, CHAR_T *loc, CHAR_T *end);
# else /* BYTE */
static reg_errcode_t byte_compile_range (unsigned int range_start,
                                         const char **p_ptr,
                                         const char *pend,
                                         char *translate,
                                         reg_syntax_t syntax,
                                         unsigned char *b);
# endif /* WCHAR */
1886
/* Fetch the next character in the uncompiled pattern---translating it
   if necessary.  Also cast from a signed character in the constant
   string passed to us by the user to an unsigned char that we can use
   as an array index (in, e.g., `translate').

   Assumes the variables `p', `pend' and `translate', and does
   `return REG_EEND' from the enclosing function when the pattern is
   exhausted.  */
/* ifdef MBS_SUPPORT, we translate only if character <= 0xff,
   because it is impossible to allocate 4GB array for some encodings
   which have 4 byte character_set like UCS4.  */
# ifndef PATFETCH
#  ifdef WCHAR
#   define PATFETCH(c)							\
  do {if (p == pend) return REG_EEND;					\
    c = (UCHAR_T) *p++;							\
    if (translate && (c <= 0xff)) c = (UCHAR_T) translate[c];		\
  } while (0)
#  else /* BYTE */
#   define PATFETCH(c)							\
  do {if (p == pend) return REG_EEND;					\
    c = (unsigned char) *p++;						\
    if (translate) c = (unsigned char) translate[c];			\
  } while (0)
#  endif /* WCHAR */
# endif

/* Fetch the next character in the uncompiled pattern, with no
   translation.  Like PATFETCH, returns REG_EEND from the enclosing
   function at the end of the pattern.  */
# define PATFETCH_RAW(c)						\
  do {if (p == pend) return REG_EEND;					\
    c = (UCHAR_T) *p++; 	       					\
  } while (0)

/* Go backwards one character in the pattern.  */
# define PATUNFETCH p--
1920
/* If `translate' is non-null, return translate[D], else just D.  We
   cast the subscript to translate because some data is declared as
   `char *', to avoid warnings when a string constant is passed.  But
   when we use a character as a subscript we must make it unsigned.
   Assumes a variable named `translate'.  */
/* ifdef MBS_SUPPORT, we translate only if character <= 0xff,
   because it is impossible to allocate 4GB array for some encodings
   which have 4 byte character_set like UCS4.  */

# ifndef TRANSLATE
#  ifdef WCHAR
#   define TRANSLATE(d) \
  ((translate && ((UCHAR_T) (d)) <= 0xff) \
   ? (char) translate[(unsigned char) (d)] : (d))
# else /* BYTE */
#   define TRANSLATE(d) \
  (translate ? (char) translate[(unsigned char) (d)] : (char) (d))
#  endif /* WCHAR */
# endif
1939
1940
/* Macros for outputting the compiled pattern into `buffer'.  They
   assume the variables `b' (the current end of the compiled code) and
   `bufp' (the pattern buffer).  */

/* If the buffer isn't allocated when it comes in, use this.  */
# define INIT_BUF_SIZE  (32 * sizeof(UCHAR_T))

/* Make sure we have room for at least N more elements in the buffer:
   N * sizeof (CHAR_T) bytes in the WCHAR build, N bytes in the BYTE
   build.  EXTEND_BUFFER is invoked repeatedly until that is the case.  */
# ifdef WCHAR
#  define GET_BUFFER_SPACE(n)						\
    while (((unsigned long)b - (unsigned long)COMPILED_BUFFER_VAR	\
            + (n)*sizeof(CHAR_T)) > bufp->allocated)			\
      EXTEND_BUFFER ()
# else /* BYTE */
#  define GET_BUFFER_SPACE(n)						\
    while ((unsigned long) (b - bufp->buffer + (n)) > bufp->allocated)	\
      EXTEND_BUFFER ()
# endif /* WCHAR */

/* Make sure we have one more element of buffer space and then add C
   to it.  */
# define BUF_PUSH(c)							\
  do {									\
    GET_BUFFER_SPACE (1);						\
    *b++ = (UCHAR_T) (c);						\
  } while (0)


/* Ensure we have two more elements of buffer space and then append C1
   and C2.  */
# define BUF_PUSH_2(c1, c2)						\
  do {									\
    GET_BUFFER_SPACE (2);						\
    *b++ = (UCHAR_T) (c1);						\
    *b++ = (UCHAR_T) (c2);						\
  } while (0)


/* As with BUF_PUSH_2, except for three elements.  */
# define BUF_PUSH_3(c1, c2, c3)						\
  do {									\
    GET_BUFFER_SPACE (3);						\
    *b++ = (UCHAR_T) (c1);						\
    *b++ = (UCHAR_T) (c2);						\
    *b++ = (UCHAR_T) (c3);						\
  } while (0)
1983
/* Store a jump with opcode OP at LOC to location TO.  We store a
   relative address offset by the (1 + OFFSET_ADDRESS_SIZE) elements the
   jump itself occupies: one for the opcode plus OFFSET_ADDRESS_SIZE
   for its address argument.  */
# define STORE_JUMP(op, loc, to) \
 PREFIX(store_op1) (op, loc, (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)))

/* Likewise, for a two-argument jump.  */
# define STORE_JUMP2(op, loc, to, arg) \
  PREFIX(store_op2) (op, loc, (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)), arg)

/* Like `STORE_JUMP', but for inserting.  Assume `b' is the buffer end.  */
# define INSERT_JUMP(op, loc, to) \
  PREFIX(insert_op1) (op, loc, (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)), b)

/* Like `STORE_JUMP2', but for inserting.  Assume `b' is the buffer end.  */
# define INSERT_JUMP2(op, loc, to, arg) \
  PREFIX(insert_op2) (op, loc, (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)),\
	      arg, b)
2001
2002 /* This is not an arbitrary limit: the arguments which represent offsets
2003 into the pattern are two bytes long. So if 2^16 bytes turns out to
2004 be too small, many things would have to change. */
2005 /* Any other compiler which, like MSC, has an allocation limit below 2^16
2006 bytes will have to use an approach similar to what was done below for
2007 MSC and drop MAX_BUF_SIZE a bit. Otherwise you may end up
2008 reallocating to 0 bytes. Such a thing is not going to work too well.
2009 You have been warned!! */
2010 # ifndef DEFINED_ONCE
2011 # if defined _MSC_VER && !defined WIN32
2012 /* Microsoft C 16-bit versions limit malloc to approx 65512 bytes.
2013 The REALLOC define eliminates a flurry of conversion warnings,
2014 but is not required. */
2015 # define MAX_BUF_SIZE 65500L
2016 # define REALLOC(p,s) realloc ((p), (size_t) (s))
2017 # else
2018 # define MAX_BUF_SIZE (1L << 16)
2019 # define REALLOC(p,s) realloc ((p), (s))
2020 # endif
2021
2022 /* Helpers for EXTEND_BUFFER, which doubles the buffer's size via realloc
2023 and resets the pointers that pointed into the old block to point to the
2024 correct places in the new one. The new size is clamped to MAX_BUF_SIZE;
2025 a buffer already at that limit makes EXTEND_BUFFER return REG_ESIZE.
   MOVE_BUFFER_POINTER adds `incr' (a local declared by EXTEND_BUFFER) to
   a pointer.  ELSE_EXTEND_BUFFER_HIGH_BOUND expands to nothing unless
   __BOUNDED_POINTERS__ is set; in that case it supplies the `else' branch
   of EXTEND_BUFFER's "did the buffer move?" test and refreshes the high
   bound of each pointer into the buffer.  */
2026 # if __BOUNDED_POINTERS__
2027 # define SET_HIGH_BOUND(P) (__ptrhigh (P) = __ptrlow (P) + bufp->allocated)
2028 # define MOVE_BUFFER_POINTER(P) \
2029 (__ptrlow (P) += incr, SET_HIGH_BOUND (P), __ptrvalue (P) += incr)
2030 # define ELSE_EXTEND_BUFFER_HIGH_BOUND \
2031 else \
2032 { \
2033 SET_HIGH_BOUND (b); \
2034 SET_HIGH_BOUND (begalt); \
2035 if (fixup_alt_jump) \
2036 SET_HIGH_BOUND (fixup_alt_jump); \
2037 if (laststart) \
2038 SET_HIGH_BOUND (laststart); \
2039 if (pending_exact) \
2040 SET_HIGH_BOUND (pending_exact); \
2041 }
2042 # else
2043 # define MOVE_BUFFER_POINTER(P) (P) += incr
2044 # define ELSE_EXTEND_BUFFER_HIGH_BOUND
2045 # endif
2046 # endif /* not DEFINED_ONCE */
2047
/* EXTEND_BUFFER: grow the compiled-pattern buffer and re-base the pointers
   that refer into it.

   Both variants double bufp->allocated, clamp it to MAX_BUF_SIZE,
   reallocate, and, if the block moved, add the displacement to `b',
   `begalt' and (when non-null) `fixup_alt_jump', `laststart' and
   `pending_exact'.  All of those, plus `bufp', must be in scope at the
   expansion site.

   The macro `return's from the enclosing function with REG_ESIZE when
   the buffer is already at its size limit, and with REG_ESPACE when the
   reallocation yields NULL.

   The WCHAR variant additionally rounds the new size down to a whole
   number of UCHAR_T elements (at least one) and keeps bufp->buffer in
   step with COMPILED_BUFFER_VAR.

   NOTE(review): on the REG_ESPACE path the buffer pointer has already
   been overwritten with the NULL result, and the return does not go
   through FREE_STACK_RETURN -- confirm whether the old block and the
   compile stack are released elsewhere.  */
2048 # ifdef WCHAR
2049 # define EXTEND_BUFFER() \
2050 do { \
2051 UCHAR_T *old_buffer = COMPILED_BUFFER_VAR; \
2052 int wchar_count; \
2053 if (bufp->allocated + sizeof(UCHAR_T) > MAX_BUF_SIZE) \
2054 return REG_ESIZE; \
2055 bufp->allocated <<= 1; \
2056 if (bufp->allocated > MAX_BUF_SIZE) \
2057 bufp->allocated = MAX_BUF_SIZE; \
2058 /* How many characters the new buffer can have? */ \
2059 wchar_count = bufp->allocated / sizeof(UCHAR_T); \
2060 if (wchar_count == 0) wchar_count = 1; \
2061 /* Truncate the buffer to CHAR_T align. */ \
2062 bufp->allocated = wchar_count * sizeof(UCHAR_T); \
2063 RETALLOC (COMPILED_BUFFER_VAR, wchar_count, UCHAR_T); \
2064 bufp->buffer = (char*)COMPILED_BUFFER_VAR; \
2065 if (COMPILED_BUFFER_VAR == NULL) \
2066 return REG_ESPACE; \
2067 /* If the buffer moved, move all the pointers into it. */ \
2068 if (old_buffer != COMPILED_BUFFER_VAR) \
2069 { \
2070 PTR_INT_TYPE incr = COMPILED_BUFFER_VAR - old_buffer; \
2071 MOVE_BUFFER_POINTER (b); \
2072 MOVE_BUFFER_POINTER (begalt); \
2073 if (fixup_alt_jump) \
2074 MOVE_BUFFER_POINTER (fixup_alt_jump); \
2075 if (laststart) \
2076 MOVE_BUFFER_POINTER (laststart); \
2077 if (pending_exact) \
2078 MOVE_BUFFER_POINTER (pending_exact); \
2079 } \
2080 ELSE_EXTEND_BUFFER_HIGH_BOUND \
2081 } while (0)
2082 # else /* BYTE */
2083 # define EXTEND_BUFFER() \
2084 do { \
2085 UCHAR_T *old_buffer = COMPILED_BUFFER_VAR; \
2086 if (bufp->allocated == MAX_BUF_SIZE) \
2087 return REG_ESIZE; \
2088 bufp->allocated <<= 1; \
2089 if (bufp->allocated > MAX_BUF_SIZE) \
2090 bufp->allocated = MAX_BUF_SIZE; \
2091 bufp->buffer = (UCHAR_T *) REALLOC (COMPILED_BUFFER_VAR, \
2092 bufp->allocated); \
2093 if (COMPILED_BUFFER_VAR == NULL) \
2094 return REG_ESPACE; \
2095 /* If the buffer moved, move all the pointers into it. */ \
2096 if (old_buffer != COMPILED_BUFFER_VAR) \
2097 { \
2098 PTR_INT_TYPE incr = COMPILED_BUFFER_VAR - old_buffer; \
2099 MOVE_BUFFER_POINTER (b); \
2100 MOVE_BUFFER_POINTER (begalt); \
2101 if (fixup_alt_jump) \
2102 MOVE_BUFFER_POINTER (fixup_alt_jump); \
2103 if (laststart) \
2104 MOVE_BUFFER_POINTER (laststart); \
2105 if (pending_exact) \
2106 MOVE_BUFFER_POINTER (pending_exact); \
2107 } \
2108 ELSE_EXTEND_BUFFER_HIGH_BOUND \
2109 } while (0)
2110 # endif /* WCHAR */
2111
2112 # ifndef DEFINED_ONCE
2113 /* Since we have one byte reserved for the register number argument to
2114 {start,stop}_memory, the maximum number of groups we can report
2115 things about is what fits in that byte. */
2116 # define MAX_REGNUM 255
2117
2118 /* But patterns can have more than `MAX_REGNUM' registers. We just
2119 ignore the excess. */
2120 typedef unsigned regnum_t;
2121
2122
2123 /* Macros for the compile stack. */
2124
2125 /* Since offsets can go either forwards or backwards, this type needs to
2126 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
2127 /* int may not be enough when sizeof(int) == 2. */
2128 typedef long pattern_offset_t;
2129
/* One entry of the compile stack.  Four offsets of type pattern_offset_t
   plus a register number.  NOTE(review): the field names suggest these
   are the saved `begalt', `fixup_alt_jump' and `laststart' positions of
   the enclosing context, stored as offsets rather than pointers -- the
   code that fills them in is outside this excerpt, so confirm there.  */
2130 typedef struct
2131 {
2132 pattern_offset_t begalt_offset;
2133 pattern_offset_t fixup_alt_jump;
2134 pattern_offset_t inner_group_offset;
2135 pattern_offset_t laststart_offset;
2136 regnum_t regnum;
2137 } compile_stack_elt_t;
2138
2139
/* The compile stack itself: an array of entries, its capacity (`size')
   and the index of the next free slot (`avail').  */
2140 typedef struct
2141 {
2142 compile_stack_elt_t *stack;
2143 unsigned size;
2144 unsigned avail; /* Offset of next open position. */
2145 } compile_stack_type;
2146
2147
/* Number of entries the compile stack is first allocated with.  */
2148 # define INIT_COMPILE_STACK_SIZE 32
2149
/* Emptiness and fullness tests; both expect a variable named
   `compile_stack' to be in scope.  */
2150 # define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
2151 # define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
2152
2153 /* The next available element, i.e. the slot at index `avail' (not the
   most recently pushed entry). */
2154 # define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
2155
2156 # endif /* not DEFINED_ONCE */
2157
2158 /* Set the bit for character C in a list.
   C is converted to unsigned char; bit (C % BYTEWIDTH) of the element
   b[C / BYTEWIDTH] is then set.  `b' must be in scope at the expansion
   site and point at the start of the bit list.
   C is evaluated twice, so it must be free of side effects.  Both uses
   are fully parenthesized so that an expression argument is cast as a
   whole, giving the same character for the index and for the bit mask.  */
2159 # ifndef DEFINED_ONCE
2160 # define SET_LIST_BIT(c) \
2161 (b[((unsigned char) (c)) / BYTEWIDTH] \
2162 |= 1 << (((unsigned char) (c)) % BYTEWIDTH))
2163 # endif /* not DEFINED_ONCE */
2164
2165 /* Get the next unsigned number in the uncompiled pattern.
   Characters are fetched with PATFETCH into `c' until a non-digit is
   read or `p' reaches `pend'; when the loop stops on a non-digit, that
   character is left in `c'.
   A negative NUM is reset to 0 before the first digit is accumulated, so
   NUM is left untouched when no digit is found.  Once NUM exceeds
   RE_DUP_MAX the remaining digits are still fetched but no longer
   accumulated (presumably to keep NUM from overflowing).
   NUM must be a modifiable lvalue; `p', `pend' and `c' must be in scope.
   The expansion is a bare brace block, not a do { } while (0).  */
2166 # define GET_UNSIGNED_NUMBER(num) \
2167 { \
2168 while (p != pend) \
2169 { \
2170 PATFETCH (c); \
2171 if (c < '0' || c > '9') \
2172 break; \
2173 if (num <= RE_DUP_MAX) \
2174 { \
2175 if (num < 0) \
2176 num = 0; \
2177 num = num * 10 + c - '0'; \
2178 } \
2179 } \
2180 }
2181
/* CHAR_CLASS_MAX_LENGTH is the longest character class name accepted;
   IS_CHAR_CLASS (string) is nonzero when STRING names a known class.
   With _LIBC or WIDE_CHAR_SUPPORT the lookup is delegated to wctype (or
   __wctype); otherwise the twelve class names listed below are compared
   one by one with STREQ.  */
2182 # ifndef DEFINED_ONCE
2183 # if defined _LIBC || WIDE_CHAR_SUPPORT
2184 /* The GNU C library provides support for user-defined character classes
2185 and the functions from ISO C amendment 1. */
2186 # ifdef CHARCLASS_NAME_MAX
2187 # define CHAR_CLASS_MAX_LENGTH CHARCLASS_NAME_MAX
2188 # else
2189 /* This shouldn't happen but some implementations might still have this
2190 problem. Use a reasonable default value. */
2191 # define CHAR_CLASS_MAX_LENGTH 256
2192 # endif
2193
2194 # ifdef _LIBC
2195 # define IS_CHAR_CLASS(string) __wctype (string)
2196 # else
2197 # define IS_CHAR_CLASS(string) wctype (string)
2198 # endif
2199 # else
2200 # define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */
2201
2202 # define IS_CHAR_CLASS(string) \
2203 (STREQ (string, "alpha") || STREQ (string, "upper") \
2204 || STREQ (string, "lower") || STREQ (string, "digit") \
2205 || STREQ (string, "alnum") || STREQ (string, "xdigit") \
2206 || STREQ (string, "space") || STREQ (string, "print") \
2207 || STREQ (string, "punct") || STREQ (string, "graph") \
2208 || STREQ (string, "cntrl") || STREQ (string, "blank"))
2209 # endif
2210 # endif /* not DEFINED_ONCE */
2211
2212 # ifndef MATCH_MAY_ALLOCATE
2214
2215 /* If we cannot allocate large objects within re_match_2_internal,
2216 we make the fail stack and register vectors global.
2217 The fail stack, we grow to the maximum size when a regexp
2218 is compiled.
2219 The register vectors, we adjust in size each time we
2220 compile a regexp, according to the number of registers it needs.
   Everything below is a file-scope static and is compiled only when
   MATCH_MAY_ALLOCATE is not defined. */
2221
2222 static PREFIX(fail_stack_type) fail_stack;
2223
2224 /* Size with which the following vectors are currently allocated.
2225 That is so we can make them bigger as needed,
2226 but never make them smaller.
   The size and the `const char **' vectors are declared only when
   DEFINED_ONCE is set.  NOTE(review): presumably so that a single copy
   is shared when this file is processed more than once -- confirm. */
2227 # ifdef DEFINED_ONCE
2228 static int regs_allocated_size;
2229
2230 static const char ** regstart, ** regend;
2231 static const char ** old_regstart, ** old_regend;
2232 static const char **best_regstart, **best_regend;
2233 static const char **reg_dummy;
2234 # endif /* DEFINED_ONCE */
2235
/* Register-info vectors; their type and name both go through PREFIX.
   They are resized together with the vectors above by
   PREFIX(regex_grow_registers).  */
2236 static PREFIX(register_info_type) *PREFIX(reg_info);
2237 static PREFIX(register_info_type) *PREFIX(reg_info_dummy);
2238
2239 /* Grow every register vector so that it can hold NUM_REGS entries.
2240 A request that would not enlarge the vectors is ignored. */
2241
2242 static void
2243 PREFIX(regex_grow_registers) (int num_regs)
2244 {
2245 if (num_regs <= regs_allocated_size)
2246 return; /* Already big enough; the vectors never shrink. */
2247
2248 RETALLOC_IF (regstart, num_regs, const char *);
2249 RETALLOC_IF (regend, num_regs, const char *);
2250 RETALLOC_IF (old_regstart, num_regs, const char *);
2251 RETALLOC_IF (old_regend, num_regs, const char *);
2252 RETALLOC_IF (best_regstart, num_regs, const char *);
2253 RETALLOC_IF (best_regend, num_regs, const char *);
2254 RETALLOC_IF (PREFIX(reg_info), num_regs, PREFIX(register_info_type));
2255 RETALLOC_IF (reg_dummy, num_regs, const char *);
2256 RETALLOC_IF (PREFIX(reg_info_dummy), num_regs, PREFIX(register_info_type));
2257
2258 regs_allocated_size = num_regs; /* Record the new common capacity. */
2259 }
2260
2261 # endif /* not MATCH_MAY_ALLOCATE */
2262
/* Forward declaration; the definition lies outside this part of the file.
   COMPILE_STACK is passed by value (the struct itself, not a pointer).
   NOTE(review): judging by the name and signature, this presumably
   reports whether group REGNUM is currently on the compile stack --
   confirm against the definition.  */
2263 # ifndef DEFINED_ONCE
2265 static boolean group_in_compile_stack (compile_stack_type compile_stack,
2266 regnum_t regnum);
2267 # endif /* not DEFINED_ONCE */
2268
2269 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
2270 Returns one of error codes defined in `regex.h', or zero for success.
2271
2272 Assumes the `allocated' (and perhaps `buffer') and `translate'
2273 fields are set in BUFP on entry.
2274
2275 If it succeeds, results are put in BUFP (if it returns an error, the
2276 contents of BUFP are undefined):
2277 `buffer' is the compiled pattern;
2278 `syntax' is set to SYNTAX;
2279 `used' is set to the length of the compiled pattern;
2280 `fastmap_accurate' is zero;
2281 `re_nsub' is the number of subexpressions in PATTERN;
2282 `not_bol' and `not_eol' are zero;
2283
2284 The `fastmap' and `newline_anchor' fields are neither
2285 examined nor set. */
2286
2287 /* Return, freeing storage we allocated.
   Expands to a `return' statement whose operand is a comma expression:
   the free calls run first and VALUE becomes the function's result.
   Both variants free compile_stack.stack; the WCHAR variant first frees
   `pattern', `mbs_offset' and `is_binary' as well.  Those names must be
   in scope at the expansion site.  */
2288 # ifdef WCHAR
2289 # define FREE_STACK_RETURN(value) \
2290 return (free(pattern), free(mbs_offset), free(is_binary), free (compile_stack.stack), value)
2291 # else
2292 # define FREE_STACK_RETURN(value) \
2293 return (free (compile_stack.stack), value)
2294 # endif /* WCHAR */
2295
2296 static reg_errcode_t
2297 PREFIX(regex_compile) (const char *ARG_PREFIX(pattern),
2298 size_t ARG_PREFIX(size), reg_syntax_t syntax,
2299 struct re_pattern_buffer *bufp)
2300 {
2301 /* We fetch characters from PATTERN here. Even though PATTERN is
2302 `char *' (i.e., signed), we declare these variables as unsigned, so
2303 they can be reliably used as array indices. */
2304 register UCHAR_T c, c1;
2305
2306 #ifdef WCHAR
2307 /* A temporary space to keep wchar_t pattern and compiled pattern. */
2308 CHAR_T *pattern, *COMPILED_BUFFER_VAR;
2309 size_t size;
2310 /* offset buffer for optimization. See convert_mbs_to_wc. */
2311 int *mbs_offset = NULL;
2312 /* It hold whether each wchar_t is binary data or not. */
2313 char *is_binary = NULL;
2314 /* A flag whether exactn is handling binary data or not. */
2315 char is_exactn_bin = FALSE;
2316 #endif /* WCHAR */
2317
2318 /* A random temporary spot in PATTERN. */
2319 const CHAR_T *p1;
2320
2321 /* Points to the end of the buffer, where we should append. */
2322 register UCHAR_T *b;
2323
2324 /* Keeps track of unclosed groups. */
2325 compile_stack_type compile_stack;
2326
2327 /* Points to the current (ending) position in the pattern. */
2328 #ifdef WCHAR
2329 const CHAR_T *p;
2330 const CHAR_T *pend;
2331 #else /* BYTE */
2332 const CHAR_T *p = pattern;
2333 const CHAR_T *pend = pattern + size;
2334 #endif /* WCHAR */
2335
2336 /* How to translate the characters in the pattern. */
2337 RE_TRANSLATE_TYPE translate = bufp->translate;
2338
2339 /* Address of the count-byte of the most recently inserted `exactn'
2340 command. This makes it possible to tell if a new exact-match
2341 character can be added to that command or if the character requires
2342 a new `exactn' command. */
2343 UCHAR_T *pending_exact = 0;
2344
2345 /* Address of start of the most recently finished expression.
2346 This tells, e.g., postfix * where to find the start of its
2347 operand. Reset at the beginning of groups and alternatives. */
2348 UCHAR_T *laststart = 0;
2349
2350 /* Address of beginning of regexp, or inside of last group. */
2351 UCHAR_T *begalt;
2352
2353 /* Address of the place where a forward jump should go to the end of
2354 the containing expression. Each alternative of an `or' -- except the
2355 last -- ends with a forward jump of this sort. */
2356 UCHAR_T *fixup_alt_jump = 0;
2357
2358 /* Counts open-groups as they are encountered. Remembered for the
2359 matching close-group on the compile stack, so the same register
2360 number is put in the stop_memory as the start_memory. */
2361 regnum_t regnum = 0;
2362
2363 #ifdef WCHAR
2364 /* Initialize the wchar_t PATTERN and offset_buffer. */
2365 p = pend = pattern = TALLOC(csize + 1, CHAR_T);
2366 mbs_offset = TALLOC(csize + 1, int);
2367 is_binary = TALLOC(csize + 1, char);
2368 if (pattern == NULL || mbs_offset == NULL || is_binary == NULL)
2369 {
2370 free(pattern);
2371 free(mbs_offset);
2372 free(is_binary);
2373 return REG_ESPACE;
2374 }
2375 pattern[csize] = L'\0'; /* sentinel */
2376 size = convert_mbs_to_wcs(pattern, cpattern, csize, mbs_offset, is_binary);
2377 pend = p + size;
2378 if (size < 0)
2379 {
2380 free(pattern);
2381 free(mbs_offset);
2382 free(is_binary);
2383 return REG_BADPAT;
2384 }
2385 #endif
2386
2387 #ifdef DEBUG
2388 DEBUG_PRINT1 ("\nCompiling pattern: ");
2389 if (debug)
2390 {
2391 unsigned debug_count;
2392
2393 for (debug_count = 0; debug_count < size; debug_count++)
2394 PUT_CHAR (pattern[debug_count]);
2395 putchar ('\n');
2396 }
2397 #endif /* DEBUG */
2398
2399 /* Initialize the compile stack. */
2400 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
2401 if (compile_stack.stack == NULL)
2402 {
2403 #ifdef WCHAR
2404 free(pattern);
2405 free(mbs_offset);
2406 free(is_binary);
2407 #endif
2408 return REG_ESPACE;
2409 }
2410
2411 compile_stack.size = INIT_COMPILE_STACK_SIZE;
2412 compile_stack.avail = 0;
2413
2414 /* Initialize the pattern buffer. */
2415 bufp->syntax = syntax;
2416 bufp->fastmap_accurate = 0;
2417 bufp->not_bol = bufp->not_eol = 0;
2418
2419 /* Set `used' to zero, so that if we return an error, the pattern
2420 printer (for debugging) will think there's no pattern. We reset it
2421 at the end. */
2422 bufp->used = 0;
2423
2424 /* Always count groups, whether or not bufp->no_sub is set. */
2425 bufp->re_nsub = 0;
2426
2427 #if !defined emacs && !defined SYNTAX_TABLE
2428 /* Initialize the syntax table. */
2429 init_syntax_once ();
2430 #endif
2431
2432 if (bufp->allocated == 0)
2433 {
2434 if (bufp->buffer)
2435 { /* If zero allocated, but buffer is non-null, try to realloc
2436 enough space. This loses if buffer's address is bogus, but
2437 that is the user's responsibility. */
2438 #ifdef WCHAR
2439 /* Free bufp->buffer and allocate an array for wchar_t pattern
2440 buffer. */
2441 free(bufp->buffer);
2442 COMPILED_BUFFER_VAR = TALLOC (INIT_BUF_SIZE/sizeof(UCHAR_T),
2443 UCHAR_T);
2444 #else
2445 RETALLOC (COMPILED_BUFFER_VAR, INIT_BUF_SIZE, UCHAR_T);
2446 #endif /* WCHAR */
2447 }
2448 else
2449 { /* Caller did not allocate a buffer. Do it for them. */
2450 COMPILED_BUFFER_VAR = TALLOC (INIT_BUF_SIZE / sizeof(UCHAR_T),
2451 UCHAR_T);
2452 }
2453
2454 if (!COMPILED_BUFFER_VAR) FREE_STACK_RETURN (REG_ESPACE);
2455 #ifdef WCHAR
2456 bufp->buffer = (char*)COMPILED_BUFFER_VAR;
2457 #endif /* WCHAR */
2458 bufp->allocated = INIT_BUF_SIZE;
2459 }
2460 #ifdef WCHAR
2461 else
2462 COMPILED_BUFFER_VAR = (UCHAR_T*) bufp->buffer;
2463 #endif
2464
2465 begalt = b = COMPILED_BUFFER_VAR;
2466
2467 /* Loop through the uncompiled pattern until we're at the end. */
2468 while (p != pend)
2469 {
2470 PATFETCH (c);
2471
2472 switch (c)
2473 {
2474 case '^':
2475 {
2476 if ( /* If at start of pattern, it's an operator. */
2477 p == pattern + 1
2478 /* If context independent, it's an operator. */
2479 || syntax & RE_CONTEXT_INDEP_ANCHORS
2480 /* Otherwise, depends on what's come before. */
2481 || PREFIX(at_begline_loc_p) (pattern, p, syntax))
2482 BUF_PUSH (begline);
2483 else
2484 goto normal_char;
2485 }
2486 break;
2487
2488
2489 case '$':
2490 {
2491 if ( /* If at end of pattern, it's an operator. */
2492 p == pend
2493 /* If context independent, it's an operator. */
2494 || syntax & RE_CONTEXT_INDEP_ANCHORS
2495 /* Otherwise, depends on what's next. */
2496 || PREFIX(at_endline_loc_p) (p, pend, syntax))
2497 BUF_PUSH (endline);
2498 else
2499 goto normal_char;
2500 }
2501 break;
2502
2503
2504 case '+':
2505 case '?':
2506 if ((syntax & RE_BK_PLUS_QM)
2507 || (syntax & RE_LIMITED_OPS))
2508 goto normal_char;
2509 handle_plus:
2510 case '*':
2511 /* If there is no previous pattern... */
2512 if (!laststart)
2513 {
2514 if (syntax & RE_CONTEXT_INVALID_OPS)
2515 FREE_STACK_RETURN (REG_BADRPT);
2516 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
2517 goto normal_char;
2518 }
2519
2520 {
2521 /* Are we optimizing this jump? */
2522 boolean keep_string_p = false;
2523
2524 /* 1 means zero (many) matches is allowed. */
2525 char zero_times_ok = 0, many_times_ok = 0;
2526
2527 /* If there is a sequence of repetition chars, collapse it
2528 down to just one (the right one). We can't combine
2529 interval operators with these because of, e.g., `a{2}*',
2530 which should only match an even number of `a's. */
2531
2532 for (;;)
2533 {
2534 zero_times_ok |= c != '+';
2535 many_times_ok |= c != '?';
2536
2537 if (p == pend)
2538 break;
2539
2540 PATFETCH (c);
2541
2542 if (c == '*'
2543 || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?')))
2544 ;
2545
2546 else if (syntax & RE_BK_PLUS_QM && c == '\\')
2547 {
2548 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2549
2550 PATFETCH (c1);
2551 if (!(c1 == '+' || c1 == '?'))
2552 {
2553 PATUNFETCH;
2554 PATUNFETCH;
2555 break;
2556 }
2557
2558 c = c1;
2559 }
2560 else
2561 {
2562 PATUNFETCH;
2563 break;
2564 }
2565
2566 /* If we get here, we found another repeat character. */
2567 }
2568
2569 /* Star, etc. applied to an empty pattern is equivalent
2570 to an empty pattern. */
2571 if (!laststart)
2572 break;
2573
2574 /* Now we know whether or not zero matches is allowed
2575 and also whether or not two or more matches is allowed. */
2576 if (many_times_ok)
2577 { /* More than one repetition is allowed, so put in at the
2578 end a backward relative jump from `b' to before the next
2579 jump we're going to put in below (which jumps from
2580 laststart to after this jump).
2581
2582 But if we are at the `*' in the exact sequence `.*\n',
2583 insert an unconditional jump backwards to the .,
2584 instead of the beginning of the loop. This way we only
2585 push a failure point once, instead of every time
2586 through the loop. */
2587 assert (p - 1 > pattern);
2588
2589 /* Allocate the space for the jump. */
2590 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
2591
2592 /* We know we are not at the first character of the pattern,
2593 because laststart was nonzero. And we've already
2594 incremented `p', by the way, to be the character after
2595 the `*'. Do we have to do something analogous here
2596 for null bytes, because of RE_DOT_NOT_NULL? */
2597 if (TRANSLATE (*(p - 2)) == TRANSLATE ('.')
2598 && zero_times_ok
2599 && p < pend && TRANSLATE (*p) == TRANSLATE ('\n')
2600 && !(syntax & RE_DOT_NEWLINE))
2601 { /* We have .*\n. */
2602 STORE_JUMP (jump, b, laststart);
2603 keep_string_p = true;
2604 }
2605 else
2606 /* Anything else. */
2607 STORE_JUMP (maybe_pop_jump, b, laststart -
2608 (1 + OFFSET_ADDRESS_SIZE));
2609
2610 /* We've added more stuff to the buffer. */
2611 b += 1 + OFFSET_ADDRESS_SIZE;
2612 }
2613
2614 /* On failure, jump from laststart to b + 3, which will be the
2615 end of the buffer after this jump is inserted. */
2616 /* ifdef WCHAR, 'b + 1 + OFFSET_ADDRESS_SIZE' instead of
2617 'b + 3'. */
2618 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
2619 INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump
2620 : on_failure_jump,
2621 laststart, b + 1 + OFFSET_ADDRESS_SIZE);
2622 pending_exact = 0;
2623 b += 1 + OFFSET_ADDRESS_SIZE;
2624
2625 if (!zero_times_ok)
2626 {
2627 /* At least one repetition is required, so insert a
2628 `dummy_failure_jump' before the initial
2629 `on_failure_jump' instruction of the loop. This
2630 effects a skip over that instruction the first time
2631 we hit that loop. */
2632 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
2633 INSERT_JUMP (dummy_failure_jump, laststart, laststart +
2634 2 + 2 * OFFSET_ADDRESS_SIZE);
2635 b += 1 + OFFSET_ADDRESS_SIZE;
2636 }
2637 }
2638 break;
2639
2640
2641 case '.':
2642 laststart = b;
2643 BUF_PUSH (anychar);
2644 break;
2645
2646
2647 case '[':
2648 {
2649 boolean had_char_class = false;
2650 #ifdef WCHAR
2651 CHAR_T range_start = 0xffffffff;
2652 #else
2653 unsigned int range_start = 0xffffffff;
2654 #endif
2655 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2656
2657 #ifdef WCHAR
2658 /* We assume a charset(_not) structure as a wchar_t array.
2659 charset[0] = (re_opcode_t) charset(_not)
2660 charset[1] = l (= length of char_classes)
2661 charset[2] = m (= length of collating_symbols)
2662 charset[3] = n (= length of equivalence_classes)
2663 charset[4] = o (= length of char_ranges)
2664 charset[5] = p (= length of chars)
2665
2666 charset[6] = char_class (wctype_t)
2667 charset[6+CHAR_CLASS_SIZE] = char_class (wctype_t)
2668 ...
2669 charset[l+5] = char_class (wctype_t)
2670
2671 charset[l+6] = collating_symbol (wchar_t)
2672 ...
2673 charset[l+m+5] = collating_symbol (wchar_t)
2674 ifdef _LIBC we use the index if
2675 _NL_COLLATE_SYMB_EXTRAMB instead of
2676 wchar_t string.
2677
2678 charset[l+m+6] = equivalence_classes (wchar_t)
2679 ...
2680 charset[l+m+n+5] = equivalence_classes (wchar_t)
2681 ifdef _LIBC we use the index in
2682 _NL_COLLATE_WEIGHT instead of
2683 wchar_t string.
2684
2685 charset[l+m+n+6] = range_start
2686 charset[l+m+n+7] = range_end
2687 ...
2688 charset[l+m+n+2o+4] = range_start
2689 charset[l+m+n+2o+5] = range_end
2690 ifdef _LIBC we use the value looked up
2691 in _NL_COLLATE_COLLSEQ instead of
2692 wchar_t character.
2693
2694 charset[l+m+n+2o+6] = char
2695 ...
2696 charset[l+m+n+2o+p+5] = char
2697
2698 */
2699
2700 /* We need at least 6 spaces: the opcode, the length of
2701 char_classes, the length of collating_symbols, the length of
2702 equivalence_classes, the length of char_ranges, the length of
2703 chars. */
2704 GET_BUFFER_SPACE (6);
2705
2706 /* Save b as laststart. And We use laststart as the pointer
2707 to the first element of the charset here.
2708 In other words, laststart[i] indicates charset[i]. */
2709 laststart = b;
2710
2711 /* We test `*p == '^' twice, instead of using an if
2712 statement, so we only need one BUF_PUSH. */
2713 BUF_PUSH (*p == '^' ? charset_not : charset);
2714 if (*p == '^')
2715 p++;
2716
2717 /* Push the length of char_classes, the length of
2718 collating_symbols, the length of equivalence_classes, the
2719 length of char_ranges and the length of chars. */
2720 BUF_PUSH_3 (0, 0, 0);
2721 BUF_PUSH_2 (0, 0);
2722
2723 /* Remember the first position in the bracket expression. */
2724 p1 = p;
2725
2726 /* charset_not matches newline according to a syntax bit. */
2727 if ((re_opcode_t) b[-6] == charset_not
2728 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2729 {
2730 BUF_PUSH('\n');
2731 laststart[5]++; /* Update the length of characters */
2732 }
2733
2734 /* Read in characters and ranges, setting map bits. */
2735 for (;;)
2736 {
2737 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2738
2739 PATFETCH (c);
2740
2741 /* \ might escape characters inside [...] and [^...]. */
2742 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2743 {
2744 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2745
2746 PATFETCH (c1);
2747 BUF_PUSH(c1);
2748 laststart[5]++; /* Update the length of chars */
2749 range_start = c1;
2750 continue;
2751 }
2752
2753 /* Could be the end of the bracket expression. If it's
2754 not (i.e., when the bracket expression is `[]' so
2755 far), the ']' character bit gets set way below. */
2756 if (c == ']' && p != p1 + 1)
2757 break;
2758
2759 /* Look ahead to see if it's a range when the last thing
2760 was a character class. */
2761 if (had_char_class && c == '-' && *p != ']')
2762 FREE_STACK_RETURN (REG_ERANGE);
2763
2764 /* Look ahead to see if it's a range when the last thing
2765 was a character: if this is a hyphen not at the
2766 beginning or the end of a list, then it's the range
2767 operator. */
2768 if (c == '-'
2769 && !(p - 2 >= pattern && p[-2] == '[')
2770 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
2771 && *p != ']')
2772 {
2773 reg_errcode_t ret;
2774 /* Allocate the space for range_start and range_end. */
2775 GET_BUFFER_SPACE (2);
2776 /* Update the pointer to indicate end of buffer. */
2777 b += 2;
2778 ret = wcs_compile_range (range_start, &p, pend, translate,
2779 syntax, b, laststart);
2780 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
2781 range_start = 0xffffffff;
2782 }
2783 else if (p[0] == '-' && p[1] != ']')
2784 { /* This handles ranges made up of characters only. */
2785 reg_errcode_t ret;
2786
2787 /* Move past the `-'. */
2788 PATFETCH (c1);
2789 /* Allocate the space for range_start and range_end. */
2790 GET_BUFFER_SPACE (2);
2791 /* Update the pointer to indicate end of buffer. */
2792 b += 2;
2793 ret = wcs_compile_range (c, &p, pend, translate, syntax, b,
2794 laststart);
2795 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
2796 range_start = 0xffffffff;
2797 }
2798
2799 /* See if we're at the beginning of a possible character
2800 class. */
2801 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
2802 { /* Leave room for the null. */
2803 char str[CHAR_CLASS_MAX_LENGTH + 1];
2804
2805 PATFETCH (c);
2806 c1 = 0;
2807
2808 /* If pattern is `[[:'. */
2809 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2810
2811 for (;;)
2812 {
2813 PATFETCH (c);
2814 if ((c == ':' && *p == ']') || p == pend)
2815 break;
2816 if (c1 < CHAR_CLASS_MAX_LENGTH)
2817 str[c1++] = c;
2818 else
2819 /* This is in any case an invalid class name. */
2820 str[0] = '\0';
2821 }
2822 str[c1] = '\0';
2823
2824 /* If isn't a word bracketed by `[:' and `:]':
2825 undo the ending character, the letters, and leave
2826 the leading `:' and `[' (but store them as character). */
2827 if (c == ':' && *p == ']')
2828 {
2829 wctype_t wt;
2830 uintptr_t alignedp;
2831
2832 /* Query the character class as wctype_t. */
2833 wt = IS_CHAR_CLASS (str);
2834 if (wt == 0)
2835 FREE_STACK_RETURN (REG_ECTYPE);
2836
2837 /* Throw away the ] at the end of the character
2838 class. */
2839 PATFETCH (c);
2840
2841 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2842
2843 /* Allocate the space for character class. */
2844 GET_BUFFER_SPACE(CHAR_CLASS_SIZE);
2845 /* Update the pointer to indicate end of buffer. */
2846 b += CHAR_CLASS_SIZE;
2847 /* Move data which follow character classes
2848 not to violate the data. */
2849 insert_space(CHAR_CLASS_SIZE,
2850 laststart + 6 + laststart[1],
2851 b - 1);
2852 alignedp = ((uintptr_t)(laststart + 6 + laststart[1])
2853 + __alignof__(wctype_t) - 1)
2854 & ~(uintptr_t)(__alignof__(wctype_t) - 1);
2855 /* Store the character class. */
2856 *((wctype_t*)alignedp) = wt;
2857 /* Update length of char_classes */
2858 laststart[1] += CHAR_CLASS_SIZE;
2859
2860 had_char_class = true;
2861 }
2862 else
2863 {
2864 c1++;
2865 while (c1--)
2866 PATUNFETCH;
2867 BUF_PUSH ('[');
2868 BUF_PUSH (':');
2869 laststart[5] += 2; /* Update the length of characters */
2870 range_start = ':';
2871 had_char_class = false;
2872 }
2873 }
2874 else if (syntax & RE_CHAR_CLASSES && c == '[' && (*p == '='
2875 || *p == '.'))
2876 {
2877 CHAR_T str[128]; /* Should be large enough. */
2878 CHAR_T delim = *p; /* '=' or '.' */
2879 # ifdef _LIBC
2880 uint32_t nrules =
2881 _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2882 # endif
2883 PATFETCH (c);
2884 c1 = 0;
2885
2886 /* If pattern is `[[=' or '[[.'. */
2887 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2888
2889 for (;;)
2890 {
2891 PATFETCH (c);
2892 if ((c == delim && *p == ']') || p == pend)
2893 break;
2894 if (c1 < sizeof (str) - 1)
2895 str[c1++] = c;
2896 else
2897 /* This is in any case an invalid class name. */
2898 str[0] = '\0';
2899 }
2900 str[c1] = '\0';
2901
2902 if (c == delim && *p == ']' && str[0] != '\0')
2903 {
2904 unsigned int i, offset;
2905 /* If we have no collation data we use the default
2906 collation in which each character is in a class
2907 by itself. It also means that ASCII is the
2908 character set and therefore we cannot have character
2909 with more than one byte in the multibyte
2910 representation. */
2911
2912 /* If not defined _LIBC, we push the name and
2913 `\0' for the sake of matching performance. */
2914 int datasize = c1 + 1;
2915
2916 # ifdef _LIBC
2917 int32_t idx = 0;
2918 if (nrules == 0)
2919 # endif
2920 {
2921 if (c1 != 1)
2922 FREE_STACK_RETURN (REG_ECOLLATE);
2923 }
2924 # ifdef _LIBC
2925 else
2926 {
2927 const int32_t *table;
2928 const int32_t *weights;
2929 const int32_t *extra;
2930 const int32_t *indirect;
2931 wint_t *cp;
2932
2933 /* This #include defines a local function! */
2934 # include <locale/weightwc.h>
2935
2936 if(delim == '=')
2937 {
2938 /* We push the index for equivalence class. */
2939 cp = (wint_t*)str;
2940
2941 table = (const int32_t *)
2942 _NL_CURRENT (LC_COLLATE,
2943 _NL_COLLATE_TABLEWC);
2944 weights = (const int32_t *)
2945 _NL_CURRENT (LC_COLLATE,
2946 _NL_COLLATE_WEIGHTWC);
2947 extra = (const int32_t *)
2948 _NL_CURRENT (LC_COLLATE,
2949 _NL_COLLATE_EXTRAWC);
2950 indirect = (const int32_t *)
2951 _NL_CURRENT (LC_COLLATE,
2952 _NL_COLLATE_INDIRECTWC);
2953
2954 idx = findidx ((const wint_t**)&cp);
2955 if (idx == 0 || cp < (wint_t*) str + c1)
2956 /* This is no valid character. */
2957 FREE_STACK_RETURN (REG_ECOLLATE);
2958
2959 str[0] = (wchar_t)idx;
2960 }
2961 else /* delim == '.' */
2962 {
2963 /* We push collation sequence value
2964 for collating symbol. */
2965 int32_t table_size;
2966 const int32_t *symb_table;
2967 const unsigned char *extra;
2968 int32_t idx;
2969 int32_t elem;
2970 int32_t second;
2971 int32_t hash;
2972 char char_str[c1];
2973
2974 /* We have to convert the name to a single-byte
2975 string. This is possible since the names
2976 consist of ASCII characters and the internal
2977 representation is UCS4. */
2978 for (i = 0; i < c1; ++i)
2979 char_str[i] = str[i];
2980
2981 table_size =
2982 _NL_CURRENT_WORD (LC_COLLATE,
2983 _NL_COLLATE_SYMB_HASH_SIZEMB);
2984 symb_table = (const int32_t *)
2985 _NL_CURRENT (LC_COLLATE,
2986 _NL_COLLATE_SYMB_TABLEMB);
2987 extra = (const unsigned char *)
2988 _NL_CURRENT (LC_COLLATE,
2989 _NL_COLLATE_SYMB_EXTRAMB);
2990
2991 /* Locate the character in the hashing table. */
2992 hash = elem_hash (char_str, c1);
2993
2994 idx = 0;
2995 elem = hash % table_size;
2996 second = hash % (table_size - 2);
2997 while (symb_table[2 * elem] != 0)
2998 {
2999 /* First compare the hashing value. */
3000 if (symb_table[2 * elem] == hash
3001 && c1 == extra[symb_table[2 * elem + 1]]
3002 && memcmp (char_str,
3003 &extra[symb_table[2 * elem + 1]
3004 + 1], c1) == 0)
3005 {
3006 /* Yep, this is the entry. */
3007 idx = symb_table[2 * elem + 1];
3008 idx += 1 + extra[idx];
3009 break;
3010 }
3011
3012 /* Next entry. */
3013 elem += second;
3014 }
3015
3016 if (symb_table[2 * elem] != 0)
3017 {
3018 /* Compute the index of the byte sequence
3019 in the table. */
3020 idx += 1 + extra[idx];
3021 /* Adjust for the alignment. */
3022 idx = (idx + 3) & ~3;
3023
3024 str[0] = (wchar_t) idx + 4;
3025 }
3026 else if (symb_table[2 * elem] == 0 && c1 == 1)
3027 {
3028 /* No valid character. Match it as a
3029 single byte character. */
3030 had_char_class = false;
3031 BUF_PUSH(str[0]);
3032 /* Update the length of characters */
3033 laststart[5]++;
3034 range_start = str[0];
3035
3036 /* Throw away the ] at the end of the
3037 collating symbol. */
3038 PATFETCH (c);
3039 /* exit from the switch block. */
3040 continue;
3041 }
3042 else
3043 FREE_STACK_RETURN (REG_ECOLLATE);
3044 }
3045 datasize = 1;
3046 }
3047 # endif
3048 /* Throw away the ] at the end of the equivalence
3049 class (or collating symbol). */
3050 PATFETCH (c);
3051
3052 /* Allocate the space for the equivalence class
3053 (or collating symbol) (and '\0' if needed). */
3054 GET_BUFFER_SPACE(datasize);
3055 /* Update the pointer to indicate end of buffer. */
3056 b += datasize;
3057
3058 if (delim == '=')
3059 { /* equivalence class */
3060 /* Calculate the offset of char_ranges,
3061 which is next to equivalence_classes. */
3062 offset = laststart[1] + laststart[2]
3063 + laststart[3] +6;
3064 /* Insert space. */
3065 insert_space(datasize, laststart + offset, b - 1);
3066
3067 /* Write the equivalence_class and \0. */
3068 for (i = 0 ; i < datasize ; i++)
3069 laststart[offset + i] = str[i];
3070
3071 /* Update the length of equivalence_classes. */
3072 laststart[3] += datasize;
3073 had_char_class = true;
3074 }
3075 else /* delim == '.' */
3076 { /* collating symbol */
3077 /* Calculate the offset of the equivalence_classes,
3078 which is next to collating_symbols. */
3079 offset = laststart[1] + laststart[2] + 6;
3080 /* Insert space and write the collating_symbol
3081 and \0. */
3082 insert_space(datasize, laststart + offset, b-1);
3083 for (i = 0 ; i < datasize ; i++)
3084 laststart[offset + i] = str[i];
3085
3086 /* In re_match_2_internal if range_start < -1, we
3087 assume -range_start is the offset of the
3088 collating symbol which is specified as
3089 the character of the range start. So we assign
3090 -(laststart[1] + laststart[2] + 6) to
3091 range_start. */
3092 range_start = -(laststart[1] + laststart[2] + 6);
3093 /* Update the length of collating_symbol. */
3094 laststart[2] += datasize;
3095 had_char_class = false;
3096 }
3097 }
3098 else
3099 {
3100 c1++;
3101 while (c1--)
3102 PATUNFETCH;
3103 BUF_PUSH ('[');
3104 BUF_PUSH (delim);
3105 laststart[5] += 2; /* Update the length of characters */
3106 range_start = delim;
3107 had_char_class = false;
3108 }
3109 }
3110 else
3111 {
3112 had_char_class = false;
3113 BUF_PUSH(c);
3114 laststart[5]++; /* Update the length of characters */
3115 range_start = c;
3116 }
3117 }
3118
3119 #else /* BYTE */
3120 /* Ensure that we have enough space to push a charset: the
3121 opcode, the length count, and the bitset; 34 bytes in all. */
3122 GET_BUFFER_SPACE (34);
3123
3124 laststart = b;
3125
3126 /* We test `*p == '^' twice, instead of using an if
3127 statement, so we only need one BUF_PUSH. */
3128 BUF_PUSH (*p == '^' ? charset_not : charset);
3129 if (*p == '^')
3130 p++;
3131
3132 /* Remember the first position in the bracket expression. */
3133 p1 = p;
3134
3135 /* Push the number of bytes in the bitmap. */
3136 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
3137
3138 /* Clear the whole map. */
3139 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH);
3140
3141 /* charset_not matches newline according to a syntax bit. */
3142 if ((re_opcode_t) b[-2] == charset_not
3143 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
3144 SET_LIST_BIT ('\n');
3145
3146 /* Read in characters and ranges, setting map bits. */
3147 for (;;)
3148 {
3149 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3150
3151 PATFETCH (c);
3152
3153 /* \ might escape characters inside [...] and [^...]. */
3154 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
3155 {
3156 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
3157
3158 PATFETCH (c1);
3159 SET_LIST_BIT (c1);
3160 range_start = c1;
3161 continue;
3162 }
3163
3164 /* Could be the end of the bracket expression. If it's
3165 not (i.e., when the bracket expression is `[]' so
3166 far), the ']' character bit gets set way below. */
3167 if (c == ']' && p != p1 + 1)
3168 break;
3169
3170 /* Look ahead to see if it's a range when the last thing
3171 was a character class. */
3172 if (had_char_class && c == '-' && *p != ']')
3173 FREE_STACK_RETURN (REG_ERANGE);
3174
3175 /* Look ahead to see if it's a range when the last thing
3176 was a character: if this is a hyphen not at the
3177 beginning or the end of a list, then it's the range
3178 operator. */
3179 if (c == '-'
3180 && !(p - 2 >= pattern && p[-2] == '[')
3181 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
3182 && *p != ']')
3183 {
3184 reg_errcode_t ret
3185 = byte_compile_range (range_start, &p, pend, translate,
3186 syntax, b);
3187 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
3188 range_start = 0xffffffff;
3189 }
3190
3191 else if (p[0] == '-' && p[1] != ']')
3192 { /* This handles ranges made up of characters only. */
3193 reg_errcode_t ret;
3194
3195 /* Move past the `-'. */
3196 PATFETCH (c1);
3197
3198 ret = byte_compile_range (c, &p, pend, translate, syntax, b);
3199 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
3200 range_start = 0xffffffff;
3201 }
3202
3203 /* See if we're at the beginning of a possible character
3204 class. */
3205
3206 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
3207 { /* Leave room for the null. */
3208 char str[CHAR_CLASS_MAX_LENGTH + 1];
3209
3210 PATFETCH (c);
3211 c1 = 0;
3212
3213 /* If pattern is `[[:'. */
3214 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3215
3216 for (;;)
3217 {
3218 PATFETCH (c);
3219 if ((c == ':' && *p == ']') || p == pend)
3220 break;
3221 if (c1 < CHAR_CLASS_MAX_LENGTH)
3222 str[c1++] = c;
3223 else
3224 /* This is in any case an invalid class name. */
3225 str[0] = '\0';
3226 }
3227 str[c1] = '\0';
3228
3229 /* If isn't a word bracketed by `[:' and `:]':
3230 undo the ending character, the letters, and leave
3231 the leading `:' and `[' (but set bits for them). */
3232 if (c == ':' && *p == ']')
3233 {
3234 # if defined _LIBC || WIDE_CHAR_SUPPORT
3235 boolean is_lower = STREQ (str, "lower");
3236 boolean is_upper = STREQ (str, "upper");
3237 wctype_t wt;
3238 int ch;
3239
3240 wt = IS_CHAR_CLASS (str);
3241 if (wt == 0)
3242 FREE_STACK_RETURN (REG_ECTYPE);
3243
3244 /* Throw away the ] at the end of the character
3245 class. */
3246 PATFETCH (c);
3247
3248 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3249
3250 for (ch = 0; ch < 1 << BYTEWIDTH; ++ch)
3251 {
3252 # ifdef _LIBC
3253 if (__iswctype (__btowc (ch), wt))
3254 SET_LIST_BIT (ch);
3255 # else
3256 if (iswctype (btowc (ch), wt))
3257 SET_LIST_BIT (ch);
3258 # endif
3259
3260 if (translate && (is_upper || is_lower)
3261 && (ISUPPER (ch) || ISLOWER (ch)))
3262 SET_LIST_BIT (ch);
3263 }
3264
3265 had_char_class = true;
3266 # else
3267 int ch;
3268 boolean is_alnum = STREQ (str, "alnum");
3269 boolean is_alpha = STREQ (str, "alpha");
3270 boolean is_blank = STREQ (str, "blank");
3271 boolean is_cntrl = STREQ (str, "cntrl");
3272 boolean is_digit = STREQ (str, "digit");
3273 boolean is_graph = STREQ (str, "graph");
3274 boolean is_lower = STREQ (str, "lower");
3275 boolean is_print = STREQ (str, "print");
3276 boolean is_punct = STREQ (str, "punct");
3277 boolean is_space = STREQ (str, "space");
3278 boolean is_upper = STREQ (str, "upper");
3279 boolean is_xdigit = STREQ (str, "xdigit");
3280
3281 if (!IS_CHAR_CLASS (str))
3282 FREE_STACK_RETURN (REG_ECTYPE);
3283
3284 /* Throw away the ] at the end of the character
3285 class. */
3286 PATFETCH (c);
3287
3288 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3289
3290 for (ch = 0; ch < 1 << BYTEWIDTH; ch++)
3291 {
3292 /* This was split into 3 if's to
3293 avoid an arbitrary limit in some compiler. */
3294 if ( (is_alnum && ISALNUM (ch))
3295 || (is_alpha && ISALPHA (ch))
3296 || (is_blank && ISBLANK (ch))
3297 || (is_cntrl && ISCNTRL (ch)))
3298 SET_LIST_BIT (ch);
3299 if ( (is_digit && ISDIGIT (ch))
3300 || (is_graph && ISGRAPH (ch))
3301 || (is_lower && ISLOWER (ch))
3302 || (is_print && ISPRINT (ch)))
3303 SET_LIST_BIT (ch);
3304 if ( (is_punct && ISPUNCT (ch))
3305 || (is_space && ISSPACE (ch))
3306 || (is_upper && ISUPPER (ch))
3307 || (is_xdigit && ISXDIGIT (ch)))
3308 SET_LIST_BIT (ch);
3309 if ( translate && (is_upper || is_lower)
3310 && (ISUPPER (ch) || ISLOWER (ch)))
3311 SET_LIST_BIT (ch);
3312 }
3313 had_char_class = true;
3314 # endif /* libc || wctype.h */
3315 }
3316 else
3317 {
3318 c1++;
3319 while (c1--)
3320 PATUNFETCH;
3321 SET_LIST_BIT ('[');
3322 SET_LIST_BIT (':');
3323 range_start = ':';
3324 had_char_class = false;
3325 }
3326 }
3327 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == '=')
3328 {
3329 unsigned char str[MB_LEN_MAX + 1];
3330 # ifdef _LIBC
3331 uint32_t nrules =
3332 _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3333 # endif
3334
3335 PATFETCH (c);
3336 c1 = 0;
3337
3338 /* If pattern is `[[='. */
3339 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3340
3341 for (;;)
3342 {
3343 PATFETCH (c);
3344 if ((c == '=' && *p == ']') || p == pend)
3345 break;
3346 if (c1 < MB_LEN_MAX)
3347 str[c1++] = c;
3348 else
3349 /* This is in any case an invalid class name. */
3350 str[0] = '\0';
3351 }
3352 str[c1] = '\0';
3353
3354 if (c == '=' && *p == ']' && str[0] != '\0')
3355 {
3356 /* If we have no collation data we use the default
3357 collation in which each character is in a class
3358 by itself. It also means that ASCII is the
3359 character set and therefore we cannot have character
3360 with more than one byte in the multibyte
3361 representation. */
3362 # ifdef _LIBC
3363 if (nrules == 0)
3364 # endif
3365 {
3366 if (c1 != 1)
3367 FREE_STACK_RETURN (REG_ECOLLATE);
3368
3369 /* Throw away the ] at the end of the equivalence
3370 class. */
3371 PATFETCH (c);
3372
3373 /* Set the bit for the character. */
3374 SET_LIST_BIT (str[0]);
3375 }
3376 # ifdef _LIBC
3377 else
3378 {
3379 /* Try to match the byte sequence in `str' against
3380 those known to the collate implementation.
3381 First find out whether the bytes in `str' are
3382 actually from exactly one character. */
3383 const int32_t *table;
3384 const unsigned char *weights;
3385 const unsigned char *extra;
3386 const int32_t *indirect;
3387 int32_t idx;
3388 const unsigned char *cp = str;
3389 int ch;
3390
3391 /* This #include defines a local function! */
3392 # include <locale/weight.h>
3393
3394 table = (const int32_t *)
3395 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3396 weights = (const unsigned char *)
3397 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3398 extra = (const unsigned char *)
3399 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3400 indirect = (const int32_t *)
3401 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3402
3403 idx = findidx (&cp);
3404 if (idx == 0 || cp < str + c1)
3405 /* This is no valid character. */
3406 FREE_STACK_RETURN (REG_ECOLLATE);
3407
3408 /* Throw away the ] at the end of the equivalence
3409 class. */
3410 PATFETCH (c);
3411
3412 /* Now we have to go through the whole table
3413 and find all characters which have the same
3414 first level weight.
3415
3416 XXX Note that this is not entirely correct.
3417 we would have to match multibyte sequences
3418 but this is not possible with the current
3419 implementation. */
3420 for (ch = 1; ch < 256; ++ch)
3421 /* XXX This test would have to be changed if we
3422 would allow matching multibyte sequences. */
3423 if (table[ch] > 0)
3424 {
3425 int32_t idx2 = table[ch];
3426 size_t len = weights[idx2];
3427
3428 /* Test whether the lengths match. */
3429 if (weights[idx] == len)
3430 {
3431 /* They do. Now compare the bytes of
3432 the weight. */
3433 size_t cnt = 0;
3434
3435 while (cnt < len
3436 && (weights[idx + 1 + cnt]
3437 == weights[idx2 + 1 + cnt]))
3438 ++cnt;
3439
3440 if (cnt == len)
3441 /* They match. Mark the character as
3442 acceptable. */
3443 SET_LIST_BIT (ch);
3444 }
3445 }
3446 }
3447 # endif
3448 had_char_class = true;
3449 }
3450 else
3451 {
3452 c1++;
3453 while (c1--)
3454 PATUNFETCH;
3455 SET_LIST_BIT ('[');
3456 SET_LIST_BIT ('=');
3457 range_start = '=';
3458 had_char_class = false;
3459 }
3460 }
3461 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == '.')
3462 {
3463 unsigned char str[128]; /* Should be large enough. */
3464 # ifdef _LIBC
3465 uint32_t nrules =
3466 _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3467 # endif
3468
3469 PATFETCH (c);
3470 c1 = 0;
3471
3472 /* If pattern is `[[.'. */
3473 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3474
3475 for (;;)
3476 {
3477 PATFETCH (c);
3478 if ((c == '.' && *p == ']') || p == pend)
3479 break;
3480 if (c1 < sizeof (str))
3481 str[c1++] = c;
3482 else
3483 /* This is in any case an invalid class name. */
3484 str[0] = '\0';
3485 }
3486 str[c1] = '\0';
3487
3488 if (c == '.' && *p == ']' && str[0] != '\0')
3489 {
3490 /* If we have no collation data we use the default
3491 collation in which each character is the name
3492 for its own class which contains only the one
3493 character. It also means that ASCII is the
3494 character set and therefore we cannot have character
3495 with more than one byte in the multibyte
3496 representation. */
3497 # ifdef _LIBC
3498 if (nrules == 0)
3499 # endif
3500 {
3501 if (c1 != 1)
3502 FREE_STACK_RETURN (REG_ECOLLATE);
3503
3504 /* Throw away the ] at the end of the collating
3505 symbol. */
3506 PATFETCH (c);
3507
3508 /* Set the bit for the character. */
3509 SET_LIST_BIT (str[0]);
3510 range_start = ((const unsigned char *) str)[0];
3511 }
3512 # ifdef _LIBC
3513 else
3514 {
3515 /* Try to match the byte sequence in `str' against
3516 those known to the collate implementation.
3517 First find out whether the bytes in `str' are
3518 actually from exactly one character. */
3519 int32_t table_size;
3520 const int32_t *symb_table;
3521 const unsigned char *extra;
3522 int32_t idx;
3523 int32_t elem;
3524 int32_t second;
3525 int32_t hash;
3526
3527 table_size =
3528 _NL_CURRENT_WORD (LC_COLLATE,
3529 _NL_COLLATE_SYMB_HASH_SIZEMB);
3530 symb_table = (const int32_t *)
3531 _NL_CURRENT (LC_COLLATE,
3532 _NL_COLLATE_SYMB_TABLEMB);
3533 extra = (const unsigned char *)
3534 _NL_CURRENT (LC_COLLATE,
3535 _NL_COLLATE_SYMB_EXTRAMB);
3536
3537 /* Locate the character in the hashing table. */
3538 hash = elem_hash (str, c1);
3539
3540 idx = 0;
3541 elem = hash % table_size;
3542 second = hash % (table_size - 2);
3543 while (symb_table[2 * elem] != 0)
3544 {
3545 /* First compare the hashing value. */
3546 if (symb_table[2 * elem] == hash
3547 && c1 == extra[symb_table[2 * elem + 1]]
3548 && memcmp (str,
3549 &extra[symb_table[2 * elem + 1]
3550 + 1],
3551 c1) == 0)
3552 {
3553 /* Yep, this is the entry. */
3554 idx = symb_table[2 * elem + 1];
3555 idx += 1 + extra[idx];
3556 break;
3557 }
3558
3559 /* Next entry. */
3560 elem += second;
3561 }
3562
3563 if (symb_table[2 * elem] == 0)
3564 /* This is no valid character. */
3565 FREE_STACK_RETURN (REG_ECOLLATE);
3566
3567 /* Throw away the ] at the end of the collating
3568 symbol. */
3569 PATFETCH (c);
3570
3571 /* Now add the multibyte character(s) we found
3572 to the accept list.
3573
3574 XXX Note that this is not entirely correct.
3575 we would have to match multibyte sequences
3576 but this is not possible with the current
3577 implementation. Also, we have to match
3578 collating symbols, which expand to more than
3579 one byte, as a whole and not allow the
3580 individual bytes. */
3581 c1 = extra[idx++];
3582 if (c1 == 1)
3583 range_start = extra[idx];
3584 while (c1-- > 0)
3585 {
3586 SET_LIST_BIT (extra[idx]);
3587 ++idx;
3588 }
3589 }
3590 # endif
3591 had_char_class = false;
3592 }
3593 else
3594 {
3595 c1++;
3596 while (c1--)
3597 PATUNFETCH;
3598 SET_LIST_BIT ('[');
3599 SET_LIST_BIT ('.');
3600 range_start = '.';
3601 had_char_class = false;
3602 }
3603 }
3604 else
3605 {
3606 had_char_class = false;
3607 SET_LIST_BIT (c);
3608 range_start = c;
3609 }
3610 }
3611
3612 /* Discard any (non)matching list bytes that are all 0 at the
3613 end of the map. Decrease the map-length byte too. */
3614 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
3615 b[-1]--;
3616 b += b[-1];
3617 #endif /* WCHAR */
3618 }
3619 break;
3620
3621
3622 case '(':
3623 if (syntax & RE_NO_BK_PARENS)
3624 goto handle_open;
3625 else
3626 goto normal_char;
3627
3628
3629 case ')':
3630 if (syntax & RE_NO_BK_PARENS)
3631 goto handle_close;
3632 else
3633 goto normal_char;
3634
3635
3636 case '\n':
3637 if (syntax & RE_NEWLINE_ALT)
3638 goto handle_alt;
3639 else
3640 goto normal_char;
3641
3642
3643 case '|':
3644 if (syntax & RE_NO_BK_VBAR)
3645 goto handle_alt;
3646 else
3647 goto normal_char;
3648
3649
3650 case '{':
3651 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
3652 goto handle_interval;
3653 else
3654 goto normal_char;
3655
3656
3657 case '\\':
3658 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
3659
3660 /* Do not translate the character after the \, so that we can
3661 distinguish, e.g., \B from \b, even if we normally would
3662 translate, e.g., B to b. */
3663 PATFETCH_RAW (c);
3664
3665 switch (c)
3666 {
3667 case '(':
3668 if (syntax & RE_NO_BK_PARENS)
3669 goto normal_backslash;
3670
3671 handle_open:
3672 bufp->re_nsub++;
3673 regnum++;
3674
3675 if (COMPILE_STACK_FULL)
3676 {
3677 RETALLOC (compile_stack.stack, compile_stack.size << 1,
3678 compile_stack_elt_t);
3679 if (compile_stack.stack == NULL) return REG_ESPACE;
3680
3681 compile_stack.size <<= 1;
3682 }
3683
3684 /* These are the values to restore when we hit end of this
3685 group. They are all relative offsets, so that if the
3686 whole pattern moves because of realloc, they will still
3687 be valid. */
3688 COMPILE_STACK_TOP.begalt_offset = begalt - COMPILED_BUFFER_VAR;
3689 COMPILE_STACK_TOP.fixup_alt_jump
3690 = fixup_alt_jump ? fixup_alt_jump - COMPILED_BUFFER_VAR + 1 : 0;
3691 COMPILE_STACK_TOP.laststart_offset = b - COMPILED_BUFFER_VAR;
3692 COMPILE_STACK_TOP.regnum = regnum;
3693
3694 /* We will eventually replace the 0 with the number of
3695 groups inner to this one. But do not push a
3696 start_memory for groups beyond the last one we can
3697 represent in the compiled pattern. */
3698 if (regnum <= MAX_REGNUM)
3699 {
3700 COMPILE_STACK_TOP.inner_group_offset = b
3701 - COMPILED_BUFFER_VAR + 2;
3702 BUF_PUSH_3 (start_memory, regnum, 0);
3703 }
3704
3705 compile_stack.avail++;
3706
3707 fixup_alt_jump = 0;
3708 laststart = 0;
3709 begalt = b;
3710 /* If we've reached MAX_REGNUM groups, then this open
3711 won't actually generate any code, so we'll have to
3712 clear pending_exact explicitly. */
3713 pending_exact = 0;
3714 break;
3715
3716
3717 case ')':
3718 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
3719
3720 if (COMPILE_STACK_EMPTY)
3721 {
3722 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3723 goto normal_backslash;
3724 else
3725 FREE_STACK_RETURN (REG_ERPAREN);
3726 }
3727
3728 handle_close:
3729 if (fixup_alt_jump)
3730 { /* Push a dummy failure point at the end of the
3731 alternative for a possible future
3732 `pop_failure_jump' to pop. See comments at
3733 `push_dummy_failure' in `re_match_2'. */
3734 BUF_PUSH (push_dummy_failure);
3735
3736 /* We allocated space for this jump when we assigned
3737 to `fixup_alt_jump', in the `handle_alt' case below. */
3738 STORE_JUMP (jump_past_alt, fixup_alt_jump, b - 1);
3739 }
3740
3741 /* See similar code for backslashed left paren above. */
3742 if (COMPILE_STACK_EMPTY)
3743 {
3744 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3745 goto normal_char;
3746 else
3747 FREE_STACK_RETURN (REG_ERPAREN);
3748 }
3749
3750 /* Since we just checked for an empty stack above, this
3751 ``can't happen''. */
3752 assert (compile_stack.avail != 0);
3753 {
3754 /* We don't just want to restore into `regnum', because
3755 later groups should continue to be numbered higher,
3756 as in `(ab)c(de)' -- the second group is #2. */
3757 regnum_t this_group_regnum;
3758
3759 compile_stack.avail--;
3760 begalt = COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.begalt_offset;
3761 fixup_alt_jump
3762 = COMPILE_STACK_TOP.fixup_alt_jump
3763 ? COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.fixup_alt_jump - 1
3764 : 0;
3765 laststart = COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.laststart_offset;
3766 this_group_regnum = COMPILE_STACK_TOP.regnum;
3767 /* If we've reached MAX_REGNUM groups, then this close
3768 won't actually generate any code, so we'll have to
3769 clear pending_exact explicitly. */
3770 pending_exact = 0;
3771
3772 /* We're at the end of the group, so now we know how many
3773 groups were inside this one. */
3774 if (this_group_regnum <= MAX_REGNUM)
3775 {
3776 UCHAR_T *inner_group_loc
3777 = COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.inner_group_offset;
3778
3779 *inner_group_loc = regnum - this_group_regnum;
3780 BUF_PUSH_3 (stop_memory, this_group_regnum,
3781 regnum - this_group_regnum);
3782 }
3783 }
3784 break;
3785
3786
3787 case '|': /* `\|'. */
3788 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
3789 goto normal_backslash;
3790 handle_alt:
3791 if (syntax & RE_LIMITED_OPS)
3792 goto normal_char;
3793
3794 /* Insert before the previous alternative a jump which
3795 jumps to this alternative if the former fails. */
3796 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
3797 INSERT_JUMP (on_failure_jump, begalt,
3798 b + 2 + 2 * OFFSET_ADDRESS_SIZE);
3799 pending_exact = 0;
3800 b += 1 + OFFSET_ADDRESS_SIZE;
3801
3802 /* The alternative before this one has a jump after it
3803 which gets executed if it gets matched. Adjust that
3804 jump so it will jump to this alternative's analogous
3805 jump (put in below, which in turn will jump to the next
3806 (if any) alternative's such jump, etc.). The last such
3807 jump jumps to the correct final destination. A picture:
3808 _____ _____
3809 | | | |
3810 | v | v
3811 a | b | c
3812
3813 If we are at `b', then fixup_alt_jump right now points to a
3814 three-byte space after `a'. We'll put in the jump, set
3815 fixup_alt_jump to right after `b', and leave behind three
3816 bytes which we'll fill in when we get to after `c'. */
3817
3818 if (fixup_alt_jump)
3819 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
3820
3821 /* Mark and leave space for a jump after this alternative,
3822 to be filled in later either by next alternative or
3823 when we know we're at the end of a series of alternatives. */
3824 fixup_alt_jump = b;
3825 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
3826 b += 1 + OFFSET_ADDRESS_SIZE;
3827
3828 laststart = 0;
3829 begalt = b;
3830 break;
3831
3832
3833 case '{':
3834 /* If \{ is a literal. */
3835 if (!(syntax & RE_INTERVALS)
3836 /* If we're at `\{' and it's not the open-interval
3837 operator. */
3838 || (syntax & RE_NO_BK_BRACES))
3839 goto normal_backslash;
3840
3841 handle_interval:
3842 {
3843 /* If got here, then the syntax allows intervals. */
3844
3845 /* At least (most) this many matches must be made. */
3846 int lower_bound = -1, upper_bound = -1;
3847
3848 /* Place in the uncompiled pattern (i.e., just after
3849 the '{') to go back to if the interval is invalid. */
3850 const CHAR_T *beg_interval = p;
3851
3852 if (p == pend)
3853 goto invalid_interval;
3854
3855 GET_UNSIGNED_NUMBER (lower_bound);
3856
3857 if (c == ',')
3858 {
3859 GET_UNSIGNED_NUMBER (upper_bound);
3860 if (upper_bound < 0)
3861 upper_bound = RE_DUP_MAX;
3862 }
3863 else
3864 /* Interval such as `{1}' => match exactly once. */
3865 upper_bound = lower_bound;
3866
3867 if (! (0 <= lower_bound && lower_bound <= upper_bound))
3868 goto invalid_interval;
3869
3870 if (!(syntax & RE_NO_BK_BRACES))
3871 {
3872 if (c != '\\' || p == pend)
3873 goto invalid_interval;
3874 PATFETCH (c);
3875 }
3876
3877 if (c != '}')
3878 goto invalid_interval;
3879
3880 /* If it's invalid to have no preceding re. */
3881 if (!laststart)
3882 {
3883 if (syntax & RE_CONTEXT_INVALID_OPS
3884 && !(syntax & RE_INVALID_INTERVAL_ORD))
3885 FREE_STACK_RETURN (REG_BADRPT);
3886 else if (syntax & RE_CONTEXT_INDEP_OPS)
3887 laststart = b;
3888 else
3889 goto unfetch_interval;
3890 }
3891
3892 /* We just parsed a valid interval. */
3893
3894 if (RE_DUP_MAX < upper_bound)
3895 FREE_STACK_RETURN (REG_BADBR);
3896
3897 /* If the upper bound is zero, don't want to succeed at
3898 all; jump from `laststart' to `b + 3', which will be
3899 the end of the buffer after we insert the jump. */
3900 /* ifdef WCHAR, 'b + 1 + OFFSET_ADDRESS_SIZE'
3901 instead of 'b + 3'. */
3902 if (upper_bound == 0)
3903 {
3904 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
3905 INSERT_JUMP (jump, laststart, b + 1
3906 + OFFSET_ADDRESS_SIZE);
3907 b += 1 + OFFSET_ADDRESS_SIZE;
3908 }
3909
3910 /* Otherwise, we have a nontrivial interval. When
3911 we're all done, the pattern will look like:
3912 set_number_at <jump count> <upper bound>
3913 set_number_at <succeed_n count> <lower bound>
3914 succeed_n <after jump addr> <succeed_n count>
3915 <body of loop>
3916 jump_n <succeed_n addr> <jump count>
3917 (The upper bound and `jump_n' are omitted if
3918 `upper_bound' is 1, though.) */
3919 else
3920 { /* If the upper bound is > 1, we need to insert
3921 more at the end of the loop. */
3922 unsigned nbytes = 2 + 4 * OFFSET_ADDRESS_SIZE +
3923 (upper_bound > 1) * (2 + 4 * OFFSET_ADDRESS_SIZE);
3924
3925 GET_BUFFER_SPACE (nbytes);
3926
3927 /* Initialize lower bound of the `succeed_n', even
3928 though it will be set during matching by its
3929 attendant `set_number_at' (inserted next),
3930 because `re_compile_fastmap' needs to know.
3931 Jump to the `jump_n' we might insert below. */
3932 INSERT_JUMP2 (succeed_n, laststart,
3933 b + 1 + 2 * OFFSET_ADDRESS_SIZE
3934 + (upper_bound > 1) * (1 + 2 * OFFSET_ADDRESS_SIZE)
3935 , lower_bound);
3936 b += 1 + 2 * OFFSET_ADDRESS_SIZE;
3937
3938 /* Code to initialize the lower bound. Insert
3939 before the `succeed_n'. The `5' is the last two
3940 bytes of this `set_number_at', plus 3 bytes of
3941 the following `succeed_n'. */
3942 /* ifdef WCHAR, The '1+2*OFFSET_ADDRESS_SIZE'
3943 is the 'set_number_at', plus '1+OFFSET_ADDRESS_SIZE'
3944 of the following `succeed_n'. */
3945 PREFIX(insert_op2) (set_number_at, laststart, 1
3946 + 2 * OFFSET_ADDRESS_SIZE, lower_bound, b);
3947 b += 1 + 2 * OFFSET_ADDRESS_SIZE;
3948
3949 if (upper_bound > 1)
3950 { /* More than one repetition is allowed, so
3951 append a backward jump to the `succeed_n'
3952 that starts this interval.
3953
3954 When we've reached this during matching,
3955 we'll have matched the interval once, so
3956 jump back only `upper_bound - 1' times. */
3957 STORE_JUMP2 (jump_n, b, laststart
3958 + 2 * OFFSET_ADDRESS_SIZE + 1,
3959 upper_bound - 1);
3960 b += 1 + 2 * OFFSET_ADDRESS_SIZE;
3961
3962 /* The location we want to set is the second
3963 parameter of the `jump_n'; that is `b-2' as
3964 an absolute address. `laststart' will be
3965 the `set_number_at' we're about to insert;
3966 `laststart+3' the number to set, the source
3967 for the relative address. But we are
3968 inserting into the middle of the pattern --
3969 so everything is getting moved up by 5.
3970 Conclusion: (b - 2) - (laststart + 3) + 5,
3971 i.e., b - laststart.
3972
3973 We insert this at the beginning of the loop
3974 so that if we fail during matching, we'll
3975 reinitialize the bounds. */
3976 PREFIX(insert_op2) (set_number_at, laststart,
3977 b - laststart,
3978 upper_bound - 1, b);
3979 b += 1 + 2 * OFFSET_ADDRESS_SIZE;
3980 }
3981 }
3982 pending_exact = 0;
3983 break;
3984
3985 invalid_interval:
3986 if (!(syntax & RE_INVALID_INTERVAL_ORD))
3987 FREE_STACK_RETURN (p == pend ? REG_EBRACE : REG_BADBR);
3988 unfetch_interval:
3989 /* Match the characters as literals. */
3990 p = beg_interval;
3991 c = '{';
3992 if (syntax & RE_NO_BK_BRACES)
3993 goto normal_char;
3994 else
3995 goto normal_backslash;
3996 }
3997
3998 #ifdef emacs
3999 /* There is no way to specify the before_dot and after_dot
4000 operators. rms says this is ok. --karl */
4001 case '=':
4002 BUF_PUSH (at_dot);
4003 break;
4004
4005 case 's':
4006 laststart = b;
4007 PATFETCH (c);
4008 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
4009 break;
4010
4011 case 'S':
4012 laststart = b;
4013 PATFETCH (c);
4014 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
4015 break;
4016 #endif /* emacs */
4017
4018
4019 case 'w':
4020 if (syntax & RE_NO_GNU_OPS)
4021 goto normal_char;
4022 laststart = b;
4023 BUF_PUSH (wordchar);
4024 break;
4025
4026
4027 case 'W':
4028 if (syntax & RE_NO_GNU_OPS)
4029 goto normal_char;
4030 laststart = b;
4031 BUF_PUSH (notwordchar);
4032 break;
4033
4034
4035 case '<':
4036 if (syntax & RE_NO_GNU_OPS)
4037 goto normal_char;
4038 BUF_PUSH (wordbeg);
4039 break;
4040
4041 case '>':
4042 if (syntax & RE_NO_GNU_OPS)
4043 goto normal_char;
4044 BUF_PUSH (wordend);
4045 break;
4046
4047 case 'b':
4048 if (syntax & RE_NO_GNU_OPS)
4049 goto normal_char;
4050 BUF_PUSH (wordbound);
4051 break;
4052
4053 case 'B':
4054 if (syntax & RE_NO_GNU_OPS)
4055 goto normal_char;
4056 BUF_PUSH (notwordbound);
4057 break;
4058
4059 case '`':
4060 if (syntax & RE_NO_GNU_OPS)
4061 goto normal_char;
4062 BUF_PUSH (begbuf);
4063 break;
4064
4065 case '\'':
4066 if (syntax & RE_NO_GNU_OPS)
4067 goto normal_char;
4068 BUF_PUSH (endbuf);
4069 break;
4070
4071 case '1': case '2': case '3': case '4': case '5':
4072 case '6': case '7': case '8': case '9':
4073 if (syntax & RE_NO_BK_REFS)
4074 goto normal_char;
4075
4076 c1 = c - '0';
4077
4078 if (c1 > regnum)
4079 FREE_STACK_RETURN (REG_ESUBREG);
4080
4081 /* Can't back reference to a subexpression if inside of it. */
4082 if (group_in_compile_stack (compile_stack, (regnum_t) c1))
4083 goto normal_char;
4084
4085 laststart = b;
4086 BUF_PUSH_2 (duplicate, c1);
4087 break;
4088
4089
4090 case '+':
4091 case '?':
4092 if (syntax & RE_BK_PLUS_QM)
4093 goto handle_plus;
4094 else
4095 goto normal_backslash;
4096
4097 default:
4098 normal_backslash:
4099 /* You might think it would be useful for \ to mean
4100 not to translate; but if we don't translate it
4101 it will never match anything. */
4102 c = TRANSLATE (c);
4103 goto normal_char;
4104 }
4105 break;
4106
4107
4108 default:
4109 /* Expects the character in `c'. */
4110 normal_char:
4111 /* If no exactn currently being built. */
4112 if (!pending_exact
4113 #ifdef WCHAR
4114 /* If last exactn handle binary(or character) and
4115 new exactn handle character(or binary). */
4116 || is_exactn_bin != is_binary[p - 1 - pattern]
4117 #endif /* WCHAR */
4118
4119 /* If last exactn not at current position. */
4120 || pending_exact + *pending_exact + 1 != b
4121
4122 /* We have only one byte following the exactn for the count. */
4123 || *pending_exact == (1 << BYTEWIDTH) - 1
4124
4125 /* If followed by a repetition operator. */
4126 || *p == '*' || *p == '^'
4127 || ((syntax & RE_BK_PLUS_QM)
4128 ? *p == '\\' && (p[1] == '+' || p[1] == '?')
4129 : (*p == '+' || *p == '?'))
4130 || ((syntax & RE_INTERVALS)
4131 && ((syntax & RE_NO_BK_BRACES)
4132 ? *p == '{'
4133 : (p[0] == '\\' && p[1] == '{'))))
4134 {
4135 /* Start building a new exactn. */
4136
4137 laststart = b;
4138
4139 #ifdef WCHAR
4140 /* Is this exactn binary data or character? */
4141 is_exactn_bin = is_binary[p - 1 - pattern];
4142 if (is_exactn_bin)
4143 BUF_PUSH_2 (exactn_bin, 0);
4144 else
4145 BUF_PUSH_2 (exactn, 0);
4146 #else
4147 BUF_PUSH_2 (exactn, 0);
4148 #endif /* WCHAR */
4149 pending_exact = b - 1;
4150 }
4151
4152 BUF_PUSH (c);
4153 (*pending_exact)++;
4154 break;
4155 } /* switch (c) */
4156 } /* while p != pend */
4157
4158
4159 /* Through the pattern now. */
4160
4161 if (fixup_alt_jump)
4162 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
4163
4164 if (!COMPILE_STACK_EMPTY)
4165 FREE_STACK_RETURN (REG_EPAREN);
4166
4167 /* If we don't want backtracking, force success
4168 the first time we reach the end of the compiled pattern. */
4169 if (syntax & RE_NO_POSIX_BACKTRACKING)
4170 BUF_PUSH (succeed);
4171
4172 #ifdef WCHAR
4173 free (pattern);
4174 free (mbs_offset);
4175 free (is_binary);
4176 #endif
4177 free (compile_stack.stack);
4178
4179 /* We have succeeded; set the length of the buffer. */
4180 #ifdef WCHAR
4181 bufp->used = (uintptr_t) b - (uintptr_t) COMPILED_BUFFER_VAR;
4182 #else
4183 bufp->used = b - bufp->buffer;
4184 #endif
4185
4186 #ifdef DEBUG
4187 if (debug)
4188 {
4189 DEBUG_PRINT1 ("\nCompiled pattern: \n");
4190 PREFIX(print_compiled_pattern) (bufp);
4191 }
4192 #endif /* DEBUG */
4193
4194 #ifndef MATCH_MAY_ALLOCATE
4195 /* Initialize the failure stack to the largest possible stack. This
4196 isn't necessary unless we're trying to avoid calling alloca in
4197 the search and match routines. */
4198 {
4199 int num_regs = bufp->re_nsub + 1;
4200
4201 /* Since DOUBLE_FAIL_STACK refuses to double only if the current size
4202 is strictly greater than re_max_failures, the largest possible stack
4203 is 2 * re_max_failures failure points. */
4204 if (fail_stack.size < (2 * re_max_failures * MAX_FAILURE_ITEMS))
4205 {
4206 fail_stack.size = (2 * re_max_failures * MAX_FAILURE_ITEMS);
4207
4208 # ifdef emacs
4209 if (! fail_stack.stack)
4210 fail_stack.stack
4211 = (PREFIX(fail_stack_elt_t) *) xmalloc (fail_stack.size
4212 * sizeof (PREFIX(fail_stack_elt_t)));
4213 else
4214 fail_stack.stack
4215 = (PREFIX(fail_stack_elt_t) *) xrealloc (fail_stack.stack,
4216 (fail_stack.size
4217 * sizeof (PREFIX(fail_stack_elt_t))));
4218 # else /* not emacs */
4219 if (! fail_stack.stack)
4220 fail_stack.stack
4221 = (PREFIX(fail_stack_elt_t) *) malloc (fail_stack.size
4222 * sizeof (PREFIX(fail_stack_elt_t)));
4223 else
4224 fail_stack.stack
4225 = (PREFIX(fail_stack_elt_t) *) realloc (fail_stack.stack,
4226 (fail_stack.size
4227 * sizeof (PREFIX(fail_stack_elt_t))));
4228 # endif /* not emacs */
4229 }
4230
4231 PREFIX(regex_grow_registers) (num_regs);
4232 }
4233 #endif /* not MATCH_MAY_ALLOCATE */
4234
4235 return REG_NOERROR;
4236 } /* regex_compile */
4237
4238 /* Subroutines for `regex_compile'. */
4239
4240 /* Store OP at LOC followed by two-byte integer parameter ARG. */
4241 /* ifdef WCHAR, integer parameter is 1 wchar_t. */
4242
4243 static void
4244 PREFIX(store_op1) (re_opcode_t op, UCHAR_T *loc, int arg)
4245 {
4246 *loc = (UCHAR_T) op;
4247 STORE_NUMBER (loc + 1, arg);
4248 }
4249
4250
4251 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
4252 /* ifdef WCHAR, integer parameter is 1 wchar_t. */
4253
4254 static void
4255 PREFIX(store_op2) (re_opcode_t op, UCHAR_T *loc, int arg1, int arg2)
4256 {
4257 *loc = (UCHAR_T) op;
4258 STORE_NUMBER (loc + 1, arg1);
4259 STORE_NUMBER (loc + 1 + OFFSET_ADDRESS_SIZE, arg2);
4260 }
4261
4262
4263 /* Copy the bytes from LOC to END to open up three bytes of space at LOC
4264 for OP followed by two-byte integer parameter ARG. */
4265 /* ifdef WCHAR, integer parameter is 1 wchar_t. */
4266
4267 static void
4268 PREFIX(insert_op1) (re_opcode_t op, UCHAR_T *loc, int arg, UCHAR_T *end)
4269 {
4270 register UCHAR_T *pfrom = end;
4271 register UCHAR_T *pto = end + 1 + OFFSET_ADDRESS_SIZE;
4272
4273 while (pfrom != loc)
4274 *--pto = *--pfrom;
4275
4276 PREFIX(store_op1) (op, loc, arg);
4277 }
4278
4279
4280 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
4281 /* ifdef WCHAR, integer parameter is 1 wchar_t. */
4282
4283 static void
4284 PREFIX(insert_op2) (re_opcode_t op, UCHAR_T *loc, int arg1,
4285 int arg2, UCHAR_T *end)
4286 {
4287 register UCHAR_T *pfrom = end;
4288 register UCHAR_T *pto = end + 1 + 2 * OFFSET_ADDRESS_SIZE;
4289
4290 while (pfrom != loc)
4291 *--pto = *--pfrom;
4292
4293 PREFIX(store_op2) (op, loc, arg1, arg2);
4294 }
4295
4296
4297 /* P points to just after a ^ in PATTERN. Return true if that ^ comes
4298 after an alternative or a begin-subexpression. We assume there is at
4299 least one character before the ^. */
4300
4301 static boolean
4302 PREFIX(at_begline_loc_p) (const CHAR_T *pattern, const CHAR_T *p,
4303 reg_syntax_t syntax)
4304 {
4305 const CHAR_T *prev = p - 2;
4306 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
4307
4308 return
4309 /* After a subexpression? */
4310 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
4311 /* After an alternative? */
4312 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash));
4313 }
4314
4315
4316 /* The dual of at_begline_loc_p. This one is for $. We assume there is
4317 at least one character after the $, i.e., `P < PEND'. */
4318
4319 static boolean
4320 PREFIX(at_endline_loc_p) (const CHAR_T *p, const CHAR_T *pend,
4321 reg_syntax_t syntax)
4322 {
4323 const CHAR_T *next = p;
4324 boolean next_backslash = *next == '\\';
4325 const CHAR_T *next_next = p + 1 < pend ? p + 1 : 0;
4326
4327 return
4328 /* Before a subexpression? */
4329 (syntax & RE_NO_BK_PARENS ? *next == ')'
4330 : next_backslash && next_next && *next_next == ')')
4331 /* Before an alternative? */
4332 || (syntax & RE_NO_BK_VBAR ? *next == '|'
4333 : next_backslash && next_next && *next_next == '|');
4334 }
4335
4336 #else /* not INSIDE_RECURSION */
4337
4338 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and
4339 false if it's not. */
4340
4341 static boolean
4342 group_in_compile_stack (compile_stack_type compile_stack, regnum_t regnum)
4343 {
4344 int this_element;
4345
4346 for (this_element = compile_stack.avail - 1;
4347 this_element >= 0;
4348 this_element--)
4349 if (compile_stack.stack[this_element].regnum == regnum)
4350 return true;
4351
4352 return false;
4353 }
4354 #endif /* not INSIDE_RECURSION */
4355
4356 #ifdef INSIDE_RECURSION
4357
4358 #ifdef WCHAR
4359 /* This insert space, which size is "num", into the pattern at "loc".
4360 "end" must point the end of the allocated buffer. */
4361 static void
4362 insert_space (int num, CHAR_T *loc, CHAR_T *end)
4363 {
4364 register CHAR_T *pto = end;
4365 register CHAR_T *pfrom = end - num;
4366
4367 while (pfrom >= loc)
4368 *pto-- = *pfrom--;
4369 }
4370 #endif /* WCHAR */
4371
4372 #ifdef WCHAR
/* Wide-character variant of compile_range: add one range to the bracket
   expression being compiled.

   RANGE_START_CHAR is the first endpoint; the second endpoint is the
   character at *P_PTR (the pattern ends at PEND).  Two cells are opened
   with insert_space at `b - char_set[5] - 2', the two endpoints are stored
   there, and char_set[4] (labelled ranges_index below) is incremented.
   NOTE(review): char_set[5] is presumably the length of the data that
   follows the ranges in the compiled set -- confirm against the caller.

   Returns REG_ERANGE when the pattern is already exhausted (nothing is
   stored and *P_PTR is left alone), or when SYNTAX has RE_NO_EMPTY_RANGES
   and the start sorts after the end; in that second case the range is
   still stored and *P_PTR is still advanced.  Otherwise returns
   REG_NOERROR.  */
4373 static reg_errcode_t
4374 wcs_compile_range (CHAR_T range_start_char, const CHAR_T **p_ptr,
4375 const CHAR_T *pend, RE_TRANSLATE_TYPE translate,
4376 reg_syntax_t syntax, CHAR_T *b, CHAR_T *char_set)
4377 {
4378 const CHAR_T *p = *p_ptr;
4379 CHAR_T range_start, range_end;
4380 reg_errcode_t ret;
4381 # ifdef _LIBC
4382 uint32_t nrules;
4383 uint32_t start_val, end_val;
4384 # endif
/* No character is left to serve as the end of the range.  */
4385 if (p == pend)
4386 return REG_ERANGE;
4387
4388 # ifdef _LIBC
4389 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
/* With collation rules in the locale, the endpoints stored are collation
   sequence values rather than the characters themselves.  */
4390 if (nrules != 0)
4391 {
4392 const char *collseq = (const char *) _NL_CURRENT(LC_COLLATE,
4393 _NL_COLLATE_COLLSEQWC);
4394 const unsigned char *extra = (const unsigned char *)
4395 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
4396
4397 if (range_start_char < -1)
4398 {
4399 /* range_start is a collating symbol. */
4400 int32_t *wextra;
4401 /* Retrieve the index and get collation sequence value. */
4402 wextra = (int32_t*)(extra + char_set[-range_start_char]);
4403 start_val = wextra[1 + *wextra];
4404 }
4405 else
4406 start_val = collseq_table_lookup(collseq, TRANSLATE(range_start_char));
4407
4408 end_val = collseq_table_lookup (collseq, TRANSLATE (p[0]));
4409
4410 /* Report an error if the range is empty and the syntax prohibits
4411 this. */
4412 ret = ((syntax & RE_NO_EMPTY_RANGES)
4413 && (start_val > end_val))? REG_ERANGE : REG_NOERROR;
4414
4415 /* Insert space to the end of the char_ranges. */
4416 insert_space(2, b - char_set[5] - 2, b - 1);
4417 *(b - char_set[5] - 2) = (wchar_t)start_val;
4418 *(b - char_set[5] - 1) = (wchar_t)end_val;
4419 char_set[4]++; /* ranges_index */
4420 }
4421 else
4422 # endif
4423 {
/* A negative RANGE_START_CHAR is stored untranslated; only real
   characters go through TRANSLATE.  */
4424 range_start = (range_start_char >= 0)? TRANSLATE (range_start_char):
4425 range_start_char;
4426 range_end = TRANSLATE (p[0]);
4427 /* Report an error if the range is empty and the syntax prohibits
4428 this. */
4429 ret = ((syntax & RE_NO_EMPTY_RANGES)
4430 && (range_start > range_end))? REG_ERANGE : REG_NOERROR;
4431
4432 /* Insert space to the end of the char_ranges. */
4433 insert_space(2, b - char_set[5] - 2, b - 1);
4434 *(b - char_set[5] - 2) = range_start;
4435 *(b - char_set[5] - 1) = range_end;
4436 char_set[4]++; /* ranges_index */
4437 }
4438 /* Have to increment the pointer into the pattern string, so the
4439 caller isn't still at the ending character. */
4440 (*p_ptr)++;
4441
4442 return ret;
4443 }
4444 #else /* BYTE */
4445 /* Read the ending character of a range (in a bracket expression) from the
4446 uncompiled pattern *P_PTR (which ends at PEND). The starting character
4447 is passed in RANGE_START_CHAR. *P_PTR is advanced past the ending
4448 character. Then we set the translation of all bits between the starting
4449 and ending characters (inclusive) in the compiled pattern B.
4450
4451 Return an error code: REG_ERANGE if the pattern is exhausted, or if
4452 SYNTAX has RE_NO_EMPTY_RANGES and no bit was set; else REG_NOERROR.
4453 We use these short variable names so we can use the same macros as
4454 `regex_compile' itself. */
4455
4456 static reg_errcode_t
4457 byte_compile_range (unsigned int range_start_char, const char **p_ptr,
4458 const char *pend, RE_TRANSLATE_TYPE translate,
4459 reg_syntax_t syntax, unsigned char *b)
4460 {
4461 unsigned this_char;
4462 const char *p = *p_ptr;
4463 reg_errcode_t ret;
4464 # if _LIBC
4465 const unsigned char *collseq;
4466 unsigned int start_colseq;
4467 unsigned int end_colseq;
4468 # else
4469 unsigned end_char;
4470 # endif
4471
4472 if (p == pend)
4473 return REG_ERANGE;
4474
4475 /* Have to increment the pointer into the pattern string, so the
4476 caller isn't still at the ending character. The local copy P is
4477 not advanced, so p[0] below is still the ending character. */
4478 (*p_ptr)++;
4479
4480 /* Report an error if the range is empty and the syntax prohibits this.
   RET is reset to REG_NOERROR below as soon as one bit is set. */
4481 ret = syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
4482
4483 # if _LIBC
4484 collseq = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
4485 _NL_COLLATE_COLLSEQMB);
4486
4487 start_colseq = collseq[(unsigned char) TRANSLATE (range_start_char)];
4488 end_colseq = collseq[(unsigned char) TRANSLATE (p[0])];
/* Visit every byte value and select those whose collation sequence value
   lies between the two endpoints' values.  */
4489 for (this_char = 0; this_char <= (unsigned char) -1; ++this_char)
4490 {
4491 unsigned int this_colseq = collseq[(unsigned char) TRANSLATE (this_char)];
4492
4493 if (start_colseq <= this_colseq && this_colseq <= end_colseq)
4494 {
4495 SET_LIST_BIT (TRANSLATE (this_char));
4496 ret = REG_NOERROR;
4497 }
4498 }
4499 # else
4500 /* Here we see why `this_char' has to be larger than an `unsigned
4501 char' -- we would otherwise go into an infinite loop, since all
4502 characters <= 0xff. */
4503 range_start_char = TRANSLATE (range_start_char);
4504 /* TRANSLATE(p[0]) is cast to char (not unsigned char) in TRANSLATE,
4505 and some compilers convert it to int implicitly, so the following loop
4506 may become an (almost) infinite loop.
4507 e.g. If translate[p[0]] = 0xff, end_char may equal 0xffffffff.
4508 To avoid this, we cast the result to unsigned int and truncate it. */
4509 end_char = ((unsigned)TRANSLATE(p[0]) & ((1 << BYTEWIDTH) - 1));
4510
4511 for (this_char = range_start_char; this_char <= end_char; ++this_char)
4512 {
4513 SET_LIST_BIT (TRANSLATE (this_char));
4514 ret = REG_NOERROR;
4515 }
4516 # endif
4517
4518 return ret;
4519 }
4519 #endif /* WCHAR */
4520
4521 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
4523 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
4524 characters can start a string that matches the pattern. This fastmap
4525 is used by re_search to skip quickly over impossible starting points.
4526
4527 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
4528 area as BUFP->fastmap.
4529
4530 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
4531 the pattern buffer.
4532
4533 Returns 0 on success, or -2 if an internal error occurs. */
4534
4535 #ifdef WCHAR
4536 /* local function for re_compile_fastmap.
4537 truncate wchar_t character to char. */
4538 static unsigned char truncate_wchar (CHAR_T c);
4539
4540 static unsigned char
4541 truncate_wchar (CHAR_T c)
4542 {
4543 unsigned char buf[MB_CUR_MAX];
4544 mbstate_t state;
4545 int retval;
4546 memset (&state, '\0', sizeof (state));
4547 # ifdef _LIBC
4548 retval = __wcrtomb (buf, c, &state);
4549 # else
4550 retval = wcrtomb (buf, c, &state);
4551 # endif
4552 return retval > 0 ? buf[0] : (unsigned char) c;
4553 }
4554 #endif /* WCHAR */
4555
/* Build the fastmap for the compiled pattern in BUFP.

   The opcode stream is walked from its start.  Each opcode that consumes
   a character marks the bytes it can match in BUFP->fastmap and ends the
   current path; opcodes that can match the empty string are stepped over.
   Targets of on_failure_jump (alternatives still to explore) are kept on a
   local failure stack and visited once the current path ends.

   On return BUFP->fastmap_accurate is 1 and BUFP->can_be_null tells whether
   some path can match the empty string (it is also forced to 1 for opcodes
   for which no fastmap is computed).

   Returns 0 normally, or -2 if pushing onto the failure stack fails.  */
4556 static int
4557 PREFIX(re_compile_fastmap) (struct re_pattern_buffer *bufp)
4558 {
4559 int j, k;
4560 #ifdef MATCH_MAY_ALLOCATE
4561 PREFIX(fail_stack_type) fail_stack;
4562 #endif
4563 #ifndef REGEX_MALLOC
4564 char *destination;
4565 #endif
4566
4567 register char *fastmap = bufp->fastmap;
4568
4569 #ifdef WCHAR
4570 /* We need to cast pattern to (wchar_t*), because we cast this compiled
4571 pattern to (char*) in regex_compile. */
4572 UCHAR_T *pattern = (UCHAR_T*)bufp->buffer;
4573 register UCHAR_T *pend = (UCHAR_T*) (bufp->buffer + bufp->used);
4574 #else /* BYTE */
4575 UCHAR_T *pattern = bufp->buffer;
4576 register UCHAR_T *pend = pattern + bufp->used;
4577 #endif /* WCHAR */
4578 UCHAR_T *p = pattern;
4579
4580 #ifdef REL_ALLOC
4581 /* This holds the pointer to the failure stack, when
4582 it is allocated relocatably. */
4583 fail_stack_elt_t *failure_stack_ptr;
4584 #endif
4585
4586 /* Assume that each path through the pattern can be null until
4587 proven otherwise. We set this false at the bottom of switch
4588 statement, to which we get only if a particular path doesn't
4589 match the empty string. */
4590 boolean path_can_be_null = true;
4591
4592 /* We aren't doing a `succeed_n' to begin with. */
4593 boolean succeed_n_p = false;
4594
4595 assert (fastmap != NULL && p != NULL);
4596
4597 INIT_FAIL_STACK ();
4598 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */
4599 bufp->fastmap_accurate = 1; /* It will be when we're done. */
4600 bufp->can_be_null = 0;
4601
4602 while (1)
4603 {
4604 if (p == pend || *p == (UCHAR_T) succeed)
4605 {
4606 /* We have reached the (effective) end of pattern. */
4607 if (!FAIL_STACK_EMPTY ())
4608 {
4609 bufp->can_be_null |= path_can_be_null;
4610
4611 /* Reset for next path. */
4612 path_can_be_null = true;
4613
/* Resume at the most recently saved alternative.  */
4614 p = fail_stack.stack[--fail_stack.avail].pointer;
4615
4616 continue;
4617 }
4618 else
4619 break;
4620 }
4621
4622 /* We should never be about to go beyond the end of the pattern. */
4623 assert (p < pend);
4624
4625 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
4626 {
4627
4628 /* I guess the idea here is to simply not bother with a fastmap
4629 if a backreference is used, since it's too hard to figure out
4630 the fastmap for the corresponding group. Setting
4631 `can_be_null' stops `re_search_2' from using the fastmap, so
4632 that is all we do. */
4633 case duplicate:
4634 bufp->can_be_null = 1;
4635 goto done;
4636
4637
4638 /* Following are the cases which match a character. These end
4639 with `break'. */
4640
/* Only p[1] is entered in the map; p[0] is presumably the count cell of
   the literal run -- confirm against the exactn encoding.  In the
   wide-character build the character is first reduced to a byte.  */
4641 #ifdef WCHAR
4642 case exactn:
4643 fastmap[truncate_wchar(p[1])] = 1;
4644 break;
4645 #else /* BYTE */
4646 case exactn:
4647 fastmap[p[1]] = 1;
4648 break;
4649 #endif /* WCHAR */
4650 #ifdef MBS_SUPPORT
4651 case exactn_bin:
4652 fastmap[p[1]] = 1;
4653 break;
4654 #endif
4655
4656 #ifdef WCHAR
4657 /* It is hard to distinguish fastmap from (multi byte) characters
4658 which depends on current locale. */
4659 case charset:
4660 case charset_not:
4661 case wordchar:
4662 case notwordchar:
4663 bufp->can_be_null = 1;
4664 goto done;
4665 #else /* BYTE */
/* *p is the number of bitmap bytes that follow; every bit set in the
   bitmap marks a byte the set can match.  */
4666 case charset:
4667 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
4668 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
4669 fastmap[j] = 1;
4670 break;
4671
4672
4673 case charset_not:
4674 /* Chars beyond end of map must be allowed. */
4675 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
4676 fastmap[j] = 1;
4677
/* Within the bitmap, every clear bit marks a byte that can match.  */
4678 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
4679 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
4680 fastmap[j] = 1;
4681 break;
4682
4683
4684 case wordchar:
4685 for (j = 0; j < (1 << BYTEWIDTH); j++)
4686 if (SYNTAX (j) == Sword)
4687 fastmap[j] = 1;
4688 break;
4689
4690
4691 case notwordchar:
4692 for (j = 0; j < (1 << BYTEWIDTH); j++)
4693 if (SYNTAX (j) != Sword)
4694 fastmap[j] = 1;
4695 break;
4696 #endif /* WCHAR */
4697
4698 case anychar:
4699 {
/* Remember the entry for newline so it can be put back below.  */
4700 int fastmap_newline = fastmap['\n'];
4701
4702 /* `.' matches anything ... */
4703 for (j = 0; j < (1 << BYTEWIDTH); j++)
4704 fastmap[j] = 1;
4705
4706 /* ... except perhaps newline. */
4707 if (!(bufp->syntax & RE_DOT_NEWLINE))
4708 fastmap['\n'] = fastmap_newline;
4709
4710 /* Return if we have already set `can_be_null'; if we have,
4711 then the fastmap is irrelevant. Something's wrong here. */
4712 else if (bufp->can_be_null)
4713 goto done;
4714
4715 /* Otherwise, have to check alternative paths. */
4716 break;
4717 }
4718
4719 #ifdef emacs
4720 case syntaxspec:
4721 k = *p++;
4722 for (j = 0; j < (1 << BYTEWIDTH); j++)
4723 if (SYNTAX (j) == (enum syntaxcode) k)
4724 fastmap[j] = 1;
4725 break;
4726
4727
4728 case notsyntaxspec:
4729 k = *p++;
4730 for (j = 0; j < (1 << BYTEWIDTH); j++)
4731 if (SYNTAX (j) != (enum syntaxcode) k)
4732 fastmap[j] = 1;
4733 break;
4734
4735
4736 /* All cases after this match the empty string. These end with
4737 `continue'. */
4738
4739
4740 case before_dot:
4741 case at_dot:
4742 case after_dot:
4743 continue;
4744 #endif /* emacs */
4745
4746
4747 case no_op:
4748 case begline:
4749 case endline:
4750 case begbuf:
4751 case endbuf:
4752 case wordbound:
4753 case notwordbound:
4754 case wordbeg:
4755 case wordend:
4756 case push_dummy_failure:
4757 continue;
4758
4759
4760 case jump_n:
4761 case pop_failure_jump:
4762 case maybe_pop_jump:
4763 case jump:
4764 case jump_past_alt:
4765 case dummy_failure_jump:
/* Read the jump offset and follow it; a forward jump needs nothing more.  */
4766 EXTRACT_NUMBER_AND_INCR (j, p);
4767 p += j;
4768 if (j > 0)
4769 continue;
4770
4771 /* Jump backward implies we just went through the body of a
4772 loop and matched nothing. Opcode jumped to should be
4773 `on_failure_jump' or `succeed_n'. Just treat it like an
4774 ordinary jump. For a * loop, it has pushed its failure
4775 point already; if so, discard that as redundant. */
4776 if ((re_opcode_t) *p != on_failure_jump
4777 && (re_opcode_t) *p != succeed_n)
4778 continue;
4779
4780 p++;
4781 EXTRACT_NUMBER_AND_INCR (j, p);
4782 p += j;
4783
4784 /* If what's on the stack is where we are now, pop it. */
4785 if (!FAIL_STACK_EMPTY ()
4786 && fail_stack.stack[fail_stack.avail - 1].pointer == p)
4787 fail_stack.avail--;
4788
4789 continue;
4790
4791
4792 case on_failure_jump:
4793 case on_failure_keep_string_jump:
4794 handle_on_failure_jump:
4795 EXTRACT_NUMBER_AND_INCR (j, p);
4796
4797 /* For some patterns, e.g., `(a?)?', `p+j' here points to the
4798 end of the pattern. We don't want to push such a point,
4799 since when we restore it above, entering the switch will
4800 increment `p' past the end of the pattern. We don't need
4801 to push such a point since we obviously won't find any more
4802 fastmap entries beyond `pend'. Such a pattern can match
4803 the null string, though. */
4804 if (p + j < pend)
4805 {
4806 if (!PUSH_PATTERN_OP (p + j, fail_stack))
4807 {
4808 RESET_FAIL_STACK ();
4809 return -2;
4810 }
4811 }
4812 else
4813 bufp->can_be_null = 1;
4814
/* Reached through the `succeed_n' case below: step over its count.  */
4815 if (succeed_n_p)
4816 {
4817 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */
4818 succeed_n_p = false;
4819 }
4820
4821 continue;
4822
4823
4824 case succeed_n:
4825 /* Get to the number of times to succeed. */
4826 p += OFFSET_ADDRESS_SIZE;
4827
4828 /* Increment p past the n for when k != 0. */
4829 EXTRACT_NUMBER_AND_INCR (k, p);
4830 if (k == 0)
4831 {
/* A zero count is handled like on_failure_jump: rewind P to the jump
   offset and share that code.  */
4832 p -= 2 * OFFSET_ADDRESS_SIZE;
4833 succeed_n_p = true; /* Spaghetti code alert. */
4834 goto handle_on_failure_jump;
4835 }
4836 continue;
4837
4838
/* Step over the two numeric operands.  */
4839 case set_number_at:
4840 p += 2 * OFFSET_ADDRESS_SIZE;
4841 continue;
4842
4843
/* Step over the two operand cells that follow these opcodes.  */
4844 case start_memory:
4845 case stop_memory:
4846 p += 2;
4847 continue;
4848
4849
4850 default:
4851 abort (); /* We have listed all the cases. */
4852 } /* switch *p++ */
4853
4854 /* Getting here means we have found the possible starting
4855 characters for one path of the pattern -- and that the empty
4856 string does not match. We need not follow this path further.
4857 Instead, look at the next alternative (remembered on the
4858 stack), or quit if no more. The test at the top of the loop
4859 does these things. */
4860 path_can_be_null = false;
4861 p = pend;
4862 } /* while p */
4863
4864 /* Set `can_be_null' for the last path (also the first path, if the
4865 pattern is empty). */
4866 bufp->can_be_null |= path_can_be_null;
4867
4868 done:
4869 RESET_FAIL_STACK ();
4870 return 0;
4871 }
4872
4873 #else /* not INSIDE_RECURSION */
4874
/* Public entry point: compute the fastmap for BUFP with the
   wide-character implementation when MBS_SUPPORT is compiled in and the
   current locale is multibyte, and with the byte implementation
   otherwise.  Returns whatever the chosen implementation returns.  */
int
re_compile_fastmap (struct re_pattern_buffer *bufp)
{
# ifdef MBS_SUPPORT
  if (MB_CUR_MAX != 1)
    return wcs_re_compile_fastmap (bufp);
# endif
  return byte_re_compile_fastmap (bufp);
} /* re_compile_fastmap */
4885 #ifdef _LIBC
4886 weak_alias (__re_compile_fastmap, re_compile_fastmap)
4887 #endif
4888
4889
4891 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
4892 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
4893 this memory for recording register information. STARTS and ENDS
4894 must be allocated using the malloc library routine, and must each
4895 be at least NUM_REGS * sizeof (regoff_t) bytes long.
4896
4897 If NUM_REGS == 0, then subsequent matches should allocate their own
4898 register data.
4899
4900 Unless this function is called, the first search or match using
4901 PATTERN_BUFFER will allocate its own register data, without
4902 freeing the old data. */
4903
4904 void
4905 re_set_registers (struct re_pattern_buffer *bufp,
4906 struct re_registers *regs, unsigned num_regs,
4907 regoff_t *starts, regoff_t *ends)
4908 {
4909 if (num_regs)
4910 {
4911 bufp->regs_allocated = REGS_REALLOCATE;
4912 regs->num_regs = num_regs;
4913 regs->start = starts;
4914 regs->end = ends;
4915 }
4916 else
4917 {
4918 bufp->regs_allocated = REGS_UNALLOCATED;
4919 regs->num_regs = 0;
4920 regs->start = regs->end = (regoff_t *) 0;
4921 }
4922 }
4923 #ifdef _LIBC
4924 weak_alias (__re_set_registers, re_set_registers)
4925 #endif
4926
4927 /* Searching routines. */
4929
/* Single-string form of re_search_2: search STRING, of length SIZE, with
   no separate limit on where matching must stop.  */

int
re_search (struct re_pattern_buffer *bufp, const char *string, int size,
           int startpos, int range, struct re_registers *regs)
{
  /* There is no first string; STRING is passed as the second string and
     matching may run through to its end.  */
  const char *no_string1 = NULL;
  const int no_size1 = 0;
  const int stop = size;

  return re_search_2 (bufp, no_string1, no_size1, string, size, startpos,
                      range, regs, stop);
}
4940 #ifdef _LIBC
4941 weak_alias (__re_search, re_search)
4942 #endif
4943
4944
/* Search the virtual concatenation of STRING1 (length SIZE1) and STRING2
   (length SIZE2) for a match of the compiled pattern in BUFP->buffer.
   Matching is attempted first at index STARTPOS, then at STARTPOS + 1,
   and so on.

   RANGE is how far to scan while trying to match.  RANGE = 0 means try
   only at STARTPOS; in general the last start tried is STARTPOS + RANGE.

   REGS receives the indices, within the virtual concatenation, of the
   text matched by the whole of BUFP->buffer and by its subexpressions.

   Matching is not considered beyond index STOP of the virtual
   concatenation.

   The result is the position at which the match was found, -1 if there
   is no match, or -2 on error (such as failure stack overflow).  */

int
re_search_2 (struct re_pattern_buffer *bufp, const char *string1, int size1,
             const char *string2, int size2, int startpos, int range,
             struct re_registers *regs, int stop)
{
# ifdef MBS_SUPPORT
  /* Multibyte locale: use the wide-character implementation.  */
  if (MB_CUR_MAX != 1)
    return wcs_re_search_2 (bufp, string1, size1, string2, size2, startpos,
                            range, regs, stop);
# endif
  return byte_re_search_2 (bufp, string1, size1, string2, size2, startpos,
                           range, regs, stop);
} /* re_search_2 */
4980 #ifdef _LIBC
4981 weak_alias (__re_search_2, re_search_2)
4982 #endif
4983
4984 #endif /* not INSIDE_RECURSION */
4985
4986 #ifdef INSIDE_RECURSION
4987
/* Release VAR and reset it to NULL.  When MATCH_MAY_ALLOCATE is defined
   the release goes through REGEX_FREE and is skipped for a null VAR;
   otherwise VAR is handed to free.

   The expansion is wrapped in `do { ... } while (0)' so that
   `FREE_VAR (x);' is exactly one statement.  Without the wrapper the
   trailing `var = NULL' escapes an unbraced `if'/`else' body.  Every use
   must be followed by a semicolon.  */
#ifdef MATCH_MAY_ALLOCATE
# define FREE_VAR(var)                                                  \
  do {                                                                  \
    if (var)                                                            \
      REGEX_FREE (var);                                                 \
    (var) = NULL;                                                       \
  } while (0)
#else
# define FREE_VAR(var)                                                  \
  do {                                                                  \
    free (var);                                                         \
    (var) = NULL;                                                       \
  } while (0)
#endif
4993
4994 #ifdef WCHAR
/* Size threshold used by re_search_2 in the wide-character build: the
   conversion buffers for an input string larger than this are obtained
   with TALLOC, and with REGEX_TALLOC otherwise.  */
4995 # define MAX_ALLOCA_SIZE 2000
4996
/* Release the wide-character copies (wcs_string1, wcs_string2) and offset
   tables (mbs_offset1, mbs_offset2) of re_search_2.  For each string the
   same size test as at allocation time selects the release routine: `free'
   for buffers obtained with TALLOC, FREE_VAR for those obtained with
   REGEX_TALLOC.  Uses the locals size1 and size2 of the enclosing
   function.  */
4997 # define FREE_WCS_BUFFERS() \
4998 do { \
4999 if (size1 > MAX_ALLOCA_SIZE) \
5000 { \
5001 free (wcs_string1); \
5002 free (mbs_offset1); \
5003 } \
5004 else \
5005 { \
5006 FREE_VAR (wcs_string1); \
5007 FREE_VAR (mbs_offset1); \
5008 } \
5009 if (size2 > MAX_ALLOCA_SIZE) \
5010 { \
5011 free (wcs_string2); \
5012 free (mbs_offset2); \
5013 } \
5014 else \
5015 { \
5016 FREE_VAR (wcs_string2); \
5017 FREE_VAR (mbs_offset2); \
5018 } \
5019 } while (0)
5020
5021 #endif
5022
5023
/* Implementation of re_search_2 for one character width (byte or wide,
   selected by PREFIX/WCHAR).

   Searches the virtual concatenation of STRING1 (SIZE1 bytes) and STRING2
   (SIZE2 bytes) for the pattern in BUFP, trying start positions from
   STARTPOS towards STARTPOS + RANGE (backwards when RANGE is negative).
   When BUFP has a fastmap and the pattern cannot match the empty string,
   positions whose first byte is not in the fastmap are skipped without
   calling the matcher.  REGS and STOP are passed through to the matcher.

   Returns the start position of the first match found, -1 if there is
   none (or STARTPOS is out of range), or -2 on an internal error
   (fastmap compilation, buffer allocation, or the matcher failed).  */
5024 static int
5025 PREFIX(re_search_2) (struct re_pattern_buffer *bufp, const char *string1,
5026 int size1, const char *string2, int size2,
5027 int startpos, int range,
5028 struct re_registers *regs, int stop)
5029 {
5030 int val;
5031 register char *fastmap = bufp->fastmap;
5032 register RE_TRANSLATE_TYPE translate = bufp->translate;
/* NOTE(review): both sums below are plain `int' additions and are not
   checked for overflow.  */
5033 int total_size = size1 + size2;
5034 int endpos = startpos + range;
5035 #ifdef WCHAR
5036 /* We need wchar_t* buffers corresponding to string1, string2. */
5037 wchar_t *wcs_string1 = NULL, *wcs_string2 = NULL;
5038 /* We need the sizes of the wchar_t buffers corresponding to size1, size2. */
5039 int wcs_size1 = 0, wcs_size2 = 0;
5040 /* offset buffer for optimization. See convert_mbs_to_wcs. */
5041 int *mbs_offset1 = NULL, *mbs_offset2 = NULL;
5042 /* They hold whether each wchar_t is binary data or not. */
5043 char *is_binary = NULL;
5044 #endif /* WCHAR */
5045
5046 /* Check for out-of-range STARTPOS. */
5047 if (startpos < 0 || startpos > total_size)
5048 return -1;
5049
5050 /* Fix up RANGE if it might eventually take us outside
5051 the virtual concatenation of STRING1 and STRING2.
5052 Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */
5053 if (endpos < 0)
5054 range = 0 - startpos;
5055 else if (endpos > total_size)
5056 range = total_size - startpos;
5057
5058 /* If the search isn't to be a backwards one, don't waste time in a
5059 search for a pattern that must be anchored. */
5060 if (bufp->used > 0 && range > 0
5061 && ((re_opcode_t) bufp->buffer[0] == begbuf
5062 /* `begline' is like `begbuf' if it cannot match at newlines. */
5063 || ((re_opcode_t) bufp->buffer[0] == begline
5064 && !bufp->newline_anchor)))
5065 {
5066 if (startpos > 0)
5067 return -1;
5068 else
5069 range = 1;
5070 }
5071
5072 #ifdef emacs
5073 /* In a forward search for something that starts with \=.
5074 don't keep searching past point. */
5075 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
5076 {
5077 range = PT - startpos;
5078 if (range <= 0)
5079 return -1;
5080 }
5081 #endif /* emacs */
5082
5083 /* Update the fastmap now if not correct already. */
5084 if (fastmap && !bufp->fastmap_accurate)
5085 if (re_compile_fastmap (bufp) == -2)
5086 return -2;
5087
5088 #ifdef WCHAR
5089 /* Allocate wchar_t array for wcs_string1 and wcs_string2 and
5090 fill them with converted string. Buffers for a string larger than
5091 MAX_ALLOCA_SIZE come from TALLOC and are released with free; the
   others come from REGEX_TALLOC and are released with FREE_VAR. */
5091 if (size1 != 0)
5092 {
5093 if (size1 > MAX_ALLOCA_SIZE)
5094 {
5095 wcs_string1 = TALLOC (size1 + 1, CHAR_T);
5096 mbs_offset1 = TALLOC (size1 + 1, int);
5097 is_binary = TALLOC (size1 + 1, char);
5098 }
5099 else
5100 {
5101 wcs_string1 = REGEX_TALLOC (size1 + 1, CHAR_T);
5102 mbs_offset1 = REGEX_TALLOC (size1 + 1, int);
5103 is_binary = REGEX_TALLOC (size1 + 1, char);
5104 }
5105 if (!wcs_string1 || !mbs_offset1 || !is_binary)
5106 {
5107 if (size1 > MAX_ALLOCA_SIZE)
5108 {
5109 free (wcs_string1);
5110 free (mbs_offset1);
5111 free (is_binary);
5112 }
5113 else
5114 {
5115 FREE_VAR (wcs_string1);
5116 FREE_VAR (mbs_offset1);
5117 FREE_VAR (is_binary);
5118 }
5119 return -2;
5120 }
5121 wcs_size1 = convert_mbs_to_wcs(wcs_string1, string1, size1,
5122 mbs_offset1, is_binary);
5123 wcs_string1[wcs_size1] = L'\0'; /* for a sentinel */
/* is_binary is only needed during the conversion.  */
5124 if (size1 > MAX_ALLOCA_SIZE)
5125 free (is_binary);
5126 else
5127 FREE_VAR (is_binary);
5128 }
5129 if (size2 != 0)
5130 {
5131 if (size2 > MAX_ALLOCA_SIZE)
5132 {
5133 wcs_string2 = TALLOC (size2 + 1, CHAR_T);
5134 mbs_offset2 = TALLOC (size2 + 1, int);
5135 is_binary = TALLOC (size2 + 1, char);
5136 }
5137 else
5138 {
5139 wcs_string2 = REGEX_TALLOC (size2 + 1, CHAR_T);
5140 mbs_offset2 = REGEX_TALLOC (size2 + 1, int);
5141 is_binary = REGEX_TALLOC (size2 + 1, char);
5142 }
5143 if (!wcs_string2 || !mbs_offset2 || !is_binary)
5144 {
/* Also releases the buffers already set up for string1.  */
5145 FREE_WCS_BUFFERS ();
5146 if (size2 > MAX_ALLOCA_SIZE)
5147 free (is_binary);
5148 else
5149 FREE_VAR (is_binary);
5150 return -2;
5151 }
5152 wcs_size2 = convert_mbs_to_wcs(wcs_string2, string2, size2,
5153 mbs_offset2, is_binary);
5154 wcs_string2[wcs_size2] = L'\0'; /* for a sentinel */
5155 if (size2 > MAX_ALLOCA_SIZE)
5156 free (is_binary);
5157 else
5158 FREE_VAR (is_binary);
5159 }
5160 #endif /* WCHAR */
5161
5162
5163 /* Loop through the string, looking for a place to start matching. */
5164 for (;;)
5165 {
5166 /* If a fastmap is supplied, skip quickly over characters that
5167 cannot be the start of a match. If the pattern can match the
5168 null string, however, we don't need to skip characters; we want
5169 the first null string. */
5170 if (fastmap && startpos < total_size && !bufp->can_be_null)
5171 {
5172 if (range > 0) /* Searching forwards. */
5173 {
5174 register const char *d;
5175 register int lim = 0;
5176 int irange = range;
5177
/* When the scan starts in string1 and could run into string2, LIM stops
   it at the end of string1.  */
5178 if (startpos < size1 && startpos + range >= size1)
5179 lim = range - (size1 - startpos);
5180
5181 d = (startpos >= size1 ? string2 - size1 : string1) + startpos;
5182
5183 /* Written out as an if-else to avoid testing `translate'
5184 inside the loop. */
5185 if (translate)
5186 while (range > lim
5187 && !fastmap[(unsigned char)
5188 translate[(unsigned char) *d++]])
5189 range--;
5190 else
5191 while (range > lim && !fastmap[(unsigned char) *d++])
5192 range--;
5193
/* Advance STARTPOS by the number of positions skipped.  */
5194 startpos += irange - range;
5195 }
5196 else /* Searching backwards. */
5197 {
5198 register CHAR_T c = (size1 == 0 || startpos >= size1
5199 ? string2[startpos - size1]
5200 : string1[startpos]);
5201
5202 if (!fastmap[(unsigned char) TRANSLATE (c)])
5203 goto advance;
5204 }
5205 }
5206
5207 /* If can't match the null string, and that's all we have left, fail. */
5208 if (range >= 0 && startpos == total_size && fastmap
5209 && !bufp->can_be_null)
5210 {
5211 #ifdef WCHAR
5212 FREE_WCS_BUFFERS ();
5213 #endif
5214 return -1;
5215 }
5216
5217 #ifdef WCHAR
5218 val = wcs_re_match_2_internal (bufp, string1, size1, string2,
5219 size2, startpos, regs, stop,
5220 wcs_string1, wcs_size1,
5221 wcs_string2, wcs_size2,
5222 mbs_offset1, mbs_offset2);
5223 #else /* BYTE */
5224 val = byte_re_match_2_internal (bufp, string1, size1, string2,
5225 size2, startpos, regs, stop);
5226 #endif /* BYTE */
5227
5228 #ifndef REGEX_MALLOC
5229 # ifdef C_ALLOCA
5230 alloca (0);
5231 # endif
5232 #endif
5233
/* A non-negative result is a match: report where it started.  */
5234 if (val >= 0)
5235 {
5236 #ifdef WCHAR
5237 FREE_WCS_BUFFERS ();
5238 #endif
5239 return startpos;
5240 }
5241
/* -2 from the matcher is an internal error; pass it on.  */
5242 if (val == -2)
5243 {
5244 #ifdef WCHAR
5245 FREE_WCS_BUFFERS ();
5246 #endif
5247 return -2;
5248 }
5249
/* Step to the next candidate start position, in the direction given by
   the sign of RANGE; stop once the range is used up.  */
5250 advance:
5251 if (!range)
5252 break;
5253 else if (range > 0)
5254 {
5255 range--;
5256 startpos++;
5257 }
5258 else
5259 {
5260 range++;
5261 startpos--;
5262 }
5263 }
5264 #ifdef WCHAR
5265 FREE_WCS_BUFFERS ();
5266 #endif
5267 return -1;
5268 }
5269
5270 #ifdef WCHAR
5271 /* This converts PTR, a pointer into one of the search wchar_t strings
5272 `string1' and `string2', into a multibyte string offset from the
5273 beginning of the virtual concatenation. We use mbs_offset to optimize;
5274 see convert_mbs_to_wcs. A null offset table counts as offset 0, and
   `csize1' is added for a pointer into `string2'. */
5275 # define POINTER_TO_OFFSET(ptr) \
5276 (FIRST_STRING_P (ptr) \
5277 ? ((regoff_t)(mbs_offset1 != NULL? mbs_offset1[(ptr)-string1] : 0)) \
5278 : ((regoff_t)((mbs_offset2 != NULL? mbs_offset2[(ptr)-string2] : 0) \
5279 + csize1)))
5280 #else /* BYTE */
5281 /* This converts PTR, a pointer into one of the search strings `string1'
5282 and `string2', into an offset from the beginning of the virtual
   concatenation: `size1' is added for a pointer into `string2'. */
5283 # define POINTER_TO_OFFSET(ptr) \
5284 (FIRST_STRING_P (ptr) \
5285 ? ((regoff_t) ((ptr) - string1)) \
5286 : ((regoff_t) ((ptr) - string2 + size1)))
5287 #endif /* WCHAR */
5288
/* Macros for dealing with the split strings in re_match_2.  */

/* Nonzero while `dend' is the match limit of the first string, i.e.
   while the scan pointer `d' is still travelling through string1.  */
#define MATCHING_IN_FIRST_STRING  (dend == end_match_1)

/* Call before fetching a character with *d.  While `d' sits at the
   end of the current string this either switches over to string2
   (when string1 has been exhausted) or jumps to the `fail' label
   (when string2 has been exhausted as well).

   The body is wrapped in do { ... } while (0) so that `PREFETCH ();'
   is exactly one statement and can be used safely as the body of an
   unbraced if/else.  */
#define PREFETCH()                                                      \
  do                                                                    \
    {                                                                   \
      while (d == dend)                                                 \
        {                                                               \
          /* End of string2 => fail.  */                                \
          if (dend == end_match_2)                                      \
            goto fail;                                                  \
          /* End of string1 => advance to string2.  */                  \
          d = string2;                                                  \
          dend = end_match_2;                                           \
        }                                                               \
    }                                                                   \
  while (0)
5305
/* Boundary tests on the virtual concatenation of `string1' and
   `string2'.  The concatenation starts at string1 when size1 is
   nonzero and at string2 otherwise (a lone string is always kept in
   `string2'); AT_STRINGS_BEG also holds whenever size2 is zero.  The
   concatenation stops at end2.  */
#define AT_STRINGS_BEG(d)                                               \
  ((d) == ((size1 != 0) ? string1 : string2) || size2 == 0)
#define AT_STRINGS_END(d)                                               \
  ((d) == end2)
5310
5311
/* Decide whether the character D designates is a word constituent.
   D is redirected in two situations: when it equals end1 (just past
   string1) the first character of string2 is examined, and when it
   equals string2 - 1 (just before string2) the last character of
   string1 is examined.  Otherwise *D itself is examined.  */
#ifdef WCHAR
/* Use the internationalized API instead of SYNTAX: alphanumerics
   and the underscore count as word constituents.  */
# define WORDCHAR_P(d)                                                  \
  (iswalnum ((wint_t) ((d) == end1                                      \
                       ? *string2                                       \
                       : ((d) == string2 - 1 ? *(end1 - 1) : *(d))))    \
   != 0                                                                 \
   || ((d) == end1                                                      \
       ? *string2                                                       \
       : ((d) == string2 - 1 ? *(end1 - 1) : *(d)))                     \
      == L'_')
#else /* BYTE */
# define WORDCHAR_P(d)                                                  \
  (SYNTAX ((d) == end1                                                  \
           ? *string2                                                   \
           : ((d) == string2 - 1 ? *(end1 - 1) : *(d)))                 \
   == Sword)
#endif /* WCHAR */
5329
/* Disabled due to a compiler bug -- see comment at case wordbound */
#if 0
/* Test if the character before D and the one at D differ with respect
   to being word-constituent.  The strings' outer edges always count
   as boundaries.  D is parenthesized before subtracting so that an
   expression argument keeps its intended grouping.  */
#define AT_WORD_BOUNDARY(d)                                             \
  (AT_STRINGS_BEG (d) || AT_STRINGS_END (d)                             \
   || WORDCHAR_P ((d) - 1) != WORDCHAR_P (d))
#endif
5338
/* Free everything we malloc.

   FREE_VARIABLES comes in four flavours, selected by whether the
   matcher allocates its own register vectors (MATCH_MAY_ALLOCATE) and
   whether this is the wide-character build (WCHAR).  The per-call
   register vectors and the failure stack are released only when
   MATCH_MAY_ALLOCATE is defined.  The wide-character string and
   offset buffers are released only in the WCHAR build, and only when
   `cant_free_wcs_buf' is zero.  */
#if defined MATCH_MAY_ALLOCATE && defined WCHAR
# define FREE_VARIABLES()                                               \
  do {                                                                  \
    REGEX_FREE_STACK (fail_stack.stack);                                \
    FREE_VAR (regstart);                                                \
    FREE_VAR (regend);                                                  \
    FREE_VAR (old_regstart);                                            \
    FREE_VAR (old_regend);                                              \
    FREE_VAR (best_regstart);                                           \
    FREE_VAR (best_regend);                                             \
    FREE_VAR (reg_info);                                                \
    FREE_VAR (reg_dummy);                                               \
    FREE_VAR (reg_info_dummy);                                          \
    if (!cant_free_wcs_buf)                                             \
      {                                                                 \
        FREE_VAR (string1);                                             \
        FREE_VAR (string2);                                             \
        FREE_VAR (mbs_offset1);                                         \
        FREE_VAR (mbs_offset2);                                         \
      }                                                                 \
  } while (0)
#elif defined MATCH_MAY_ALLOCATE /* BYTE */
# define FREE_VARIABLES()                                               \
  do {                                                                  \
    REGEX_FREE_STACK (fail_stack.stack);                                \
    FREE_VAR (regstart);                                                \
    FREE_VAR (regend);                                                  \
    FREE_VAR (old_regstart);                                            \
    FREE_VAR (old_regend);                                              \
    FREE_VAR (best_regstart);                                           \
    FREE_VAR (best_regend);                                             \
    FREE_VAR (reg_info);                                                \
    FREE_VAR (reg_dummy);                                               \
    FREE_VAR (reg_info_dummy);                                          \
  } while (0)
#elif defined WCHAR /* not MATCH_MAY_ALLOCATE */
# define FREE_VARIABLES()                                               \
  do {                                                                  \
    if (!cant_free_wcs_buf)                                             \
      {                                                                 \
        FREE_VAR (string1);                                             \
        FREE_VAR (string2);                                             \
        FREE_VAR (mbs_offset1);                                         \
        FREE_VAR (mbs_offset2);                                         \
      }                                                                 \
  } while (0)
#else /* BYTE, not MATCH_MAY_ALLOCATE */
# define FREE_VARIABLES() ((void)0) /* Do nothing!  But inhibit gcc warning.  */
#endif /* MATCH_MAY_ALLOCATE / WCHAR */
5393
/* Sentinels for "no register is currently active".  They have to
   satisfy several constraints:
     - neither may be a valid register number; register numbers fit in
       one pattern byte (at most 255 registers), so anything above 255
       is safe;
     - the two must differ by exactly 1, because of NUM_FAILURE_ITEMS
       above;
     - the "lowest" sentinel must exceed the "highest" one, so that no
       registers are saved when none are active.  */
#define NO_HIGHEST_ACTIVE_REG   (1 << BYTEWIDTH)
#define NO_LOWEST_ACTIVE_REG    (1 + NO_HIGHEST_ACTIVE_REG)
5403
5404 #else /* not INSIDE_RECURSION */
5406 /* Matching routines. */
5407
5408 #ifndef emacs /* Emacs never uses this. */
/* re_match is the single-string form of re_match_2: the first string
   is passed as empty (NULL, 0), STRING/SIZE take the place of the
   second string, and matching may run up to SIZE.  */

int
re_match (struct re_pattern_buffer *bufp, const char *string,
          int size, int pos, struct re_registers *regs)
{
  int match_result;

# ifdef MBS_SUPPORT
  if (MB_CUR_MAX != 1)
    {
      /* Wide-character matcher; the null buffer arguments ask it to
         set up its own wchar_t copies of the input.  */
      match_result = wcs_re_match_2_internal (bufp, NULL, 0, string, size,
                                              pos, regs, size,
                                              NULL, 0, NULL, 0, NULL, NULL);
    }
  else
# endif
    {
      match_result = byte_re_match_2_internal (bufp, NULL, 0, string, size,
                                               pos, regs, size);
    }

# if !defined REGEX_MALLOC && defined C_ALLOCA
  /* Force an alloca cleanup now that the matcher has returned.  */
  alloca (0);
# endif

  return match_result;
}
5432 # ifdef _LIBC
5433 weak_alias (__re_match, re_match)
5434 # endif
5435 #endif /* not emacs */
5436
5437 #endif /* not INSIDE_RECURSION */
5438
5439 #ifdef INSIDE_RECURSION
5440 static boolean PREFIX(group_match_null_string_p) (UCHAR_T **p,
5441 UCHAR_T *end,
5442 PREFIX(register_info_type) *reg_info);
5443 static boolean PREFIX(alt_match_null_string_p) (UCHAR_T *p,
5444 UCHAR_T *end,
5445 PREFIX(register_info_type) *reg_info);
5446 static boolean PREFIX(common_op_match_null_string_p) (UCHAR_T **p,
5447 UCHAR_T *end,
5448 PREFIX(register_info_type) *reg_info);
5449 static int PREFIX(bcmp_translate) (const CHAR_T *s1, const CHAR_T *s2,
5450 int len, char *translate);
5451 #else /* not INSIDE_RECURSION */
5452
/* re_match_2 matches the compiled pattern in BUFP against the
   (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
   and SIZE2, respectively).  We start matching at POS, and stop
   matching at STOP.

   If REGS is non-null and the `no_sub' field of BUFP is zero, we
   store offsets for the substring each group matched in REGS.  See the
   documentation for exactly how many groups we fill.

   We return -1 if no match, -2 if an internal error (such as the
   failure stack overflowing).  Otherwise, we return the length of the
   matched substring.  */
5465
int
re_match_2 (struct re_pattern_buffer *bufp, const char *string1, int size1,
            const char *string2, int size2, int pos,
            struct re_registers *regs, int stop)
{
  int match_result;

# ifdef MBS_SUPPORT
  if (MB_CUR_MAX != 1)
    {
      /* Wide-character matcher; the null buffer arguments ask it to
         set up its own wchar_t copies of the two strings.  */
      match_result = wcs_re_match_2_internal (bufp, string1, size1,
                                              string2, size2,
                                              pos, regs, stop,
                                              NULL, 0, NULL, 0, NULL, NULL);
    }
  else
# endif
    {
      match_result = byte_re_match_2_internal (bufp, string1, size1,
                                               string2, size2,
                                               pos, regs, stop);
    }

#if !defined REGEX_MALLOC && defined C_ALLOCA
  /* Force an alloca cleanup now that the matcher has returned.  */
  alloca (0);
#endif

  return match_result;
}
5489 #ifdef _LIBC
5490 weak_alias (__re_match_2, re_match_2)
5491 #endif
5492
5493 #endif /* not INSIDE_RECURSION */
5494
5495 #ifdef INSIDE_RECURSION
5496
5497 #ifdef WCHAR
static int count_mbs_length (int *, int);

/* Convert LENGTH, an offset into a multibyte string, into the number
   of wchar_t characters occupied by the substring [0, LENGTH) of that
   string.  OFFSET_BUFFER is the offset table belonging to the string:
   entry I holds the multibyte offset of wide character I, so the
   entries are increasing and offset_buffer[I] >= I for all I.
   See convert_mbs_to_wcs.

   Returns:
     - the index I for which offset_buffer[I] == LENGTH;
     - 0 when OFFSET_BUFFER is NULL;
     - -1 when LENGTH is negative or no entry equals LENGTH.

   OFFSET_BUFFER must have at least LENGTH + 1 entries.  */

static int
count_mbs_length (int *offset_buffer, int length)
{
  int lower, upper;

  /* Check whether the size is valid.  */
  if (length < 0)
    return -1;

  if (offset_buffer == NULL)
    return 0;

  /* If there are no multibyte characters, offset_buffer[i] == i.
     Optimize for this case.  */
  if (offset_buffer[length] == length)
    return length;

  /* Binary search in the open interval (lower, upper).  LENGTH is a
     valid upper bound because offset_buffer[i] >= i for all i.  */
  lower = 0;
  upper = length;

  while (upper - lower > 1)
    {
      /* Written as lower + (upper - lower) / 2: the plain sum
         lower + upper may exceed INT_MAX for large LENGTH.  */
      int middle = lower + (upper - lower) / 2;

      if (offset_buffer[middle] > length)
        upper = middle;
      else if (offset_buffer[middle] < length)
        lower = middle;
      else
        return middle;
    }

  /* No entry equals LENGTH.  */
  return -1;
}
5541 #endif /* WCHAR */
5542
5543 /* This is a separate function so that we can force an alloca cleanup
5544 afterwards. */
5545 #ifdef WCHAR
5546 static int
5547 wcs_re_match_2_internal (struct re_pattern_buffer *bufp,
5548 const char *cstring1, int csize1,
5549 const char *cstring2, int csize2,
5550 int pos,
5551 struct re_registers *regs,
5552 int stop,
                         /* string1 == string2 == NULL means that string1/2,
                            size1/2 and mbs_offset1/2 need setting up in this
                            function.  */
                         /* We need wchar_t* buffers corresponding to cstring1
                            and cstring2.  */
5556 wchar_t *string1, int size1,
5557 wchar_t *string2, int size2,
                         /* Offset buffers for optimization.  See
                            convert_mbs_to_wcs.  */
5559 int *mbs_offset1, int *mbs_offset2)
5560 #else /* BYTE */
5561 static int
5562 byte_re_match_2_internal (struct re_pattern_buffer *bufp,
5563 const char *string1, int size1,
5564 const char *string2, int size2,
5565 int pos,
5566 struct re_registers *regs, int stop)
5567 #endif /* BYTE */
5568 {
5569 /* General temporaries. */
5570 int mcnt;
5571 UCHAR_T *p1;
5572 #ifdef WCHAR
5573 /* They hold whether each wchar_t is binary data or not. */
5574 char *is_binary = NULL;
5575 /* If true, we can't free string1/2, mbs_offset1/2. */
5576 int cant_free_wcs_buf = 1;
5577 #endif /* WCHAR */
5578
5579 /* Just past the end of the corresponding string. */
5580 const CHAR_T *end1, *end2;
5581
5582 /* Pointers into string1 and string2, just past the last characters in
5583 each to consider matching. */
5584 const CHAR_T *end_match_1, *end_match_2;
5585
5586 /* Where we are in the data, and the end of the current string. */
5587 const CHAR_T *d, *dend;
5588
5589 /* Where we are in the pattern, and the end of the pattern. */
5590 #ifdef WCHAR
5591 UCHAR_T *pattern, *p;
5592 register UCHAR_T *pend;
5593 #else /* BYTE */
5594 UCHAR_T *p = bufp->buffer;
5595 register UCHAR_T *pend = p + bufp->used;
5596 #endif /* WCHAR */
5597
5598 /* Mark the opcode just after a start_memory, so we can test for an
5599 empty subpattern when we get to the stop_memory. */
5600 UCHAR_T *just_past_start_mem = 0;
5601
5602 /* We use this to map every character in the string. */
5603 RE_TRANSLATE_TYPE translate = bufp->translate;
5604
  /* Failure point stack.  Each place that can handle a failure further
     down the line pushes a failure point on this stack.  It consists of
     restart, regend, and reg_info for all registers corresponding to
     the subexpressions we're currently inside, plus the number of such
     registers, and, finally, two char *'s.  The first char * is where
     to resume scanning the pattern; the second one is where to resume
     scanning the strings.  If the latter is zero, the failure point is
     a ``dummy''; if a failure happens and the failure point is a dummy,
     it gets discarded and the next one is tried.  */
5614 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
5615 PREFIX(fail_stack_type) fail_stack;
5616 #endif
5617 #ifdef DEBUG
5618 static unsigned failure_id;
5619 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
5620 #endif
5621
5622 #ifdef REL_ALLOC
5623 /* This holds the pointer to the failure stack, when
5624 it is allocated relocatably. */
5625 fail_stack_elt_t *failure_stack_ptr;
5626 #endif
5627
5628 /* We fill all the registers internally, independent of what we
5629 return, for use in backreferences. The number here includes
5630 an element for register zero. */
5631 size_t num_regs = bufp->re_nsub + 1;
5632
5633 /* The currently active registers. */
5634 active_reg_t lowest_active_reg = NO_LOWEST_ACTIVE_REG;
5635 active_reg_t highest_active_reg = NO_HIGHEST_ACTIVE_REG;
5636
5637 /* Information on the contents of registers. These are pointers into
5638 the input strings; they record just what was matched (on this
5639 attempt) by a subexpression part of the pattern, that is, the
5640 regnum-th regstart pointer points to where in the pattern we began
5641 matching and the regnum-th regend points to right after where we
5642 stopped matching the regnum-th subexpression. (The zeroth register
5643 keeps track of what the whole pattern matches.) */
5644 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
5645 const CHAR_T **regstart, **regend;
5646 #endif
5647
5648 /* If a group that's operated upon by a repetition operator fails to
5649 match anything, then the register for its start will need to be
5650 restored because it will have been set to wherever in the string we
5651 are when we last see its open-group operator. Similarly for a
5652 register's end. */
5653 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
5654 const CHAR_T **old_regstart, **old_regend;
5655 #endif
5656
5657 /* The is_active field of reg_info helps us keep track of which (possibly
5658 nested) subexpressions we are currently in. The matched_something
5659 field of reg_info[reg_num] helps us tell whether or not we have
5660 matched any of the pattern so far this time through the reg_num-th
5661 subexpression. These two fields get reset each time through any
5662 loop their register is in. */
5663 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
5664 PREFIX(register_info_type) *reg_info;
5665 #endif
5666
5667 /* The following record the register info as found in the above
5668 variables when we find a match better than any we've seen before.
5669 This happens as we backtrack through the failure points, which in
5670 turn happens only if we have not yet matched the entire string. */
5671 unsigned best_regs_set = false;
5672 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
5673 const CHAR_T **best_regstart, **best_regend;
5674 #endif
5675
5676 /* Logically, this is `best_regend[0]'. But we don't want to have to
5677 allocate space for that if we're not allocating space for anything
5678 else (see below). Also, we never need info about register 0 for
5679 any of the other register vectors, and it seems rather a kludge to
5680 treat `best_regend' differently than the rest. So we keep track of
5681 the end of the best match so far in a separate variable. We
5682 initialize this to NULL so that when we backtrack the first time
5683 and need to test it, it's not garbage. */
5684 const CHAR_T *match_end = NULL;
5685
5686 /* This helps SET_REGS_MATCHED avoid doing redundant work. */
5687 int set_regs_matched_done = 0;
5688
5689 /* Used when we pop values we don't care about. */
5690 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
5691 const CHAR_T **reg_dummy;
5692 PREFIX(register_info_type) *reg_info_dummy;
5693 #endif
5694
5695 #ifdef DEBUG
5696 /* Counts the total number of registers pushed. */
5697 unsigned num_regs_pushed = 0;
5698 #endif
5699
5700 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
5701
5702 INIT_FAIL_STACK ();
5703
5704 #ifdef MATCH_MAY_ALLOCATE
5705 /* Do not bother to initialize all the register variables if there are
5706 no groups in the pattern, as it takes a fair amount of time. If
5707 there are groups, we include space for register 0 (the whole
5708 pattern), even though we never use it, since it simplifies the
5709 array indexing. We should fix this. */
5710 if (bufp->re_nsub)
5711 {
5712 regstart = REGEX_TALLOC (num_regs, const CHAR_T *);
5713 regend = REGEX_TALLOC (num_regs, const CHAR_T *);
5714 old_regstart = REGEX_TALLOC (num_regs, const CHAR_T *);
5715 old_regend = REGEX_TALLOC (num_regs, const CHAR_T *);
5716 best_regstart = REGEX_TALLOC (num_regs, const CHAR_T *);
5717 best_regend = REGEX_TALLOC (num_regs, const CHAR_T *);
5718 reg_info = REGEX_TALLOC (num_regs, PREFIX(register_info_type));
5719 reg_dummy = REGEX_TALLOC (num_regs, const CHAR_T *);
5720 reg_info_dummy = REGEX_TALLOC (num_regs, PREFIX(register_info_type));
5721
5722 if (!(regstart && regend && old_regstart && old_regend && reg_info
5723 && best_regstart && best_regend && reg_dummy && reg_info_dummy))
5724 {
5725 FREE_VARIABLES ();
5726 return -2;
5727 }
5728 }
5729 else
5730 {
5731 /* We must initialize all our variables to NULL, so that
5732 `FREE_VARIABLES' doesn't try to free them. */
5733 regstart = regend = old_regstart = old_regend = best_regstart
5734 = best_regend = reg_dummy = NULL;
5735 reg_info = reg_info_dummy = (PREFIX(register_info_type) *) NULL;
5736 }
5737 #endif /* MATCH_MAY_ALLOCATE */
5738
5739 /* The starting position is bogus. */
5740 #ifdef WCHAR
5741 if (pos < 0 || pos > csize1 + csize2)
5742 #else /* BYTE */
5743 if (pos < 0 || pos > size1 + size2)
5744 #endif
5745 {
5746 FREE_VARIABLES ();
5747 return -1;
5748 }
5749
5750 #ifdef WCHAR
5751 /* Allocate wchar_t array for string1 and string2 and
5752 fill them with converted string. */
5753 if (string1 == NULL && string2 == NULL)
5754 {
5755 /* We need seting up buffers here. */
5756
5757 /* We must free wcs buffers in this function. */
5758 cant_free_wcs_buf = 0;
5759
5760 if (csize1 != 0)
5761 {
5762 string1 = REGEX_TALLOC (csize1 + 1, CHAR_T);
5763 mbs_offset1 = REGEX_TALLOC (csize1 + 1, int);
5764 is_binary = REGEX_TALLOC (csize1 + 1, char);
5765 if (!string1 || !mbs_offset1 || !is_binary)
5766 {
5767 FREE_VAR (string1);
5768 FREE_VAR (mbs_offset1);
5769 FREE_VAR (is_binary);
5770 return -2;
5771 }
5772 }
5773 if (csize2 != 0)
5774 {
5775 string2 = REGEX_TALLOC (csize2 + 1, CHAR_T);
5776 mbs_offset2 = REGEX_TALLOC (csize2 + 1, int);
5777 is_binary = REGEX_TALLOC (csize2 + 1, char);
5778 if (!string2 || !mbs_offset2 || !is_binary)
5779 {
5780 FREE_VAR (string1);
5781 FREE_VAR (mbs_offset1);
5782 FREE_VAR (string2);
5783 FREE_VAR (mbs_offset2);
5784 FREE_VAR (is_binary);
5785 return -2;
5786 }
5787 size2 = convert_mbs_to_wcs(string2, cstring2, csize2,
5788 mbs_offset2, is_binary);
5789 string2[size2] = L'\0'; /* for a sentinel */
5790 FREE_VAR (is_binary);
5791 }
5792 }
5793
5794 /* We need to cast pattern to (wchar_t*), because we casted this compiled
5795 pattern to (char*) in regex_compile. */
5796 p = pattern = (CHAR_T*)bufp->buffer;
5797 pend = (CHAR_T*)(bufp->buffer + bufp->used);
5798
5799 #endif /* WCHAR */
5800
5801 /* Initialize subexpression text positions to -1 to mark ones that no
5802 start_memory/stop_memory has been seen for. Also initialize the
5803 register information struct. */
5804 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++)
5805 {
5806 regstart[mcnt] = regend[mcnt]
5807 = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE;
5808
5809 REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE;
5810 IS_ACTIVE (reg_info[mcnt]) = 0;
5811 MATCHED_SOMETHING (reg_info[mcnt]) = 0;
5812 EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0;
5813 }
5814
5815 /* We move `string1' into `string2' if the latter's empty -- but not if
5816 `string1' is null. */
5817 if (size2 == 0 && string1 != NULL)
5818 {
5819 string2 = string1;
5820 size2 = size1;
5821 string1 = 0;
5822 size1 = 0;
5823 #ifdef WCHAR
5824 mbs_offset2 = mbs_offset1;
5825 csize2 = csize1;
5826 mbs_offset1 = NULL;
5827 csize1 = 0;
5828 #endif
5829 }
5830 end1 = string1 + size1;
5831 end2 = string2 + size2;
5832
5833 /* Compute where to stop matching, within the two strings. */
5834 #ifdef WCHAR
5835 if (stop <= csize1)
5836 {
5837 mcnt = count_mbs_length(mbs_offset1, stop);
5838 end_match_1 = string1 + mcnt;
5839 end_match_2 = string2;
5840 }
5841 else
5842 {
5843 if (stop > csize1 + csize2)
5844 stop = csize1 + csize2;
5845 end_match_1 = end1;
5846 mcnt = count_mbs_length(mbs_offset2, stop-csize1);
5847 end_match_2 = string2 + mcnt;
5848 }
5849 if (mcnt < 0)
5850 { /* count_mbs_length return error. */
5851 FREE_VARIABLES ();
5852 return -1;
5853 }
5854 #else
5855 if (stop <= size1)
5856 {
5857 end_match_1 = string1 + stop;
5858 end_match_2 = string2;
5859 }
5860 else
5861 {
5862 end_match_1 = end1;
5863 end_match_2 = string2 + stop - size1;
5864 }
5865 #endif /* WCHAR */
5866
5867 /* `p' scans through the pattern as `d' scans through the data.
5868 `dend' is the end of the input string that `d' points within. `d'
5869 is advanced into the following input string whenever necessary, but
5870 this happens before fetching; therefore, at the beginning of the
5871 loop, `d' can be pointing at the end of a string, but it cannot
5872 equal `string2'. */
5873 #ifdef WCHAR
5874 if (size1 > 0 && pos <= csize1)
5875 {
5876 mcnt = count_mbs_length(mbs_offset1, pos);
5877 d = string1 + mcnt;
5878 dend = end_match_1;
5879 }
5880 else
5881 {
5882 mcnt = count_mbs_length(mbs_offset2, pos-csize1);
5883 d = string2 + mcnt;
5884 dend = end_match_2;
5885 }
5886
5887 if (mcnt < 0)
5888 { /* count_mbs_length return error. */
5889 FREE_VARIABLES ();
5890 return -1;
5891 }
5892 #else
5893 if (size1 > 0 && pos <= size1)
5894 {
5895 d = string1 + pos;
5896 dend = end_match_1;
5897 }
5898 else
5899 {
5900 d = string2 + pos - size1;
5901 dend = end_match_2;
5902 }
5903 #endif /* WCHAR */
5904
5905 DEBUG_PRINT1 ("The compiled pattern is:\n");
5906 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
5907 DEBUG_PRINT1 ("The string to match is: `");
5908 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
5909 DEBUG_PRINT1 ("'\n");
5910
5911 /* This loops over pattern commands. It exits by returning from the
5912 function if the match is complete, or it drops through if the match
5913 fails at this starting point in the input data. */
5914 for (;;)
5915 {
5916 #ifdef _LIBC
5917 DEBUG_PRINT2 ("\n%p: ", p);
5918 #else
5919 DEBUG_PRINT2 ("\n0x%x: ", p);
5920 #endif
5921
5922 if (p == pend)
5923 { /* End of pattern means we might have succeeded. */
5924 DEBUG_PRINT1 ("end of pattern ... ");
5925
5926 /* If we haven't matched the entire string, and we want the
5927 longest match, try backtracking. */
5928 if (d != end_match_2)
5929 {
5930 /* 1 if this match ends in the same string (string1 or string2)
5931 as the best previous match. */
5932 boolean same_str_p;
5933
5934 /* 1 if this match is the best seen so far. */
5935 boolean best_match_p;
5936
5937 same_str_p = (FIRST_STRING_P (match_end)
5938 == MATCHING_IN_FIRST_STRING);
5939
5940 /* AIX compiler got confused when this was combined
5941 with the previous declaration. */
5942 if (same_str_p)
5943 best_match_p = d > match_end;
5944 else
5945 best_match_p = !MATCHING_IN_FIRST_STRING;
5946
5947 DEBUG_PRINT1 ("backtracking.\n");
5948
5949 if (!FAIL_STACK_EMPTY ())
5950 { /* More failure points to try. */
5951
5952 /* If exceeds best match so far, save it. */
5953 if (!best_regs_set || best_match_p)
5954 {
5955 best_regs_set = true;
5956 match_end = d;
5957
5958 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
5959
5960 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++)
5961 {
5962 best_regstart[mcnt] = regstart[mcnt];
5963 best_regend[mcnt] = regend[mcnt];
5964 }
5965 }
5966 goto fail;
5967 }
5968
5969 /* If no failure points, don't restore garbage. And if
5970 last match is real best match, don't restore second
5971 best one. */
5972 else if (best_regs_set && !best_match_p)
5973 {
5974 restore_best_regs:
5975 /* Restore best match. It may happen that `dend ==
5976 end_match_1' while the restored d is in string2.
5977 For example, the pattern `x.*y.*z' against the
5978 strings `x-' and `y-z-', if the two strings are
5979 not consecutive in memory. */
5980 DEBUG_PRINT1 ("Restoring best registers.\n");
5981
5982 d = match_end;
5983 dend = ((d >= string1 && d <= end1)
5984 ? end_match_1 : end_match_2);
5985
5986 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++)
5987 {
5988 regstart[mcnt] = best_regstart[mcnt];
5989 regend[mcnt] = best_regend[mcnt];
5990 }
5991 }
5992 } /* d != end_match_2 */
5993
5994 succeed_label:
5995 DEBUG_PRINT1 ("Accepting match.\n");
5996 /* If caller wants register contents data back, do it. */
5997 if (regs && !bufp->no_sub)
5998 {
5999 /* Have the register data arrays been allocated? */
6000 if (bufp->regs_allocated == REGS_UNALLOCATED)
6001 { /* No. So allocate them with malloc. We need one
6002 extra element beyond `num_regs' for the `-1' marker
6003 GNU code uses. */
6004 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
6005 regs->start = TALLOC (regs->num_regs, regoff_t);
6006 regs->end = TALLOC (regs->num_regs, regoff_t);
6007 if (regs->start == NULL || regs->end == NULL)
6008 {
6009 FREE_VARIABLES ();
6010 return -2;
6011 }
6012 bufp->regs_allocated = REGS_REALLOCATE;
6013 }
6014 else if (bufp->regs_allocated == REGS_REALLOCATE)
6015 { /* Yes. If we need more elements than were already
6016 allocated, reallocate them. If we need fewer, just
6017 leave it alone. */
6018 if (regs->num_regs < num_regs + 1)
6019 {
6020 regs->num_regs = num_regs + 1;
6021 RETALLOC (regs->start, regs->num_regs, regoff_t);
6022 RETALLOC (regs->end, regs->num_regs, regoff_t);
6023 if (regs->start == NULL || regs->end == NULL)
6024 {
6025 FREE_VARIABLES ();
6026 return -2;
6027 }
6028 }
6029 }
6030 else
6031 {
6032 /* These braces fend off a "empty body in an else-statement"
6033 warning under GCC when assert expands to nothing. */
6034 assert (bufp->regs_allocated == REGS_FIXED);
6035 }
6036
6037 /* Convert the pointer data in `regstart' and `regend' to
6038 indices. Register zero has to be set differently,
6039 since we haven't kept track of any info for it. */
6040 if (regs->num_regs > 0)
6041 {
6042 regs->start[0] = pos;
6043 #ifdef WCHAR
6044 if (MATCHING_IN_FIRST_STRING)
6045 regs->end[0] = mbs_offset1 != NULL ?
6046 mbs_offset1[d-string1] : 0;
6047 else
6048 regs->end[0] = csize1 + (mbs_offset2 != NULL ?
6049 mbs_offset2[d-string2] : 0);
6050 #else
6051 regs->end[0] = (MATCHING_IN_FIRST_STRING
6052 ? ((regoff_t) (d - string1))
6053 : ((regoff_t) (d - string2 + size1)));
6054 #endif /* WCHAR */
6055 }
6056
6057 /* Go through the first `min (num_regs, regs->num_regs)'
6058 registers, since that is all we initialized. */
6059 for (mcnt = 1; (unsigned) mcnt < MIN (num_regs, regs->num_regs);
6060 mcnt++)
6061 {
6062 if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt]))
6063 regs->start[mcnt] = regs->end[mcnt] = -1;
6064 else
6065 {
6066 regs->start[mcnt]
6067 = (regoff_t) POINTER_TO_OFFSET (regstart[mcnt]);
6068 regs->end[mcnt]
6069 = (regoff_t) POINTER_TO_OFFSET (regend[mcnt]);
6070 }
6071 }
6072
6073 /* If the regs structure we return has more elements than
6074 were in the pattern, set the extra elements to -1. If
6075 we (re)allocated the registers, this is the case,
6076 because we always allocate enough to have at least one
6077 -1 at the end. */
6078 for (mcnt = num_regs; (unsigned) mcnt < regs->num_regs; mcnt++)
6079 regs->start[mcnt] = regs->end[mcnt] = -1;
6080 } /* regs && !bufp->no_sub */
6081
6082 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
6083 nfailure_points_pushed, nfailure_points_popped,
6084 nfailure_points_pushed - nfailure_points_popped);
6085 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
6086
6087 #ifdef WCHAR
6088 if (MATCHING_IN_FIRST_STRING)
6089 mcnt = mbs_offset1 != NULL ? mbs_offset1[d-string1] : 0;
6090 else
6091 mcnt = (mbs_offset2 != NULL ? mbs_offset2[d-string2] : 0) +
6092 csize1;
6093 mcnt -= pos;
6094 #else
6095 mcnt = d - pos - (MATCHING_IN_FIRST_STRING
6096 ? string1
6097 : string2 - size1);
6098 #endif /* WCHAR */
6099
6100 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
6101
6102 FREE_VARIABLES ();
6103 return mcnt;
6104 }
6105
6106 /* Otherwise match next pattern command. */
6107 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
6108 {
6109 /* Ignore these. Used to ignore the n of succeed_n's which
6110 currently have n == 0. */
6111 case no_op:
6112 DEBUG_PRINT1 ("EXECUTING no_op.\n");
6113 break;
6114
6115 case succeed:
6116 DEBUG_PRINT1 ("EXECUTING succeed.\n");
6117 goto succeed_label;
6118
6119 /* Match the next n pattern characters exactly. The following
6120 byte in the pattern defines n, and the n bytes after that
6121 are the characters to match. */
6122 case exactn:
6123 #ifdef MBS_SUPPORT
6124 case exactn_bin:
6125 #endif
6126 mcnt = *p++;
6127 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
6128
6129 /* This is written out as an if-else so we don't waste time
6130 testing `translate' inside the loop. */
6131 if (translate)
6132 {
6133 do
6134 {
6135 PREFETCH ();
6136 #ifdef WCHAR
6137 if (*d <= 0xff)
6138 {
6139 if ((UCHAR_T) translate[(unsigned char) *d++]
6140 != (UCHAR_T) *p++)
6141 goto fail;
6142 }
6143 else
6144 {
6145 if (*d++ != (CHAR_T) *p++)
6146 goto fail;
6147 }
6148 #else
6149 if ((UCHAR_T) translate[(unsigned char) *d++]
6150 != (UCHAR_T) *p++)
6151 goto fail;
6152 #endif /* WCHAR */
6153 }
6154 while (--mcnt);
6155 }
6156 else
6157 {
6158 do
6159 {
6160 PREFETCH ();
6161 if (*d++ != (CHAR_T) *p++) goto fail;
6162 }
6163 while (--mcnt);
6164 }
6165 SET_REGS_MATCHED ();
6166 break;
6167
6168
6169 /* Match any character except possibly a newline or a null. */
6170 case anychar:
6171 DEBUG_PRINT1 ("EXECUTING anychar.\n");
6172
6173 PREFETCH ();
6174
6175 if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n')
6176 || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000'))
6177 goto fail;
6178
6179 SET_REGS_MATCHED ();
6180 DEBUG_PRINT2 (" Matched `%ld'.\n", (long int) *d);
6181 d++;
6182 break;
6183
6184
6185 case charset:
6186 case charset_not:
6187 {
6188 register UCHAR_T c;
6189 #ifdef WCHAR
6190 unsigned int i, char_class_length, coll_symbol_length,
6191 equiv_class_length, ranges_length, chars_length, length;
6192 CHAR_T *workp, *workp2, *charset_top;
6193 #define WORK_BUFFER_SIZE 128
6194 CHAR_T str_buf[WORK_BUFFER_SIZE];
6195 # ifdef _LIBC
6196 uint32_t nrules;
6197 # endif /* _LIBC */
6198 #endif /* WCHAR */
6199 boolean negate = (re_opcode_t) *(p - 1) == charset_not;
6200
6201 DEBUG_PRINT2 ("EXECUTING charset%s.\n", negate ? "_not" : "");
6202 PREFETCH ();
6203 c = TRANSLATE (*d); /* The character to match. */
6204 #ifdef WCHAR
6205 # ifdef _LIBC
6206 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
6207 # endif /* _LIBC */
6208 charset_top = p - 1;
6209 char_class_length = *p++;
6210 coll_symbol_length = *p++;
6211 equiv_class_length = *p++;
6212 ranges_length = *p++;
6213 chars_length = *p++;
6214 /* p points charset[6], so the address of the next instruction
6215 (charset[l+m+n+2o+k+p']) equals p[l+m+n+2*o+p'],
6216 where l=length of char_classes, m=length of collating_symbol,
6217 n=equivalence_class, o=length of char_range,
6218 p'=length of character. */
6219 workp = p;
6220 /* Update p to indicate the next instruction. */
6221 p += char_class_length + coll_symbol_length+ equiv_class_length +
6222 2*ranges_length + chars_length;
6223
6224 /* match with char_class? */
6225 for (i = 0; i < char_class_length ; i += CHAR_CLASS_SIZE)
6226 {
6227 wctype_t wctype;
6228 uintptr_t alignedp = ((uintptr_t)workp
6229 + __alignof__(wctype_t) - 1)
6230 & ~(uintptr_t)(__alignof__(wctype_t) - 1);
6231 wctype = *((wctype_t*)alignedp);
6232 workp += CHAR_CLASS_SIZE;
6233 # ifdef _LIBC
6234 if (__iswctype((wint_t)c, wctype))
6235 goto char_set_matched;
6236 # else
6237 if (iswctype((wint_t)c, wctype))
6238 goto char_set_matched;
6239 # endif
6240 }
6241
6242 /* match with collating_symbol? */
6243 # ifdef _LIBC
6244 if (nrules != 0)
6245 {
6246 const unsigned char *extra = (const unsigned char *)
6247 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
6248
6249 for (workp2 = workp + coll_symbol_length ; workp < workp2 ;
6250 workp++)
6251 {
6252 int32_t *wextra;
6253 wextra = (int32_t*)(extra + *workp++);
6254 for (i = 0; i < *wextra; ++i)
6255 if (TRANSLATE(d[i]) != wextra[1 + i])
6256 break;
6257
6258 if (i == *wextra)
6259 {
6260 /* Update d; since d will be incremented again after
6261 char_set_matched:, we subtract one here. */
6262 d += i - 1;
6263 goto char_set_matched;
6264 }
6265 }
6266 }
6267 else /* (nrules == 0) */
6268 # endif
6269 /* If we can't look up collation data, we use wcscoll
6270 instead. */
6271 {
6272 for (workp2 = workp + coll_symbol_length ; workp < workp2 ;)
6273 {
6274 const CHAR_T *backup_d = d, *backup_dend = dend;
6275 # ifdef _LIBC
6276 length = __wcslen (workp);
6277 # else
6278 length = wcslen (workp);
6279 # endif
6280
6281 /* If wcscoll(the collating symbol, whole string) > 0,
6282 no substring of the string can ever match the
6283 collating symbol. */
6284 # ifdef _LIBC
6285 if (__wcscoll (workp, d) > 0)
6286 # else
6287 if (wcscoll (workp, d) > 0)
6288 # endif
6289 {
6290 workp += length + 1;
6291 continue;
6292 }
6293
6294 /* First, we compare the collating symbol with
6295 the first character of the string.
6296 If it doesn't match, we add the next character to
6297 the compare buffer in turn. */
6298 for (i = 0 ; i < WORK_BUFFER_SIZE-1 ; i++, d++)
6299 {
6300 int match;
6301 if (d == dend)
6302 {
6303 if (dend == end_match_2)
6304 break;
6305 d = string2;
6306 dend = end_match_2;
6307 }
6308
6309 /* add next character to the compare buffer. */
6310 str_buf[i] = TRANSLATE(*d);
6311 str_buf[i+1] = '\0';
6312
6313 # ifdef _LIBC
6314 match = __wcscoll (workp, str_buf);
6315 # else
6316 match = wcscoll (workp, str_buf);
6317 # endif
6318 if (match == 0)
6319 goto char_set_matched;
6320
6321 if (match < 0)
6322 /* (str_buf > workp) implies (str_buf + X > workp),
6323 because for all X (str_buf + X > str_buf).
6324 So we don't need to continue this loop. */
6325 break;
6326
6327 /* Otherwise (str_buf < workp),
6328 (str_buf+next_character) may equal (workp).
6329 So we continue this loop. */
6330 }
6331 /* not matched */
6332 d = backup_d;
6333 dend = backup_dend;
6334 workp += length + 1;
6335 }
6336 }
6337 /* match with equivalence_class? */
6338 # ifdef _LIBC
6339 if (nrules != 0)
6340 {
6341 const CHAR_T *backup_d = d, *backup_dend = dend;
6342 /* Try to match the equivalence class against
6343 those known to the collate implementation. */
6344 const int32_t *table;
6345 const int32_t *weights;
6346 const int32_t *extra;
6347 const int32_t *indirect;
6348 int32_t idx, idx2;
6349 wint_t *cp;
6350 size_t len;
6351
6352 /* This #include defines a local function! */
6353 # include <locale/weightwc.h>
6354
6355 table = (const int32_t *)
6356 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEWC);
6357 weights = (const wint_t *)
6358 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTWC);
6359 extra = (const wint_t *)
6360 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAWC);
6361 indirect = (const int32_t *)
6362 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTWC);
6363
6364 /* Write 1 collating element to str_buf, and
6365 get its index. */
6366 idx2 = 0;
6367
6368 for (i = 0 ; idx2 == 0 && i < WORK_BUFFER_SIZE - 1; i++)
6369 {
6370 cp = (wint_t*)str_buf;
6371 if (d == dend)
6372 {
6373 if (dend == end_match_2)
6374 break;
6375 d = string2;
6376 dend = end_match_2;
6377 }
6378 str_buf[i] = TRANSLATE(*(d+i));
6379 str_buf[i+1] = '\0'; /* sentinel */
6380 idx2 = findidx ((const wint_t**)&cp);
6381 }
6382
6383 /* Update d; since d will be incremented again after
6384 char_set_matched:, we subtract one here. */
6385 d = backup_d + ((wchar_t*)cp - (wchar_t*)str_buf - 1);
6386 if (d >= dend)
6387 {
6388 if (dend == end_match_2)
6389 d = dend;
6390 else
6391 {
6392 d = string2;
6393 dend = end_match_2;
6394 }
6395 }
6396
6397 len = weights[idx2];
6398
6399 for (workp2 = workp + equiv_class_length ; workp < workp2 ;
6400 workp++)
6401 {
6402 idx = (int32_t)*workp;
6403 /* We already checked idx != 0 in regex_compile. */
6404
6405 if (idx2 != 0 && len == weights[idx])
6406 {
6407 int cnt = 0;
6408 while (cnt < len && (weights[idx + 1 + cnt]
6409 == weights[idx2 + 1 + cnt]))
6410 ++cnt;
6411
6412 if (cnt == len)
6413 goto char_set_matched;
6414 }
6415 }
6416 /* not matched */
6417 d = backup_d;
6418 dend = backup_dend;
6419 }
6420 else /* (nrules == 0) */
6421 # endif
6422 /* If we can't look up collation data, we use wcscoll
6423 instead. */
6424 {
6425 for (workp2 = workp + equiv_class_length ; workp < workp2 ;)
6426 {
6427 const CHAR_T *backup_d = d, *backup_dend = dend;
6428 # ifdef _LIBC
6429 length = __wcslen (workp);
6430 # else
6431 length = wcslen (workp);
6432 # endif
6433
6434 /* If wcscoll(the equivalence class, whole string) > 0,
6435 no substring of the string can ever match the
6436 equivalence class. */
6437 # ifdef _LIBC
6438 if (__wcscoll (workp, d) > 0)
6439 # else
6440 if (wcscoll (workp, d) > 0)
6441 # endif
6442 {
6443 workp += length + 1;
6444 break;
6445 }
6446
6447 /* First, we compare the equivalence class with
6448 the first character of the string.
6449 If it doesn't match, we add the next character to
6450 the compare buffer in turn. */
6451 for (i = 0 ; i < WORK_BUFFER_SIZE - 1 ; i++, d++)
6452 {
6453 int match;
6454 if (d == dend)
6455 {
6456 if (dend == end_match_2)
6457 break;
6458 d = string2;
6459 dend = end_match_2;
6460 }
6461
6462 /* add next character to the compare buffer. */
6463 str_buf[i] = TRANSLATE(*d);
6464 str_buf[i+1] = '\0';
6465
6466 # ifdef _LIBC
6467 match = __wcscoll (workp, str_buf);
6468 # else
6469 match = wcscoll (workp, str_buf);
6470 # endif
6471
6472 if (match == 0)
6473 goto char_set_matched;
6474
6475 if (match < 0)
6476 /* (str_buf > workp) implies (str_buf + X > workp),
6477 because for all X (str_buf + X > str_buf).
6478 So we don't need to continue this loop. */
6479 break;
6480
6481 /* Otherwise (str_buf < workp),
6482 (str_buf+next_character) may equal (workp).
6483 So we continue this loop. */
6484 }
6485 /* not matched */
6486 d = backup_d;
6487 dend = backup_dend;
6488 workp += length + 1;
6489 }
6490 }
6491
6492 /* match with char_range? */
6493 # ifdef _LIBC
6494 if (nrules != 0)
6495 {
6496 uint32_t collseqval;
6497 const char *collseq = (const char *)
6498 _NL_CURRENT(LC_COLLATE, _NL_COLLATE_COLLSEQWC);
6499
6500 collseqval = collseq_table_lookup (collseq, c);
6501
6502 for (; workp < p - chars_length ;)
6503 {
6504 uint32_t start_val, end_val;
6505
6506 /* The collation sequence values of the range endpoints
6507 (characters or collating symbols) were already computed. */
6508 start_val = (uint32_t) *workp++; /* range_start */
6509 end_val = (uint32_t) *workp++; /* range_end */
6510
6511 if (start_val <= collseqval && collseqval <= end_val)
6512 goto char_set_matched;
6513 }
6514 }
6515 else
6516 # endif
6517 {
6518 /* We put range_start_char at str_buf[0], range_end_char
6519 at str_buf[4], and the character being compared at str_buf[2]. */
6520 str_buf[1] = 0;
6521 str_buf[2] = c;
6522 str_buf[3] = 0;
6523 str_buf[5] = 0;
6524 for (; workp < p - chars_length ;)
6525 {
6526 wchar_t *range_start_char, *range_end_char;
6527
6528 /* match if (range_start_char <= c <= range_end_char). */
6529
6530 /* If range_start(or end) < 0, we assume -range_start(end)
6531 is the offset of the collating symbol which is specified
6532 as the character of the range start(end). */
6533
6534 /* range_start */
6535 if (*workp < 0)
6536 range_start_char = charset_top - (*workp++);
6537 else
6538 {
6539 str_buf[0] = *workp++;
6540 range_start_char = str_buf;
6541 }
6542
6543 /* range_end */
6544 if (*workp < 0)
6545 range_end_char = charset_top - (*workp++);
6546 else
6547 {
6548 str_buf[4] = *workp++;
6549 range_end_char = str_buf + 4;
6550 }
6551
6552 # ifdef _LIBC
6553 if (__wcscoll (range_start_char, str_buf+2) <= 0
6554 && __wcscoll (str_buf+2, range_end_char) <= 0)
6555 # else
6556 if (wcscoll (range_start_char, str_buf+2) <= 0
6557 && wcscoll (str_buf+2, range_end_char) <= 0)
6558 # endif
6559 goto char_set_matched;
6560 }
6561 }
6562
6563 /* match with char? */
6564 for (; workp < p ; workp++)
6565 if (c == *workp)
6566 goto char_set_matched;
6567
6568 negate = !negate;
6569
6570 char_set_matched:
6571 if (negate) goto fail;
6572 #else
6573 /* Cast to `unsigned' instead of `unsigned char' in case the
6574 bit list is a full 32 bytes long. */
6575 if (c < (unsigned) (*p * BYTEWIDTH)
6576 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
6577 negate = !negate;
6578
6579 p += 1 + *p;
6580
6581 if (!negate) goto fail;
6582 #undef WORK_BUFFER_SIZE
6583 #endif /* WCHAR */
6584 SET_REGS_MATCHED ();
6585 d++;
6586 break;
6587 }
6588
6589
6590 /* The beginning of a group is represented by start_memory.
6591 The arguments are the register number in the next byte, and the
6592 number of groups inner to this one in the next. The text
6593 matched within the group is recorded (in the internal
6594 registers data structure) under the register number. */
6595 case start_memory:
6596 DEBUG_PRINT3 ("EXECUTING start_memory %ld (%ld):\n",
6597 (long int) *p, (long int) p[1]);
6598
6599 /* Find out if this group can match the empty string. */
6600 p1 = p; /* To send to group_match_null_string_p. */
6601
6602 if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE)
6603 REG_MATCH_NULL_STRING_P (reg_info[*p])
6604 = PREFIX(group_match_null_string_p) (&p1, pend, reg_info);
6605
6606 /* Save the position in the string where we were the last time
6607 we were at this open-group operator in case the group is
6608 operated upon by a repetition operator, e.g., with `(a*)*b'
6609 against `ab'; then we want to ignore where we are now in
6610 the string in case this attempt to match fails. */
6611 old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
6612 ? REG_UNSET (regstart[*p]) ? d : regstart[*p]
6613 : regstart[*p];
6614 DEBUG_PRINT2 (" old_regstart: %d\n",
6615 POINTER_TO_OFFSET (old_regstart[*p]));
6616
6617 regstart[*p] = d;
6618 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
6619
6620 IS_ACTIVE (reg_info[*p]) = 1;
6621 MATCHED_SOMETHING (reg_info[*p]) = 0;
6622
6623 /* Clear this whenever we change the register activity status. */
6624 set_regs_matched_done = 0;
6625
6626 /* This is the new highest active register. */
6627 highest_active_reg = *p;
6628
6629 /* If nothing was active before, this is the new lowest active
6630 register. */
6631 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
6632 lowest_active_reg = *p;
6633
6634 /* Move past the register number and inner group count. */
6635 p += 2;
6636 just_past_start_mem = p;
6637
6638 break;
6639
6640
6641 /* The stop_memory opcode represents the end of a group. Its
6642 arguments are the same as start_memory's: the register
6643 number, and the number of inner groups. */
6644 case stop_memory:
6645 DEBUG_PRINT3 ("EXECUTING stop_memory %ld (%ld):\n",
6646 (long int) *p, (long int) p[1]);
6647
6648 /* We need to save the string position the last time we were at
6649 this close-group operator in case the group is operated
6650 upon by a repetition operator, e.g., with `((a*)*(b*)*)*'
6651 against `aba'; then we want to ignore where we are now in
6652 the string in case this attempt to match fails. */
6653 old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
6654 ? REG_UNSET (regend[*p]) ? d : regend[*p]
6655 : regend[*p];
6656 DEBUG_PRINT2 (" old_regend: %d\n",
6657 POINTER_TO_OFFSET (old_regend[*p]));
6658
6659 regend[*p] = d;
6660 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
6661
6662 /* This register isn't active anymore. */
6663 IS_ACTIVE (reg_info[*p]) = 0;
6664
6665 /* Clear this whenever we change the register activity status. */
6666 set_regs_matched_done = 0;
6667
6668 /* If this was the only register active, nothing is active
6669 anymore. */
6670 if (lowest_active_reg == highest_active_reg)
6671 {
6672 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
6673 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
6674 }
6675 else
6676 { /* We must scan for the new highest active register, since
6677 it isn't necessarily one less than now: consider
6678 (a(b)c(d(e)f)g). When group 3 ends, after the f), the
6679 new highest active register is 1. */
6680 UCHAR_T r = *p - 1;
6681 while (r > 0 && !IS_ACTIVE (reg_info[r]))
6682 r--;
6683
6684 /* If we end up at register zero, that means that we saved
6685 the registers as the result of an `on_failure_jump', not
6686 a `start_memory', and we jumped to past the innermost
6687 `stop_memory'. For example, in ((.)*) we save
6688 registers 1 and 2 as a result of the *, but when we pop
6689 back to the second ), we are at the stop_memory 1.
6690 Thus, nothing is active. */
6691 if (r == 0)
6692 {
6693 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
6694 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
6695 }
6696 else
6697 highest_active_reg = r;
6698 }
6699
6700 /* If just failed to match something this time around with a
6701 group that's operated on by a repetition operator, try to
6702 force exit from the ``loop'', and restore the register
6703 information for this group that we had before trying this
6704 last match. */
6705 if ((!MATCHED_SOMETHING (reg_info[*p])
6706 || just_past_start_mem == p - 1)
6707 && (p + 2) < pend)
6708 {
6709 boolean is_a_jump_n = false;
6710
6711 p1 = p + 2;
6712 mcnt = 0;
6713 switch ((re_opcode_t) *p1++)
6714 {
6715 case jump_n:
6716 is_a_jump_n = true;
6717 case pop_failure_jump:
6718 case maybe_pop_jump:
6719 case jump:
6720 case dummy_failure_jump:
6721 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6722 if (is_a_jump_n)
6723 p1 += OFFSET_ADDRESS_SIZE;
6724 break;
6725
6726 default:
6727 /* do nothing */ ;
6728 }
6729 p1 += mcnt;
6730
6731 /* If the next operation is a jump backwards in the pattern
6732 to an on_failure_jump right before the start_memory
6733 corresponding to this stop_memory, exit from the loop
6734 by forcing a failure after pushing on the stack the
6735 on_failure_jump's jump in the pattern, and d. */
6736 if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump
6737 && (re_opcode_t) p1[1+OFFSET_ADDRESS_SIZE] == start_memory
6738 && p1[2+OFFSET_ADDRESS_SIZE] == *p)
6739 {
6740 /* If this group ever matched anything, then restore
6741 what its registers were before trying this last
6742 failed match, e.g., with `(a*)*b' against `ab' for
6743 regstart[1], and, e.g., with `((a*)*(b*)*)*'
6744 against `aba' for regend[3].
6745
6746 Also restore the registers for inner groups for,
6747 e.g., `((a*)(b*))*' against `aba' (register 3 would
6748 otherwise get trashed). */
6749
6750 if (EVER_MATCHED_SOMETHING (reg_info[*p]))
6751 {
6752 unsigned r;
6753
6754 EVER_MATCHED_SOMETHING (reg_info[*p]) = 0;
6755
6756 /* Restore this and inner groups' (if any) registers. */
6757 for (r = *p; r < (unsigned) *p + (unsigned) *(p + 1);
6758 r++)
6759 {
6760 regstart[r] = old_regstart[r];
6761
6762 /* xx why this test? */
6763 if (old_regend[r] >= regstart[r])
6764 regend[r] = old_regend[r];
6765 }
6766 }
6767 p1++;
6768 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6769 PUSH_FAILURE_POINT (p1 + mcnt, d, -2);
6770
6771 goto fail;
6772 }
6773 }
6774
6775 /* Move past the register number and the inner group count. */
6776 p += 2;
6777 break;
6778
6779
6780 /* \<digit> has been turned into a `duplicate' command which is
6781 followed by the numeric value of <digit> as the register number. */
6782 case duplicate:
6783 {
6784 register const CHAR_T *d2, *dend2;
6785 int regno = *p++; /* Get which register to match against. */
6786 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
6787
6788 /* Can't back reference a group which we've never matched. */
6789 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
6790 goto fail;
6791
6792 /* Where in input to try to start matching. */
6793 d2 = regstart[regno];
6794
6795 /* Where to stop matching; if both the place to start and
6796 the place to stop matching are in the same string, then
6797 set to the place to stop, otherwise, for now have to use
6798 the end of the first string. */
6799
6800 dend2 = ((FIRST_STRING_P (regstart[regno])
6801 == FIRST_STRING_P (regend[regno]))
6802 ? regend[regno] : end_match_1);
6803 for (;;)
6804 {
6805 /* If necessary, advance to next segment in register
6806 contents. */
6807 while (d2 == dend2)
6808 {
6809 if (dend2 == end_match_2) break;
6810 if (dend2 == regend[regno]) break;
6811
6812 /* End of string1 => advance to string2. */
6813 d2 = string2;
6814 dend2 = regend[regno];
6815 }
6816 /* At end of register contents => success */
6817 if (d2 == dend2) break;
6818
6819 /* If necessary, advance to next segment in data. */
6820 PREFETCH ();
6821
6822 /* How many characters left in this segment to match. */
6823 mcnt = dend - d;
6824
6825 /* Want how many consecutive characters we can match in
6826 one shot, so, if necessary, adjust the count. */
6827 if (mcnt > dend2 - d2)
6828 mcnt = dend2 - d2;
6829
6830 /* Compare that many; failure if mismatch, else move
6831 past them. */
6832 if (translate
6833 ? PREFIX(bcmp_translate) (d, d2, mcnt, translate)
6834 : memcmp (d, d2, mcnt*sizeof(UCHAR_T)))
6835 goto fail;
6836 d += mcnt, d2 += mcnt;
6837
6838 /* Do this because we've matched some characters. */
6839 SET_REGS_MATCHED ();
6840 }
6841 }
6842 break;
6843
6844
6845 /* begline matches the empty string at the beginning of the string
6846 (unless `not_bol' is set in `bufp'), and, if
6847 `newline_anchor' is set, after newlines. */
6848 case begline:
6849 DEBUG_PRINT1 ("EXECUTING begline.\n");
6850
6851 if (AT_STRINGS_BEG (d))
6852 {
6853 if (!bufp->not_bol) break;
6854 }
6855 else if (d[-1] == '\n' && bufp->newline_anchor)
6856 {
6857 break;
6858 }
6859 /* In all other cases, we fail. */
6860 goto fail;
6861
6862
6863 /* endline is the dual of begline. */
6864 case endline:
6865 DEBUG_PRINT1 ("EXECUTING endline.\n");
6866
6867 if (AT_STRINGS_END (d))
6868 {
6869 if (!bufp->not_eol) break;
6870 }
6871
6872 /* We have to ``prefetch'' the next character. */
6873 else if ((d == end1 ? *string2 : *d) == '\n'
6874 && bufp->newline_anchor)
6875 {
6876 break;
6877 }
6878 goto fail;
6879
6880
6881 /* Match at the very beginning of the data. */
6882 case begbuf:
6883 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
6884 if (AT_STRINGS_BEG (d))
6885 break;
6886 goto fail;
6887
6888
6889 /* Match at the very end of the data. */
6890 case endbuf:
6891 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
6892 if (AT_STRINGS_END (d))
6893 break;
6894 goto fail;
6895
6896
6897 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
6898 pushes NULL as the value for the string on the stack. Then
6899 `pop_failure_point' will keep the current value for the
6900 string, instead of restoring it. To see why, consider
6901 matching `foo\nbar' against `.*\n'. The .* matches the foo;
6902 then the . fails against the \n. But the next thing we want
6903 to do is match the \n against the \n; if we restored the
6904 string value, we would be back at the foo.
6905
6906 Because this is used only in specific cases, we don't need to
6907 check all the things that `on_failure_jump' does, to make
6908 sure the right things get saved on the stack. Hence we don't
6909 share its code. The only reason to push anything on the
6910 stack at all is that otherwise we would have to change
6911 `anychar's code to do something besides goto fail in this
6912 case; that seems worse than this. */
6913 case on_failure_keep_string_jump:
6914 DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump");
6915
6916 EXTRACT_NUMBER_AND_INCR (mcnt, p);
6917 #ifdef _LIBC
6918 DEBUG_PRINT3 (" %d (to %p):\n", mcnt, p + mcnt);
6919 #else
6920 DEBUG_PRINT3 (" %d (to 0x%x):\n", mcnt, p + mcnt);
6921 #endif
6922
6923 PUSH_FAILURE_POINT (p + mcnt, NULL, -2);
6924 break;
6925
6926
6927 /* Uses of on_failure_jump:
6928
6929 Each alternative starts with an on_failure_jump that points
6930 to the beginning of the next alternative. Each alternative
6931 except the last ends with a jump that in effect jumps past
6932 the rest of the alternatives. (They really jump to the
6933 ending jump of the following alternative, because tensioning
6934 these jumps is a hassle.)
6935
6936 Repeats start with an on_failure_jump that points past both
6937 the repetition text and either the following jump or
6938 pop_failure_jump back to this on_failure_jump. */
6939 case on_failure_jump:
6940 on_failure:
6941 DEBUG_PRINT1 ("EXECUTING on_failure_jump");
6942
6943 EXTRACT_NUMBER_AND_INCR (mcnt, p);
6944 #ifdef _LIBC
6945 DEBUG_PRINT3 (" %d (to %p)", mcnt, p + mcnt);
6946 #else
6947 DEBUG_PRINT3 (" %d (to 0x%x)", mcnt, p + mcnt);
6948 #endif
6949
6950 /* If this on_failure_jump comes right before a group (i.e.,
6951 the original * applied to a group), save the information
6952 for that group and all inner ones, so that if we fail back
6953 to this point, the group's information will be correct.
6954 For example, in \(a*\)*\1, we need the preceding group,
6955 and in \(zz\(a*\)b*\)\2, we need the inner group. */
6956
6957 /* We can't use `p' to check ahead because we push
6958 a failure point to `p + mcnt' after we do this. */
6959 p1 = p;
6960
6961 /* We need to skip no_op's before we look for the
6962 start_memory in case this on_failure_jump is happening as
6963 the result of a completed succeed_n, as in \(a\)\{1,3\}b\1
6964 against aba. */
6965 while (p1 < pend && (re_opcode_t) *p1 == no_op)
6966 p1++;
6967
6968 if (p1 < pend && (re_opcode_t) *p1 == start_memory)
6969 {
6970 /* We have a new highest active register now. This will
6971 get reset at the start_memory we are about to get to,
6972 but we will have saved all the registers relevant to
6973 this repetition op, as described above. */
6974 highest_active_reg = *(p1 + 1) + *(p1 + 2);
6975 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
6976 lowest_active_reg = *(p1 + 1);
6977 }
6978
6979 DEBUG_PRINT1 (":\n");
6980 PUSH_FAILURE_POINT (p + mcnt, d, -2);
6981 break;
6982
6983
6984 /* A smart repeat ends with `maybe_pop_jump'.
6985 We change it to either `pop_failure_jump' or `jump'. */
6986 case maybe_pop_jump:
6987 EXTRACT_NUMBER_AND_INCR (mcnt, p);
6988 DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt);
6989 {
6990 register UCHAR_T *p2 = p;
6991
6992 /* Compare the beginning of the repeat with what in the
6993 pattern follows its end. If we can establish that there
6994 is nothing that they would both match, i.e., nothing that
6995 would force us to backtrack (as there is in, e.g., `a*a'),
6996 then we can change to pop_failure_jump, because we'll
6997 never have to backtrack.
6998
6999 This is not true in the case of alternatives: in
7000 `(a|ab)*' we do need to backtrack to the `ab' alternative
7001 (e.g., if the string was `ab'). But instead of trying to
7002 detect that here, the alternative has put on a dummy
7003 failure point which is what we will end up popping. */
7004
7005 /* Skip over open/close-group commands.
7006 If what follows this loop is a ...+ construct,
7007 look at what begins its body, since we will have to
7008 match at least one of that. */
7009 while (1)
7010 {
7011 if (p2 + 2 < pend
7012 && ((re_opcode_t) *p2 == stop_memory
7013 || (re_opcode_t) *p2 == start_memory))
7014 p2 += 3;
7015 else if (p2 + 2 + 2 * OFFSET_ADDRESS_SIZE < pend
7016 && (re_opcode_t) *p2 == dummy_failure_jump)
7017 p2 += 2 + 2 * OFFSET_ADDRESS_SIZE;
7018 else
7019 break;
7020 }
7021
7022 p1 = p + mcnt;
7023 /* p1[0] ... p1[OFFSET_ADDRESS_SIZE] are the `on_failure_jump'
7024 corresponding to the `maybe_pop_jump' of this case. Examine what
7025 follows. */
7026
7027 /* If we're at the end of the pattern, we can change. */
7028 if (p2 == pend)
7029 {
7030 /* Consider what happens when matching ":\(.*\)"
7031 against ":/". I don't really understand this code
7032 yet. */
7033 p[-(1+OFFSET_ADDRESS_SIZE)] = (UCHAR_T)
7034 pop_failure_jump;
7035 DEBUG_PRINT1
7036 (" End of pattern: change to `pop_failure_jump'.\n");
7037 }
7038
7039 else if ((re_opcode_t) *p2 == exactn
7040 #ifdef MBS_SUPPORT
7041 || (re_opcode_t) *p2 == exactn_bin
7042 #endif
7043 || (bufp->newline_anchor && (re_opcode_t) *p2 == endline))
7044 {
7045 register UCHAR_T c
7046 = *p2 == (UCHAR_T) endline ? '\n' : p2[2];
7047
7048 if (((re_opcode_t) p1[1+OFFSET_ADDRESS_SIZE] == exactn
7049 #ifdef MBS_SUPPORT
7050 || (re_opcode_t) p1[1+OFFSET_ADDRESS_SIZE] == exactn_bin
7051 #endif
7052 ) && p1[3+OFFSET_ADDRESS_SIZE] != c)
7053 {
7054 p[-(1+OFFSET_ADDRESS_SIZE)] = (UCHAR_T)
7055 pop_failure_jump;
7056 #ifdef WCHAR
7057 DEBUG_PRINT3 (" %C != %C => pop_failure_jump.\n",
7058 (wint_t) c,
7059 (wint_t) p1[3+OFFSET_ADDRESS_SIZE]);
7060 #else
7061 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
7062 (char) c,
7063 (char) p1[3+OFFSET_ADDRESS_SIZE]);
7064 #endif
7065 }
7066
7067 #ifndef WCHAR
7068 else if ((re_opcode_t) p1[3] == charset
7069 || (re_opcode_t) p1[3] == charset_not)
7070 {
7071 int negate = (re_opcode_t) p1[3] == charset_not;
7072
7073 if (c < (unsigned) (p1[4] * BYTEWIDTH)
7074 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
7075 negate = !negate;
7076
7077 /* `negate' is equal to 1 if c would match, which means
7078 that we can't change to pop_failure_jump. */
7079 if (!negate)
7080 {
7081 p[-3] = (unsigned char) pop_failure_jump;
7082 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
7083 }
7084 }
7085 #endif /* not WCHAR */
7086 }
7087 #ifndef WCHAR
7088 else if ((re_opcode_t) *p2 == charset)
7089 {
7090 /* We win if the first character of the loop is not part
7091 of the charset. */
7092 if ((re_opcode_t) p1[3] == exactn
7093 && ! ((int) p2[1] * BYTEWIDTH > (int) p1[5]
7094 && (p2[2 + p1[5] / BYTEWIDTH]
7095 & (1 << (p1[5] % BYTEWIDTH)))))
7096 {
7097 p[-3] = (unsigned char) pop_failure_jump;
7098 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
7099 }
7100
7101 else if ((re_opcode_t) p1[3] == charset_not)
7102 {
7103 int idx;
7104 /* We win if the charset_not inside the loop
7105 lists every character listed in the charset after. */
7106 for (idx = 0; idx < (int) p2[1]; idx++)
7107 if (! (p2[2 + idx] == 0
7108 || (idx < (int) p1[4]
7109 && ((p2[2 + idx] & ~ p1[5 + idx]) == 0))))
7110 break;
7111
7112 if (idx == p2[1])
7113 {
7114 p[-3] = (unsigned char) pop_failure_jump;
7115 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
7116 }
7117 }
7118 else if ((re_opcode_t) p1[3] == charset)
7119 {
7120 int idx;
7121 /* We win if the charset inside the loop
7122 has no overlap with the one after the loop. */
7123 for (idx = 0;
7124 idx < (int) p2[1] && idx < (int) p1[4];
7125 idx++)
7126 if ((p2[2 + idx] & p1[5 + idx]) != 0)
7127 break;
7128
7129 if (idx == p2[1] || idx == p1[4])
7130 {
7131 p[-3] = (unsigned char) pop_failure_jump;
7132 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
7133 }
7134 }
7135 }
7136 #endif /* not WCHAR */
7137 }
7138 p -= OFFSET_ADDRESS_SIZE; /* Point at relative address again. */
7139 if ((re_opcode_t) p[-1] != pop_failure_jump)
7140 {
7141 p[-1] = (UCHAR_T) jump;
7142 DEBUG_PRINT1 (" Match => jump.\n");
7143 goto unconditional_jump;
7144 }
7145 /* Note fall through. */
7146
7147
7148 /* The end of a simple repeat has a pop_failure_jump back to
7149 its matching on_failure_jump, where the latter will push a
7150 failure point. The pop_failure_jump takes off failure
7151 points put on by this pop_failure_jump's matching
7152 on_failure_jump; we got through the pattern to here from the
7153 matching on_failure_jump, so didn't fail. */
7154 case pop_failure_jump:
7155 {
7156 /* We need to pass separate storage for the lowest and
7157 highest registers, even though we don't care about the
7158 actual values. Otherwise, we will restore only one
7159 register from the stack, since lowest will == highest in
7160 `pop_failure_point'. */
7161 active_reg_t dummy_low_reg, dummy_high_reg;
7162 UCHAR_T *pdummy ATTRIBUTE_UNUSED = NULL;
7163 const CHAR_T *sdummy ATTRIBUTE_UNUSED = NULL;
7164
7165 DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n");
7166 POP_FAILURE_POINT (sdummy, pdummy,
7167 dummy_low_reg, dummy_high_reg,
7168 reg_dummy, reg_dummy, reg_info_dummy);
7169 }
7170 /* Note fall through. */
7171
7172 unconditional_jump:
7173 #ifdef _LIBC
7174 DEBUG_PRINT2 ("\n%p: ", p);
7175 #else
7176 DEBUG_PRINT2 ("\n0x%x: ", p);
7177 #endif
7178 /* Note fall through. */
7179
7180 /* Unconditionally jump (without popping any failure points). */
7181 case jump:
7182 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
7183 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
7184 p += mcnt; /* Do the jump. */
7185 #ifdef _LIBC
7186 DEBUG_PRINT2 ("(to %p).\n", p);
7187 #else
7188 DEBUG_PRINT2 ("(to 0x%x).\n", p);
7189 #endif
7190 break;
7191
7192
7193 /* We need this opcode so we can detect where alternatives end
7194 in `group_match_null_string_p' et al. */
7195 case jump_past_alt:
7196 DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n");
7197 goto unconditional_jump;
7198
7199
7200 /* Normally, the on_failure_jump pushes a failure point, which
7201 then gets popped at pop_failure_jump. We will end up at
7202 pop_failure_jump, also, and with a pattern of, say, `a+', we
7203 are skipping over the on_failure_jump, so we have to push
7204 something meaningless for pop_failure_jump to pop. */
7205 case dummy_failure_jump:
7206 DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n");
7207 /* It doesn't matter what we push for the string here. What
7208 the code at `fail' tests is the value for the pattern. */
7209 PUSH_FAILURE_POINT (NULL, NULL, -2);
7210 goto unconditional_jump;
7211
7212
7213 /* At the end of an alternative, we need to push a dummy failure
7214 point in case we are followed by a `pop_failure_jump', because
7215 we don't want the failure point for the alternative to be
7216 popped. For example, matching `(a|ab)*' against `aab'
7217 requires that we match the `ab' alternative. */
7218 case push_dummy_failure:
7219 DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n");
7220 /* See comments just above at `dummy_failure_jump' about the
7221 two NULLs. */
7222 PUSH_FAILURE_POINT (NULL, NULL, -2);
7223 break;
7224
7225 /* Have to succeed matching what follows at least n times.
7226 After that, handle like `on_failure_jump'. */
7227 case succeed_n:
7228 EXTRACT_NUMBER (mcnt, p + OFFSET_ADDRESS_SIZE);
7229 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
7230
7231 assert (mcnt >= 0);
7232 /* Originally, this is how many times we HAVE to succeed. */
7233 if (mcnt > 0)
7234 {
7235 mcnt--;
7236 p += OFFSET_ADDRESS_SIZE;
7237 STORE_NUMBER_AND_INCR (p, mcnt);
7238 #ifdef _LIBC
7239 DEBUG_PRINT3 (" Setting %p to %d.\n", p - OFFSET_ADDRESS_SIZE
7240 , mcnt);
7241 #else
7242 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p - OFFSET_ADDRESS_SIZE
7243 , mcnt);
7244 #endif
7245 }
7246 else if (mcnt == 0)
7247 {
7248 #ifdef _LIBC
7249 DEBUG_PRINT2 (" Setting two bytes from %p to no_op.\n",
7250 p + OFFSET_ADDRESS_SIZE);
7251 #else
7252 DEBUG_PRINT2 (" Setting two bytes from 0x%x to no_op.\n",
7253 p + OFFSET_ADDRESS_SIZE);
7254 #endif /* _LIBC */
7255
7256 #ifdef WCHAR
7257 p[1] = (UCHAR_T) no_op;
7258 #else
7259 p[2] = (UCHAR_T) no_op;
7260 p[3] = (UCHAR_T) no_op;
7261 #endif /* WCHAR */
7262 goto on_failure;
7263 }
7264 break;
7265
7266 case jump_n:
7267 EXTRACT_NUMBER (mcnt, p + OFFSET_ADDRESS_SIZE);
7268 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
7269
7270 /* Originally, this is how many times we CAN jump. */
7271 if (mcnt)
7272 {
7273 mcnt--;
7274 STORE_NUMBER (p + OFFSET_ADDRESS_SIZE, mcnt);
7275
7276 #ifdef _LIBC
7277 DEBUG_PRINT3 (" Setting %p to %d.\n", p + OFFSET_ADDRESS_SIZE,
7278 mcnt);
7279 #else
7280 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p + OFFSET_ADDRESS_SIZE,
7281 mcnt);
7282 #endif /* _LIBC */
7283 goto unconditional_jump;
7284 }
7285 /* If we don't have to jump any more, skip over the rest of the command. */
7286 else
7287 p += 2 * OFFSET_ADDRESS_SIZE;
7288 break;
7289
7290 case set_number_at:
7291 {
7292 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
7293
7294 EXTRACT_NUMBER_AND_INCR (mcnt, p);
7295 p1 = p + mcnt;
7296 EXTRACT_NUMBER_AND_INCR (mcnt, p);
7297 #ifdef _LIBC
7298 DEBUG_PRINT3 (" Setting %p to %d.\n", p1, mcnt);
7299 #else
7300 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p1, mcnt);
7301 #endif
7302 STORE_NUMBER (p1, mcnt);
7303 break;
7304 }
7305
7306 #if 0
7307 /* The DEC Alpha C compiler 3.x generates incorrect code for the
7308 test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of
7309 AT_WORD_BOUNDARY, so this code is disabled. Expanding the
7310 macro and introducing temporary variables works around the bug. */
7311
7312 case wordbound:
7313 DEBUG_PRINT1 ("EXECUTING wordbound.\n");
7314 if (AT_WORD_BOUNDARY (d))
7315 break;
7316 goto fail;
7317
7318 case notwordbound:
7319 DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
7320 if (AT_WORD_BOUNDARY (d))
7321 goto fail;
7322 break;
7323 #else
7324 case wordbound:
7325 {
7326 boolean prevchar, thischar;
7327
7328 DEBUG_PRINT1 ("EXECUTING wordbound.\n");
7329 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
7330 break;
7331
7332 prevchar = WORDCHAR_P (d - 1);
7333 thischar = WORDCHAR_P (d);
7334 if (prevchar != thischar)
7335 break;
7336 goto fail;
7337 }
7338
7339 case notwordbound:
7340 {
7341 boolean prevchar, thischar;
7342
7343 DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
7344 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
7345 goto fail;
7346
7347 prevchar = WORDCHAR_P (d - 1);
7348 thischar = WORDCHAR_P (d);
7349 if (prevchar != thischar)
7350 goto fail;
7351 break;
7352 }
7353 #endif
7354
7355 case wordbeg:
7356 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
7357 if (!AT_STRINGS_END (d) && WORDCHAR_P (d)
7358 && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1)))
7359 break;
7360 goto fail;
7361
7362 case wordend:
7363 DEBUG_PRINT1 ("EXECUTING wordend.\n");
7364 if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1)
7365 && (AT_STRINGS_END (d) || !WORDCHAR_P (d)))
7366 break;
7367 goto fail;
7368
7369 #ifdef emacs
7370 case before_dot:
7371 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
7372 if (PTR_CHAR_POS ((unsigned char *) d) >= point)
7373 goto fail;
7374 break;
7375
7376 case at_dot:
7377 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
7378 if (PTR_CHAR_POS ((unsigned char *) d) != point)
7379 goto fail;
7380 break;
7381
7382 case after_dot:
7383 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
7384 if (PTR_CHAR_POS ((unsigned char *) d) <= point)
7385 goto fail;
7386 break;
7387
7388 case syntaxspec:
7389 DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt);
7390 mcnt = *p++;
7391 goto matchsyntax;
7392
7393 case wordchar:
7394 DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n");
7395 mcnt = (int) Sword;
7396 matchsyntax:
7397 PREFETCH ();
7398 /* Can't use *d++ here; SYNTAX may be an unsafe macro. */
7399 d++;
7400 if (SYNTAX (d[-1]) != (enum syntaxcode) mcnt)
7401 goto fail;
7402 SET_REGS_MATCHED ();
7403 break;
7404
7405 case notsyntaxspec:
7406 DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt);
7407 mcnt = *p++;
7408 goto matchnotsyntax;
7409
7410 case notwordchar:
7411 DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n");
7412 mcnt = (int) Sword;
7413 matchnotsyntax:
7414 PREFETCH ();
7415 /* Can't use *d++ here; SYNTAX may be an unsafe macro. */
7416 d++;
7417 if (SYNTAX (d[-1]) == (enum syntaxcode) mcnt)
7418 goto fail;
7419 SET_REGS_MATCHED ();
7420 break;
7421
7422 #else /* not emacs */
7423 case wordchar:
7424 DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n");
7425 PREFETCH ();
7426 if (!WORDCHAR_P (d))
7427 goto fail;
7428 SET_REGS_MATCHED ();
7429 d++;
7430 break;
7431
7432 case notwordchar:
7433 DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n");
7434 PREFETCH ();
7435 if (WORDCHAR_P (d))
7436 goto fail;
7437 SET_REGS_MATCHED ();
7438 d++;
7439 break;
7440 #endif /* not emacs */
7441
7442 default:
7443 abort ();
7444 }
7445 continue; /* Successfully executed one pattern command; keep going. */
7446
7447
7448 /* We goto here if a matching operation fails. */
7449 fail:
7450 if (!FAIL_STACK_EMPTY ())
7451 { /* A restart point is known. Restore to that state. */
7452 DEBUG_PRINT1 ("\nFAIL:\n");
7453 POP_FAILURE_POINT (d, p,
7454 lowest_active_reg, highest_active_reg,
7455 regstart, regend, reg_info);
7456
7457 /* If this failure point is a dummy, try the next one. */
7458 if (!p)
7459 goto fail;
7460
7461 /* If we failed to the end of the pattern, don't examine *p. */
7462 assert (p <= pend);
7463 if (p < pend)
7464 {
7465 boolean is_a_jump_n = false;
7466
7467 /* If failed to a backwards jump that's part of a repetition
7468 loop, need to pop this failure point and use the next one. */
7469 switch ((re_opcode_t) *p)
7470 {
7471 case jump_n:
7472 is_a_jump_n = true;
7473 case maybe_pop_jump:
7474 case pop_failure_jump:
7475 case jump:
7476 p1 = p + 1;
7477 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7478 p1 += mcnt;
7479
7480 if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n)
7481 || (!is_a_jump_n
7482 && (re_opcode_t) *p1 == on_failure_jump))
7483 goto fail;
7484 break;
7485 default:
7486 /* do nothing */ ;
7487 }
7488 }
7489
7490 if (d >= string1 && d <= end1)
7491 dend = end_match_1;
7492 }
7493 else
7494 break; /* Matching at this starting point really fails. */
7495 } /* for (;;) */
7496
7497 if (best_regs_set)
7498 goto restore_best_regs;
7499
7500 FREE_VARIABLES ();
7501
7502 return -1; /* Failure to match. */
7503 } /* re_match_2 */
7504
7505 /* Subroutine definitions for re_match_2. */
7507
7508
7509 /* We are passed P pointing to a register number after a start_memory.
7510
7511 Return true if the pattern up to the corresponding stop_memory can
7512 match the empty string, and false otherwise.
7513
7514 If we find the matching stop_memory, sets P to point to one past its number.
7515 Otherwise, sets P to an undefined byte less than or equal to END.
7516
7517 We don't handle duplicates properly (yet). */
7518
7519 static boolean
7520 PREFIX(group_match_null_string_p) (UCHAR_T **p, UCHAR_T *end,
7521 PREFIX(register_info_type) *reg_info)
7522 {
7523 int mcnt;
7524 /* Point to after the args to the start_memory. */
7525 UCHAR_T *p1 = *p + 2;
7526
7527 while (p1 < end)
7528 {
7529 /* Skip over opcodes that can match nothing, and return true or
7530 false, as appropriate, when we get to one that can't, or to the
7531 matching stop_memory. */
7532
7533 switch ((re_opcode_t) *p1)
7534 {
7535 /* Could be either a loop or a series of alternatives. */
7536 case on_failure_jump:
7537 p1++;
7538 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7539
7540 /* If the next operation is not a jump backwards in the
7541 pattern. */
7542
7543 if (mcnt >= 0)
7544 {
7545 /* Go through the on_failure_jumps of the alternatives,
7546 seeing if any of the alternatives cannot match nothing.
7547 The last alternative starts with only a jump,
7548 whereas the rest start with on_failure_jump and end
7549 with a jump, e.g., here is the pattern for `a|b|c':
7550
7551 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6
7552 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3
7553 /exactn/1/c
7554
7555 So, we have to first go through the first (n-1)
7556 alternatives and then deal with the last one separately. */
7557
7558
7559 /* Deal with the first (n-1) alternatives, which start
7560 with an on_failure_jump (see above) that jumps to right
7561 past a jump_past_alt. */
7562
7563 while ((re_opcode_t) p1[mcnt-(1+OFFSET_ADDRESS_SIZE)] ==
7564 jump_past_alt)
7565 {
7566 /* `mcnt' holds how many bytes long the alternative
7567 is, including the ending `jump_past_alt' and
7568 its number. */
7569
7570 if (!PREFIX(alt_match_null_string_p) (p1, p1 + mcnt -
7571 (1 + OFFSET_ADDRESS_SIZE),
7572 reg_info))
7573 return false;
7574
7575 /* Move to right after this alternative, including the
7576 jump_past_alt. */
7577 p1 += mcnt;
7578
7579 /* Break if it's the beginning of an n-th alternative
7580 that doesn't begin with an on_failure_jump. */
7581 if ((re_opcode_t) *p1 != on_failure_jump)
7582 break;
7583
7584 /* Still have to check that it's not an n-th
7585 alternative that starts with an on_failure_jump. */
7586 p1++;
7587 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7588 if ((re_opcode_t) p1[mcnt-(1+OFFSET_ADDRESS_SIZE)] !=
7589 jump_past_alt)
7590 {
7591 /* Get to the beginning of the n-th alternative. */
7592 p1 -= 1 + OFFSET_ADDRESS_SIZE;
7593 break;
7594 }
7595 }
7596
7597 /* Deal with the last alternative: go back and get number
7598 of the `jump_past_alt' just before it. `mcnt' contains
7599 the length of the alternative. */
7600 EXTRACT_NUMBER (mcnt, p1 - OFFSET_ADDRESS_SIZE);
7601
7602 if (!PREFIX(alt_match_null_string_p) (p1, p1 + mcnt, reg_info))
7603 return false;
7604
7605 p1 += mcnt; /* Get past the n-th alternative. */
7606 } /* if mcnt > 0 */
7607 break;
7608
7609
7610 case stop_memory:
7611 assert (p1[1] == **p);
7612 *p = p1 + 2;
7613 return true;
7614
7615
7616 default:
7617 if (!PREFIX(common_op_match_null_string_p) (&p1, end, reg_info))
7618 return false;
7619 }
7620 } /* while p1 < end */
7621
7622 return false;
7623 } /* group_match_null_string_p */
7624
7625
7626 /* Similar to group_match_null_string_p, but doesn't deal with alternatives:
7627 It expects P to be the first byte of a single alternative and END one
7628 byte past the last. The alternative can contain groups. */
7629
7630 static boolean
7631 PREFIX(alt_match_null_string_p) (UCHAR_T *p, UCHAR_T *end,
7632 PREFIX(register_info_type) *reg_info)
7633 {
7634 int mcnt;
7635 UCHAR_T *p1 = p;
7636
7637 while (p1 < end)
7638 {
7639 /* Skip over opcodes that can match nothing, and break when we get
7640 to one that can't. */
7641
7642 switch ((re_opcode_t) *p1)
7643 {
7644 /* It's a loop. */
7645 case on_failure_jump:
7646 p1++;
7647 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7648 p1 += mcnt;
7649 break;
7650
7651 default:
7652 if (!PREFIX(common_op_match_null_string_p) (&p1, end, reg_info))
7653 return false;
7654 }
7655 } /* while p1 < end */
7656
7657 return true;
7658 } /* alt_match_null_string_p */
7659
7660
7661 /* Deals with the ops common to group_match_null_string_p and
7662 alt_match_null_string_p.
7663
7664 Sets P to one after the op and its arguments, if any. */
7665
7666 static boolean
7667 PREFIX(common_op_match_null_string_p) (UCHAR_T **p, UCHAR_T *end,
7668 PREFIX(register_info_type) *reg_info)
7669 {
7670 int mcnt;
7671 boolean ret;
7672 int reg_no;
7673 UCHAR_T *p1 = *p;
7674
7675 switch ((re_opcode_t) *p1++)
7676 {
7677 case no_op:
7678 case begline:
7679 case endline:
7680 case begbuf:
7681 case endbuf:
7682 case wordbeg:
7683 case wordend:
7684 case wordbound:
7685 case notwordbound:
7686 #ifdef emacs
7687 case before_dot:
7688 case at_dot:
7689 case after_dot:
7690 #endif
7691 break;
7692
7693 case start_memory:
7694 reg_no = *p1;
7695 assert (reg_no > 0 && reg_no <= MAX_REGNUM);
7696 ret = PREFIX(group_match_null_string_p) (&p1, end, reg_info);
7697
7698 /* Have to set this here in case we're checking a group which
7699 contains a group and a back reference to it. */
7700
7701 if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE)
7702 REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret;
7703
7704 if (!ret)
7705 return false;
7706 break;
7707
7708 /* If this is an optimized succeed_n for zero times, make the jump. */
7709 case jump:
7710 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7711 if (mcnt >= 0)
7712 p1 += mcnt;
7713 else
7714 return false;
7715 break;
7716
7717 case succeed_n:
7718 /* Get to the number of times to succeed. */
7719 p1 += OFFSET_ADDRESS_SIZE;
7720 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7721
7722 if (mcnt == 0)
7723 {
7724 p1 -= 2 * OFFSET_ADDRESS_SIZE;
7725 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7726 p1 += mcnt;
7727 }
7728 else
7729 return false;
7730 break;
7731
7732 case duplicate:
7733 if (!REG_MATCH_NULL_STRING_P (reg_info[*p1]))
7734 return false;
7735 break;
7736
7737 case set_number_at:
7738 p1 += 2 * OFFSET_ADDRESS_SIZE;
7739
7740 default:
7741 /* All other opcodes mean we cannot match the empty string. */
7742 return false;
7743 }
7744
7745 *p = p1;
7746 return true;
7747 } /* common_op_match_null_string_p */
7748
7749
7750 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
7751 bytes; nonzero otherwise. */
7752
7753 static int
7754 PREFIX(bcmp_translate) (const CHAR_T *s1, const CHAR_T *s2, register int len,
7755 RE_TRANSLATE_TYPE translate)
7756 {
7757 register const UCHAR_T *p1 = (const UCHAR_T *) s1;
7758 register const UCHAR_T *p2 = (const UCHAR_T *) s2;
7759 while (len)
7760 {
7761 #ifdef WCHAR
7762 if (((*p1<=0xff)?translate[*p1++]:*p1++)
7763 != ((*p2<=0xff)?translate[*p2++]:*p2++))
7764 return 1;
7765 #else /* BYTE */
7766 if (translate[*p1++] != translate[*p2++]) return 1;
7767 #endif /* WCHAR */
7768 len--;
7769 }
7770 return 0;
7771 }
7772
7773
7775 #else /* not INSIDE_RECURSION */
7776
7777 /* Entry points for GNU code. */
7778
7779 /* re_compile_pattern is the GNU regular expression compiler: it
7780 compiles PATTERN (of length SIZE) and puts the result in BUFP.
7781 Returns 0 if the pattern was valid, otherwise an error string.
7782
7783 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
7784 are set in BUFP on entry.
7785
7786 We call regex_compile to do the actual compilation. */
7787
7788 const char *
7789 re_compile_pattern (const char *pattern, size_t length,
7790 struct re_pattern_buffer *bufp)
7791 {
7792 reg_errcode_t ret;
7793
7794 /* GNU code is written to assume at least RE_NREGS registers will be set
7795 (and at least one extra will be -1). */
7796 bufp->regs_allocated = REGS_UNALLOCATED;
7797
7798 /* And GNU code determines whether or not to get register information
7799 by passing null for the REGS argument to re_match, etc., not by
7800 setting no_sub. */
7801 bufp->no_sub = 0;
7802
7803 /* Match anchors at newline. */
7804 bufp->newline_anchor = 1;
7805
7806 # ifdef MBS_SUPPORT
7807 if (MB_CUR_MAX != 1)
7808 ret = wcs_regex_compile (pattern, length, re_syntax_options, bufp);
7809 else
7810 # endif
7811 ret = byte_regex_compile (pattern, length, re_syntax_options, bufp);
7812
7813 if (!ret)
7814 return NULL;
7815 return gettext (re_error_msgid[(int) ret]);
7816 }
7817 #ifdef _LIBC
7818 weak_alias (__re_compile_pattern, re_compile_pattern)
7819 #endif
7820
7821 /* Entry points compatible with 4.2 BSD regex library. We don't define
7823 them unless specifically requested. */
7824
7825 #if defined _REGEX_RE_COMP || defined _LIBC
7826
7827 /* BSD has one and only one pattern buffer. */
7828 static struct re_pattern_buffer re_comp_buf;
7829
7830 char *
7831 #ifdef _LIBC
7832 /* Make these definitions weak in libc, so POSIX programs can redefine
7833 these names if they don't use our functions, and still use
7834 regcomp/regexec below without link errors. */
7835 weak_function
7836 #endif
7837 re_comp (const char *s)
7838 {
7839 reg_errcode_t ret;
7840
7841 if (!s)
7842 {
7843 if (!re_comp_buf.buffer)
7844 return (char *) gettext ("No previous regular expression");
7845 return 0;
7846 }
7847
7848 if (!re_comp_buf.buffer)
7849 {
7850 re_comp_buf.buffer = (unsigned char *) malloc (200);
7851 if (re_comp_buf.buffer == NULL)
7852 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
7853 re_comp_buf.allocated = 200;
7854
7855 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
7856 if (re_comp_buf.fastmap == NULL)
7857 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
7858 }
7859
7860 /* Since `re_exec' always passes NULL for the `regs' argument, we
7861 don't need to initialize the pattern buffer fields which affect it. */
7862
7863 /* Match anchors at newlines. */
7864 re_comp_buf.newline_anchor = 1;
7865
7866 # ifdef MBS_SUPPORT
7867 if (MB_CUR_MAX != 1)
7868 ret = wcs_regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
7869 else
7870 # endif
7871 ret = byte_regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
7872
7873 if (!ret)
7874 return NULL;
7875
7876 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
7877 return (char *) gettext (re_error_msgid[(int) ret]);
7878 }
7879
7880
7881 int
7882 #ifdef _LIBC
7883 weak_function
7884 #endif
7885 re_exec (const char *s)
7886 {
7887 const int len = strlen (s);
7888 return
7889 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
7890 }
7891
7892 #endif /* _REGEX_RE_COMP */
7893
7894 /* POSIX.2 functions. Don't define these for Emacs. */
7896
7897 #ifndef emacs
7898
7899 /* regcomp takes a regular expression as a string and compiles it.
7900
7901 PREG is a regex_t *. We do not expect any fields to be initialized,
7902 since POSIX says we shouldn't. Thus, we set
7903
7904 `buffer' to the compiled pattern;
7905 `used' to the length of the compiled pattern;
7906 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
7907 REG_EXTENDED bit in CFLAGS is set; otherwise, to
7908 RE_SYNTAX_POSIX_BASIC;
7909 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
7910 `fastmap' to an allocated space for the fastmap;
7911 `fastmap_accurate' to zero;
7912 `re_nsub' to the number of subexpressions in PATTERN.
7913
7914 PATTERN is the address of the pattern string.
7915
7916 CFLAGS is a series of bits which affect compilation.
7917
7918 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
7919 use POSIX basic syntax.
7920
7921 If REG_NEWLINE is set, then . and [^...] don't match newline.
7922 Also, regexec will try a match beginning after every newline.
7923
7924 If REG_ICASE is set, then we considers upper- and lowercase
7925 versions of letters to be equivalent when matching.
7926
7927 If REG_NOSUB is set, then when PREG is passed to regexec, that
7928 routine will report only success or failure, and nothing about the
7929 registers.
7930
7931 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
7932 the return codes and their meanings.) */
7933
7934 int
7935 regcomp (regex_t *preg, const char *pattern, int cflags)
7936 {
7937 reg_errcode_t ret;
7938 reg_syntax_t syntax
7939 = (cflags & REG_EXTENDED) ?
7940 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
7941
7942 /* regex_compile will allocate the space for the compiled pattern. */
7943 preg->buffer = 0;
7944 preg->allocated = 0;
7945 preg->used = 0;
7946
7947 /* Try to allocate space for the fastmap. */
7948 preg->fastmap = (char *) malloc (1 << BYTEWIDTH);
7949
7950 if (cflags & REG_ICASE)
7951 {
7952 int i;
7953
7954 preg->translate
7955 = (RE_TRANSLATE_TYPE) malloc (CHAR_SET_SIZE
7956 * sizeof (*(RE_TRANSLATE_TYPE)0));
7957 if (preg->translate == NULL)
7958 return (int) REG_ESPACE;
7959
7960 /* Map uppercase characters to corresponding lowercase ones. */
7961 for (i = 0; i < CHAR_SET_SIZE; i++)
7962 preg->translate[i] = ISUPPER (i) ? TOLOWER (i) : i;
7963 }
7964 else
7965 preg->translate = NULL;
7966
7967 /* If REG_NEWLINE is set, newlines are treated differently. */
7968 if (cflags & REG_NEWLINE)
7969 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
7970 syntax &= ~RE_DOT_NEWLINE;
7971 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
7972 /* It also changes the matching behavior. */
7973 preg->newline_anchor = 1;
7974 }
7975 else
7976 preg->newline_anchor = 0;
7977
7978 preg->no_sub = !!(cflags & REG_NOSUB);
7979
7980 /* POSIX says a null character in the pattern terminates it, so we
7981 can use strlen here in compiling the pattern. */
7982 # ifdef MBS_SUPPORT
7983 if (MB_CUR_MAX != 1)
7984 ret = wcs_regex_compile (pattern, strlen (pattern), syntax, preg);
7985 else
7986 # endif
7987 ret = byte_regex_compile (pattern, strlen (pattern), syntax, preg);
7988
7989 /* POSIX doesn't distinguish between an unmatched open-group and an
7990 unmatched close-group: both are REG_EPAREN. */
7991 if (ret == REG_ERPAREN) ret = REG_EPAREN;
7992
7993 if (ret == REG_NOERROR && preg->fastmap)
7994 {
7995 /* Compute the fastmap now, since regexec cannot modify the pattern
7996 buffer. */
7997 if (re_compile_fastmap (preg) == -2)
7998 {
7999 /* Some error occurred while computing the fastmap, just forget
8000 about it. */
8001 free (preg->fastmap);
8002 preg->fastmap = NULL;
8003 }
8004 }
8005
8006 return (int) ret;
8007 }
8008 #ifdef _LIBC
8009 weak_alias (__regcomp, regcomp)
8010 #endif
8011
8012
8013 /* regexec searches for a given pattern, specified by PREG, in the
8014 string STRING.
8015
8016 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
8017 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
8018 least NMATCH elements, and we set them to the offsets of the
8019 corresponding matched substrings.
8020
8021 EFLAGS specifies `execution flags' which affect matching: if
8022 REG_NOTBOL is set, then ^ does not match at the beginning of the
8023 string; if REG_NOTEOL is set, then $ does not match at the end.
8024
8025 We return 0 if we find a match and REG_NOMATCH if not. */
8026
8027 int
8028 regexec (const regex_t *preg, const char *string, size_t nmatch,
8029 regmatch_t pmatch[], int eflags)
8030 {
8031 int ret;
8032 struct re_registers regs;
8033 regex_t private_preg;
8034 int len = strlen (string);
8035 boolean want_reg_info = !preg->no_sub && nmatch > 0;
8036
8037 private_preg = *preg;
8038
8039 private_preg.not_bol = !!(eflags & REG_NOTBOL);
8040 private_preg.not_eol = !!(eflags & REG_NOTEOL);
8041
8042 /* The user has told us exactly how many registers to return
8043 information about, via `nmatch'. We have to pass that on to the
8044 matching routines. */
8045 private_preg.regs_allocated = REGS_FIXED;
8046
8047 if (want_reg_info)
8048 {
8049 regs.num_regs = nmatch;
8050 regs.start = TALLOC (nmatch * 2, regoff_t);
8051 if (regs.start == NULL)
8052 return (int) REG_NOMATCH;
8053 regs.end = regs.start + nmatch;
8054 }
8055
8056 /* Perform the searching operation. */
8057 ret = re_search (&private_preg, string, len,
8058 /* start: */ 0, /* range: */ len,
8059 want_reg_info ? ®s : (struct re_registers *) 0);
8060
8061 /* Copy the register information to the POSIX structure. */
8062 if (want_reg_info)
8063 {
8064 if (ret >= 0)
8065 {
8066 unsigned r;
8067
8068 for (r = 0; r < nmatch; r++)
8069 {
8070 pmatch[r].rm_so = regs.start[r];
8071 pmatch[r].rm_eo = regs.end[r];
8072 }
8073 }
8074
8075 /* If we needed the temporary register info, free the space now. */
8076 free (regs.start);
8077 }
8078
8079 /* We want zero return to mean success, unlike `re_search'. */
8080 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
8081 }
8082 #ifdef _LIBC
8083 weak_alias (__regexec, regexec)
8084 #endif
8085
8086
8087 /* Returns a message corresponding to an error code, ERRCODE, returned
8088 from either regcomp or regexec. We don't use PREG here. */
8089
8090 size_t
8091 regerror (int errcode, const regex_t *preg ATTRIBUTE_UNUSED,
8092 char *errbuf, size_t errbuf_size)
8093 {
8094 const char *msg;
8095 size_t msg_size;
8096
8097 if (errcode < 0
8098 || errcode >= (int) (sizeof (re_error_msgid)
8099 / sizeof (re_error_msgid[0])))
8100 /* Only error codes returned by the rest of the code should be passed
8101 to this routine. If we are given anything else, or if other regex
8102 code generates an invalid error code, then the program has a bug.
8103 Dump core so we can fix it. */
8104 abort ();
8105
8106 msg = gettext (re_error_msgid[errcode]);
8107
8108 msg_size = strlen (msg) + 1; /* Includes the null. */
8109
8110 if (errbuf_size != 0)
8111 {
8112 if (msg_size > errbuf_size)
8113 {
8114 #if defined HAVE_MEMPCPY || defined _LIBC
8115 *((char *) mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
8116 #else
8117 memcpy (errbuf, msg, errbuf_size - 1);
8118 errbuf[errbuf_size - 1] = 0;
8119 #endif
8120 }
8121 else
8122 memcpy (errbuf, msg, msg_size);
8123 }
8124
8125 return msg_size;
8126 }
8127 #ifdef _LIBC
8128 weak_alias (__regerror, regerror)
8129 #endif
8130
8131
8132 /* Free dynamically allocated space used by PREG. */
8133
8134 void
8135 regfree (regex_t *preg)
8136 {
8137 free (preg->buffer);
8138 preg->buffer = NULL;
8139
8140 preg->allocated = 0;
8141 preg->used = 0;
8142
8143 free (preg->fastmap);
8144 preg->fastmap = NULL;
8145 preg->fastmap_accurate = 0;
8146
8147 free (preg->translate);
8148 preg->translate = NULL;
8149 }
8150 #ifdef _LIBC
8151 weak_alias (__regfree, regfree)
8152 #endif
8153
8154 #endif /* not emacs */
8155
8156 #endif /* not INSIDE_RECURSION */
8157
8158
/* Drop the helper macros so that none of them stays defined past this
   point.  */

/* Number storage and extraction.  */
#undef STORE_NUMBER
#undef STORE_NUMBER_AND_INCR
#undef EXTRACT_NUMBER
#undef EXTRACT_NUMBER_AND_INCR

/* Debugging output.  */
#undef DEBUG_PRINT_COMPILED_PATTERN
#undef DEBUG_PRINT_DOUBLE_STRING

/* Failure stack.  */
#undef INIT_FAIL_STACK
#undef RESET_FAIL_STACK
#undef DOUBLE_FAIL_STACK
#undef PUSH_PATTERN_OP
#undef PUSH_FAILURE_POINTER
#undef PUSH_FAILURE_INT
#undef PUSH_FAILURE_ELT
#undef POP_FAILURE_POINTER
#undef POP_FAILURE_INT
#undef POP_FAILURE_ELT
#undef DEBUG_PUSH
#undef DEBUG_POP
#undef PUSH_FAILURE_POINT
#undef POP_FAILURE_POINT

/* Register bookkeeping.  */
#undef REG_UNSET_VALUE
#undef REG_UNSET

/* Pattern fetching.  */
#undef PATFETCH
#undef PATFETCH_RAW
#undef PATUNFETCH
#undef TRANSLATE

/* Compiled-buffer construction.  */
#undef INIT_BUF_SIZE
#undef GET_BUFFER_SPACE
#undef BUF_PUSH
#undef BUF_PUSH_2
#undef BUF_PUSH_3
#undef STORE_JUMP
#undef STORE_JUMP2
#undef INSERT_JUMP
#undef INSERT_JUMP2
#undef EXTEND_BUFFER
#undef GET_UNSIGNED_NUMBER
#undef FREE_STACK_RETURN

/* Matcher helpers.  */
# undef POINTER_TO_OFFSET
# undef MATCHING_IN_FIRST_STRING
# undef PREFETCH
# undef AT_STRINGS_BEG
# undef AT_STRINGS_END
# undef WORDCHAR_P
# undef FREE_VAR
# undef FREE_VARIABLES
# undef NO_HIGHEST_ACTIVE_REG
# undef NO_LOWEST_ACTIVE_REG

/* Character-type selection.  */
# undef CHAR_T
# undef UCHAR_T
# undef COMPILED_BUFFER_VAR
# undef OFFSET_ADDRESS_SIZE
# undef CHAR_CLASS_SIZE
# undef PREFIX
# undef ARG_PREFIX
# undef PUT_CHAR
# undef BYTE
# undef WCHAR

# define DEFINED_ONCE
8227