regex.c revision 1.1.1.6.4.1 1 /* Extended regular expression matching and search library,
2 version 0.12.
3 (Implements POSIX draft P1003.2/D11.2, except for some of the
4 internationalization features.)
5
6 Copyright (C) 1993-2019 Free Software Foundation, Inc.
7 This file is part of the GNU C Library.
8
9 The GNU C Library is free software; you can redistribute it and/or
10 modify it under the terms of the GNU Lesser General Public
11 License as published by the Free Software Foundation; either
12 version 2.1 of the License, or (at your option) any later version.
13
14 The GNU C Library is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 Lesser General Public License for more details.
18
19 You should have received a copy of the GNU Lesser General Public
20 License along with the GNU C Library; if not, write to the Free
21 Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22 02110-1301 USA. */
23
24 /* This file has been modified for usage in libiberty. It includes "xregex.h"
25 instead of <regex.h>. The "xregex.h" header file renames all external
26 routines with an "x" prefix so they do not collide with the native regex
27 routines or with other components regex routines. */
28 /* AIX requires this to be the first thing in the file. */
29 #if defined _AIX && !defined __GNUC__ && !defined REGEX_MALLOC
30 #pragma alloca
31 #endif
32
33 #undef _GNU_SOURCE
34 #define _GNU_SOURCE
35
36 #ifndef INSIDE_RECURSION
37 # ifdef HAVE_CONFIG_H
38 # include <config.h>
39 # endif
40 #endif
41
42 #include <ansidecl.h>
43
44 #ifndef INSIDE_RECURSION
45
46 # if defined STDC_HEADERS && !defined emacs
47 # include <stddef.h>
48 # define PTR_INT_TYPE ptrdiff_t
49 # else
50 /* We need this for `regex.h', and perhaps for the Emacs include files. */
51 # include <sys/types.h>
52 # define PTR_INT_TYPE long
53 # endif
54
55 # define WIDE_CHAR_SUPPORT (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC)
56
/* For platforms which support the ISO C Amendment 1 functionality, we
   support user-defined character classes.  */
59 # if defined _LIBC || WIDE_CHAR_SUPPORT
60 /* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */
61 # include <wchar.h>
62 # include <wctype.h>
63 # endif
64
65 # ifdef _LIBC
66 /* We have to keep the namespace clean. */
67 # define regfree(preg) __regfree (preg)
68 # define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef)
69 # define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags)
70 # define regerror(errcode, preg, errbuf, errbuf_size) \
71 __regerror(errcode, preg, errbuf, errbuf_size)
72 # define re_set_registers(bu, re, nu, st, en) \
73 __re_set_registers (bu, re, nu, st, en)
74 # define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \
75 __re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
76 # define re_match(bufp, string, size, pos, regs) \
77 __re_match (bufp, string, size, pos, regs)
78 # define re_search(bufp, string, size, startpos, range, regs) \
79 __re_search (bufp, string, size, startpos, range, regs)
80 # define re_compile_pattern(pattern, length, bufp) \
81 __re_compile_pattern (pattern, length, bufp)
82 # define re_set_syntax(syntax) __re_set_syntax (syntax)
83 # define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \
84 __re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
85 # define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)
86
87 # define btowc __btowc
88
89 /* We are also using some library internals. */
90 # include <locale/localeinfo.h>
91 # include <locale/elem-hash.h>
92 # include <langinfo.h>
93 # include <locale/coll-lookup.h>
94 # endif
95
96 /* This is for other GNU distributions with internationalized messages. */
97 # if (HAVE_LIBINTL_H && ENABLE_NLS) || defined _LIBC
98 # include <libintl.h>
99 # ifdef _LIBC
100 # undef gettext
101 # define gettext(msgid) __dcgettext ("libc", msgid, LC_MESSAGES)
102 # endif
103 # else
104 # define gettext(msgid) (msgid)
105 # endif
106
107 # ifndef gettext_noop
108 /* This define is so xgettext can find the internationalizable
109 strings. */
110 # define gettext_noop(String) String
111 # endif
112
113 /* The `emacs' switch turns on certain matching commands
114 that make sense only in Emacs. */
115 # ifdef emacs
116
117 # include "lisp.h"
118 # include "buffer.h"
119 # include "syntax.h"
120
121 # else /* not emacs */
122
123 /* If we are not linking with Emacs proper,
124 we can't use the relocating allocator
125 even if config.h says that we can. */
126 # undef REL_ALLOC
127
128 # if defined STDC_HEADERS || defined _LIBC
129 # include <stdlib.h>
130 # else
131 char *malloc ();
132 char *realloc ();
133 # endif
134
135 /* When used in Emacs's lib-src, we need to get bzero and bcopy somehow.
136 If nothing else has been done, use the method below. */
137 # ifdef INHIBIT_STRING_HEADER
138 # if !(defined HAVE_BZERO && defined HAVE_BCOPY)
139 # if !defined bzero && !defined bcopy
140 # undef INHIBIT_STRING_HEADER
141 # endif
142 # endif
143 # endif
144
145 /* This is the normal way of making sure we have a bcopy and a bzero.
146 This is used in most programs--a few other programs avoid this
147 by defining INHIBIT_STRING_HEADER. */
148 # ifndef INHIBIT_STRING_HEADER
149 # if defined HAVE_STRING_H || defined STDC_HEADERS || defined _LIBC
150 # include <string.h>
151 # ifndef bzero
152 # ifndef _LIBC
153 # define bzero(s, n) ((void) memset (s, '\0', n))
154 # else
155 # define bzero(s, n) __bzero (s, n)
156 # endif
157 # endif
158 # else
159 # include <strings.h>
160 # ifndef memcmp
161 # define memcmp(s1, s2, n) bcmp (s1, s2, n)
162 # endif
163 # ifndef memcpy
164 # define memcpy(d, s, n) (bcopy (s, d, n), (d))
165 # endif
166 # endif
167 # endif
168
169 /* Define the syntax stuff for \<, \>, etc. */
170
171 /* This must be nonzero for the wordchar and notwordchar pattern
172 commands in re_match_2. */
173 # ifndef Sword
174 # define Sword 1
175 # endif
176
177 # ifdef SWITCH_ENUM_BUG
178 # define SWITCH_ENUM_CAST(x) ((int)(x))
179 # else
180 # define SWITCH_ENUM_CAST(x) (x)
181 # endif
182
183 # endif /* not emacs */
184
185 # if defined _LIBC || HAVE_LIMITS_H
186 # include <limits.h>
187 # endif
188
189 # ifndef MB_LEN_MAX
190 # define MB_LEN_MAX 1
191 # endif
192
193 /* Get the interface, including the syntax bits. */
195 # include "xregex.h" /* change for libiberty */
196
197 /* isalpha etc. are used for the character classes. */
198 # include <ctype.h>
199
200 /* Jim Meyering writes:
201
202 "... Some ctype macros are valid only for character codes that
203 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
204 using /bin/cc or gcc but without giving an ansi option). So, all
205 ctype uses should be through macros like ISPRINT... If
206 STDC_HEADERS is defined, then autoconf has verified that the ctype
207 macros don't need to be guarded with references to isascii. ...
208 Defining isascii to 1 should let any compiler worth its salt
209 eliminate the && through constant folding."
210 Solaris defines some of these symbols so we must undefine them first. */
211
212 # undef ISASCII
213 # if defined STDC_HEADERS || (!defined isascii && !defined HAVE_ISASCII)
214 # define ISASCII(c) 1
215 # else
216 # define ISASCII(c) isascii(c)
217 # endif
218
219 # ifdef isblank
220 # define ISBLANK(c) (ISASCII (c) && isblank (c))
221 # else
222 # define ISBLANK(c) ((c) == ' ' || (c) == '\t')
223 # endif
224 # ifdef isgraph
225 # define ISGRAPH(c) (ISASCII (c) && isgraph (c))
226 # else
227 # define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
228 # endif
229
230 # undef ISPRINT
231 # define ISPRINT(c) (ISASCII (c) && isprint (c))
232 # define ISDIGIT(c) (ISASCII (c) && isdigit (c))
233 # define ISALNUM(c) (ISASCII (c) && isalnum (c))
234 # define ISALPHA(c) (ISASCII (c) && isalpha (c))
235 # define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
236 # define ISLOWER(c) (ISASCII (c) && islower (c))
237 # define ISPUNCT(c) (ISASCII (c) && ispunct (c))
238 # define ISSPACE(c) (ISASCII (c) && isspace (c))
239 # define ISUPPER(c) (ISASCII (c) && isupper (c))
240 # define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
241
242 # ifdef _tolower
243 # define TOLOWER(c) _tolower(c)
244 # else
245 # define TOLOWER(c) tolower(c)
246 # endif
247
248 # ifndef NULL
249 # define NULL (void *)0
250 # endif
251
252 /* We remove any previous definition of `SIGN_EXTEND_CHAR',
253 since ours (we hope) works properly with all combinations of
254 machines, compilers, `char' and `unsigned char' argument types.
255 (Per Bothner suggested the basic approach.) */
256 # undef SIGN_EXTEND_CHAR
257 # if __STDC__
258 # define SIGN_EXTEND_CHAR(c) ((signed char) (c))
259 # else /* not __STDC__ */
260 /* As in Harbison and Steele. */
261 # define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
262 # endif
263
264 # ifndef emacs
266 /* How many characters in the character set. */
267 # define CHAR_SET_SIZE 256
268
269 # ifdef SYNTAX_TABLE
270
271 extern char *re_syntax_table;
272
273 # else /* not SYNTAX_TABLE */
274
275 static char re_syntax_table[CHAR_SET_SIZE];
276
277 static void init_syntax_once (void);
278
279 static void
280 init_syntax_once (void)
281 {
282 register int c;
283 static int done = 0;
284
285 if (done)
286 return;
287 bzero (re_syntax_table, sizeof re_syntax_table);
288
289 for (c = 0; c < CHAR_SET_SIZE; ++c)
290 if (ISALNUM (c))
291 re_syntax_table[c] = Sword;
292
293 re_syntax_table['_'] = Sword;
294
295 done = 1;
296 }
297
298 # endif /* not SYNTAX_TABLE */
299
300 # define SYNTAX(c) re_syntax_table[(unsigned char) (c)]
301
302 # endif /* emacs */
303
304 /* Integer type for pointers. */
306 # if !defined _LIBC && !defined HAVE_UINTPTR_T
307 typedef unsigned long int uintptr_t;
308 # endif
309
310 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
311 use `alloca' instead of `malloc'. This is because using malloc in
312 re_search* or re_match* could cause memory leaks when C-g is used in
313 Emacs; also, malloc is slower and causes storage fragmentation. On
314 the other hand, malloc is more portable, and easier to debug.
315
316 Because we sometimes use alloca, some routines have to be macros,
317 not functions -- `alloca'-allocated space disappears at the end of the
318 function it is called in. */
319
320 # ifdef REGEX_MALLOC
321
322 # define REGEX_ALLOCATE malloc
323 # define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
324 # define REGEX_FREE free
325
326 # else /* not REGEX_MALLOC */
327
328 /* Emacs already defines alloca, sometimes. */
329 # ifndef alloca
330
331 /* Make alloca work the best possible way. */
332 # ifdef __GNUC__
333 # define alloca __builtin_alloca
334 # else /* not __GNUC__ */
335 # if HAVE_ALLOCA_H
336 # include <alloca.h>
337 # endif /* HAVE_ALLOCA_H */
338 # endif /* not __GNUC__ */
339
340 # endif /* not alloca */
341
342 # define REGEX_ALLOCATE alloca
343
344 /* Assumes a `char *destination' variable. */
345 # define REGEX_REALLOCATE(source, osize, nsize) \
346 (destination = (char *) alloca (nsize), \
347 memcpy (destination, source, osize))
348
349 /* No need to do anything to free, after alloca. */
350 # define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
351
352 # endif /* not REGEX_MALLOC */
353
354 /* Define how to allocate the failure stack. */
355
356 # if defined REL_ALLOC && defined REGEX_MALLOC
357
358 # define REGEX_ALLOCATE_STACK(size) \
359 r_alloc (&failure_stack_ptr, (size))
360 # define REGEX_REALLOCATE_STACK(source, osize, nsize) \
361 r_re_alloc (&failure_stack_ptr, (nsize))
362 # define REGEX_FREE_STACK(ptr) \
363 r_alloc_free (&failure_stack_ptr)
364
365 # else /* not using relocating allocator */
366
367 # ifdef REGEX_MALLOC
368
369 # define REGEX_ALLOCATE_STACK malloc
370 # define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
371 # define REGEX_FREE_STACK free
372
373 # else /* not REGEX_MALLOC */
374
375 # define REGEX_ALLOCATE_STACK alloca
376
377 # define REGEX_REALLOCATE_STACK(source, osize, nsize) \
378 REGEX_REALLOCATE (source, osize, nsize)
379 /* No need to explicitly free anything. */
380 # define REGEX_FREE_STACK(arg)
381
382 # endif /* not REGEX_MALLOC */
383 # endif /* not using relocating allocator */
384
385
/* True if `size1' is nonzero and PTR is pointing anywhere inside
   `string1' or just past its end.  This works if PTR is NULL, which is
   a good thing.  */
389 # define FIRST_STRING_P(ptr) \
390 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
391
392 /* (Re)Allocate N items of type T using malloc, or fail. */
393 # define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
394 # define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
395 # define RETALLOC_IF(addr, n, t) \
396 if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
397 # define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
398
399 # define BYTEWIDTH 8 /* In bits. */
400
401 # define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
402
403 # undef MAX
404 # undef MIN
405 # define MAX(a, b) ((a) > (b) ? (a) : (b))
406 # define MIN(a, b) ((a) < (b) ? (a) : (b))
407
408 typedef char boolean;
409 # define false 0
410 # define true 1
411
412 static reg_errcode_t byte_regex_compile (const char *pattern, size_t size,
413 reg_syntax_t syntax,
414 struct re_pattern_buffer *bufp);
415
416 static int byte_re_match_2_internal (struct re_pattern_buffer *bufp,
417 const char *string1, int size1,
418 const char *string2, int size2,
419 int pos,
420 struct re_registers *regs,
421 int stop);
422 static int byte_re_search_2 (struct re_pattern_buffer *bufp,
423 const char *string1, int size1,
424 const char *string2, int size2,
425 int startpos, int range,
426 struct re_registers *regs, int stop);
427 static int byte_re_compile_fastmap (struct re_pattern_buffer *bufp);
428
429 #ifdef MBS_SUPPORT
430 static reg_errcode_t wcs_regex_compile (const char *pattern, size_t size,
431 reg_syntax_t syntax,
432 struct re_pattern_buffer *bufp);
433
434
435 static int wcs_re_match_2_internal (struct re_pattern_buffer *bufp,
436 const char *cstring1, int csize1,
437 const char *cstring2, int csize2,
438 int pos,
439 struct re_registers *regs,
440 int stop,
441 wchar_t *string1, int size1,
442 wchar_t *string2, int size2,
443 int *mbs_offset1, int *mbs_offset2);
444 static int wcs_re_search_2 (struct re_pattern_buffer *bufp,
445 const char *string1, int size1,
446 const char *string2, int size2,
447 int startpos, int range,
448 struct re_registers *regs, int stop);
449 static int wcs_re_compile_fastmap (struct re_pattern_buffer *bufp);
450 #endif
451
452 /* These are the command codes that appear in compiled regular
454 expressions. Some opcodes are followed by argument bytes. A
455 command code can specify any interpretation whatsoever for its
456 arguments. Zero bytes may appear in the compiled regular expression. */
457
458 typedef enum
459 {
460 no_op = 0,
461
462 /* Succeed right away--no more backtracking. */
463 succeed,
464
465 /* Followed by one byte giving n, then by n literal bytes. */
466 exactn,
467
468 # ifdef MBS_SUPPORT
469 /* Same as exactn, but contains binary data. */
470 exactn_bin,
471 # endif
472
473 /* Matches any (more or less) character. */
474 anychar,
475
476 /* Matches any one char belonging to specified set. First
477 following byte is number of bitmap bytes. Then come bytes
478 for a bitmap saying which chars are in. Bits in each byte
479 are ordered low-bit-first. A character is in the set if its
480 bit is 1. A character too large to have a bit in the map is
481 automatically not in the set. */
482 /* ifdef MBS_SUPPORT, following element is length of character
483 classes, length of collating symbols, length of equivalence
484 classes, length of character ranges, and length of characters.
485 Next, character class element, collating symbols elements,
486 equivalence class elements, range elements, and character
487 elements follow.
488 See regex_compile function. */
489 charset,
490
491 /* Same parameters as charset, but match any character that is
492 not one of those specified. */
493 charset_not,
494
495 /* Start remembering the text that is matched, for storing in a
496 register. Followed by one byte with the register number, in
497 the range 0 to one less than the pattern buffer's re_nsub
498 field. Then followed by one byte with the number of groups
499 inner to this one. (This last has to be part of the
500 start_memory only because we need it in the on_failure_jump
501 of re_match_2.) */
502 start_memory,
503
504 /* Stop remembering the text that is matched and store it in a
505 memory register. Followed by one byte with the register
506 number, in the range 0 to one less than `re_nsub' in the
507 pattern buffer, and one byte with the number of inner groups,
508 just like `start_memory'. (We need the number of inner
509 groups here because we don't have any easy way of finding the
510 corresponding start_memory when we're at a stop_memory.) */
511 stop_memory,
512
513 /* Match a duplicate of something remembered. Followed by one
514 byte containing the register number. */
515 duplicate,
516
517 /* Fail unless at beginning of line. */
518 begline,
519
520 /* Fail unless at end of line. */
521 endline,
522
523 /* Succeeds if at beginning of buffer (if emacs) or at beginning
524 of string to be matched (if not). */
525 begbuf,
526
527 /* Analogously, for end of buffer/string. */
528 endbuf,
529
530 /* Followed by two byte relative address to which to jump. */
531 jump,
532
533 /* Same as jump, but marks the end of an alternative. */
534 jump_past_alt,
535
536 /* Followed by two-byte relative address of place to resume at
537 in case of failure. */
538 /* ifdef MBS_SUPPORT, the size of address is 1. */
539 on_failure_jump,
540
541 /* Like on_failure_jump, but pushes a placeholder instead of the
542 current string position when executed. */
543 on_failure_keep_string_jump,
544
545 /* Throw away latest failure point and then jump to following
546 two-byte relative address. */
547 /* ifdef MBS_SUPPORT, the size of address is 1. */
548 pop_failure_jump,
549
/* Change to pop_failure_jump if we know we won't have to backtrack to
   match; otherwise change to jump.  This is used to jump
   back to the beginning of a repeat.  If what follows this jump
   clearly won't match what the repeat does, such that we can be
   sure that there is no use backtracking out of repetitions
   already matched, then we change it to a pop_failure_jump.
   Followed by two-byte address.  */
557 /* ifdef MBS_SUPPORT, the size of address is 1. */
558 maybe_pop_jump,
559
560 /* Jump to following two-byte address, and push a dummy failure
561 point. This failure point will be thrown away if an attempt
562 is made to use it for a failure. A `+' construct makes this
563 before the first repeat. Also used as an intermediary kind
564 of jump when compiling an alternative. */
565 /* ifdef MBS_SUPPORT, the size of address is 1. */
566 dummy_failure_jump,
567
568 /* Push a dummy failure point and continue. Used at the end of
569 alternatives. */
570 push_dummy_failure,
571
572 /* Followed by two-byte relative address and two-byte number n.
573 After matching N times, jump to the address upon failure. */
574 /* ifdef MBS_SUPPORT, the size of address is 1. */
575 succeed_n,
576
577 /* Followed by two-byte relative address, and two-byte number n.
578 Jump to the address N times, then fail. */
579 /* ifdef MBS_SUPPORT, the size of address is 1. */
580 jump_n,
581
582 /* Set the following two-byte relative address to the
583 subsequent two-byte number. The address *includes* the two
584 bytes of number. */
585 /* ifdef MBS_SUPPORT, the size of address is 1. */
586 set_number_at,
587
588 wordchar, /* Matches any word-constituent character. */
589 notwordchar, /* Matches any char that is not a word-constituent. */
590
591 wordbeg, /* Succeeds if at word beginning. */
592 wordend, /* Succeeds if at word end. */
593
594 wordbound, /* Succeeds if at a word boundary. */
595 notwordbound /* Succeeds if not at a word boundary. */
596
597 # ifdef emacs
598 ,before_dot, /* Succeeds if before point. */
599 at_dot, /* Succeeds if at point. */
600 after_dot, /* Succeeds if after point. */
601
602 /* Matches any character whose syntax is specified. Followed by
603 a byte which contains a syntax code, e.g., Sword. */
604 syntaxspec,
605
606 /* Matches any character whose syntax is not that specified. */
607 notsyntaxspec
608 # endif /* emacs */
609 } re_opcode_t;
610 #endif /* not INSIDE_RECURSION */
611
612
614 #ifdef BYTE
615 # define CHAR_T char
616 # define UCHAR_T unsigned char
617 # define COMPILED_BUFFER_VAR bufp->buffer
618 # define OFFSET_ADDRESS_SIZE 2
619 # define PREFIX(name) byte_##name
620 # define ARG_PREFIX(name) name
621 # define PUT_CHAR(c) putchar (c)
622 #else
623 # ifdef WCHAR
624 # define CHAR_T wchar_t
625 # define UCHAR_T wchar_t
626 # define COMPILED_BUFFER_VAR wc_buffer
627 # define OFFSET_ADDRESS_SIZE 1 /* the size which STORE_NUMBER macro use */
628 # define CHAR_CLASS_SIZE ((__alignof__(wctype_t)+sizeof(wctype_t))/sizeof(CHAR_T)+1)
629 # define PREFIX(name) wcs_##name
630 # define ARG_PREFIX(name) c##name
/* Print the wide character C using printf's %C conversion.  (Should we
   use a wide stream??)  Like the BYTE variant of this macro, the
   expansion is a single expression with no trailing semicolon, so
   `PUT_CHAR (x);' is exactly one statement and is safe as the body of
   an unbraced `if'/`else' or loop.  */
#  define PUT_CHAR(c) printf ("%C", (c))
633 # define TRUE 1
634 # define FALSE 0
635 # else
636 # ifdef MBS_SUPPORT
637 # define WCHAR
638 # define INSIDE_RECURSION
639 # include "regex.c"
640 # undef INSIDE_RECURSION
641 # endif
642 # define BYTE
643 # define INSIDE_RECURSION
644 # include "regex.c"
645 # undef INSIDE_RECURSION
646 # endif
647 #endif
648
649 #ifdef INSIDE_RECURSION
650 /* Common operations on the compiled pattern. */
651
652 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */
653 /* ifdef MBS_SUPPORT, we store NUMBER in 1 element. */
654
655 # ifdef WCHAR
656 # define STORE_NUMBER(destination, number) \
657 do { \
658 *(destination) = (UCHAR_T)(number); \
659 } while (0)
660 # else /* BYTE */
661 # define STORE_NUMBER(destination, number) \
662 do { \
663 (destination)[0] = (number) & 0377; \
664 (destination)[1] = (number) >> 8; \
665 } while (0)
666 # endif /* WCHAR */
667
668 /* Same as STORE_NUMBER, except increment DESTINATION to
669 the byte after where the number is stored. Therefore, DESTINATION
670 must be an lvalue. */
671 /* ifdef MBS_SUPPORT, we store NUMBER in 1 element. */
672
673 # define STORE_NUMBER_AND_INCR(destination, number) \
674 do { \
675 STORE_NUMBER (destination, number); \
676 (destination) += OFFSET_ADDRESS_SIZE; \
677 } while (0)
678
679 /* Put into DESTINATION a number stored in two contiguous bytes starting
680 at SOURCE. */
/* ifdef MBS_SUPPORT, the number is read from 1 element.  */
682
683 # ifdef WCHAR
684 # define EXTRACT_NUMBER(destination, source) \
685 do { \
686 (destination) = *(source); \
687 } while (0)
688 # else /* BYTE */
689 # define EXTRACT_NUMBER(destination, source) \
690 do { \
691 (destination) = *(source) & 0377; \
692 (destination) += ((unsigned) SIGN_EXTEND_CHAR (*((source) + 1))) << 8; \
693 } while (0)
694 # endif
695
696 # ifdef DEBUG
697 static void PREFIX(extract_number) (int *dest, UCHAR_T *source);
698 static void
699 PREFIX(extract_number) (int *dest, UCHAR_T *source)
700 {
701 # ifdef WCHAR
702 *dest = *source;
703 # else /* BYTE */
704 int temp = SIGN_EXTEND_CHAR (*(source + 1));
705 *dest = *source & 0377;
706 *dest += temp << 8;
707 # endif
708 }
709
710 # ifndef EXTRACT_MACROS /* To debug the macros. */
711 # undef EXTRACT_NUMBER
712 # define EXTRACT_NUMBER(dest, src) PREFIX(extract_number) (&dest, src)
713 # endif /* not EXTRACT_MACROS */
714
715 # endif /* DEBUG */
716
717 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
718 SOURCE must be an lvalue. */
719
720 # define EXTRACT_NUMBER_AND_INCR(destination, source) \
721 do { \
722 EXTRACT_NUMBER (destination, source); \
723 (source) += OFFSET_ADDRESS_SIZE; \
724 } while (0)
725
726 # ifdef DEBUG
727 static void PREFIX(extract_number_and_incr) (int *destination,
728 UCHAR_T **source);
729 static void
730 PREFIX(extract_number_and_incr) (int *destination, UCHAR_T **source)
731 {
732 PREFIX(extract_number) (destination, *source);
733 *source += OFFSET_ADDRESS_SIZE;
734 }
735
736 # ifndef EXTRACT_MACROS
737 # undef EXTRACT_NUMBER_AND_INCR
738 # define EXTRACT_NUMBER_AND_INCR(dest, src) \
739 PREFIX(extract_number_and_incr) (&dest, &src)
740 # endif /* not EXTRACT_MACROS */
741
742 # endif /* DEBUG */
743
744
745
747 /* If DEBUG is defined, Regex prints many voluminous messages about what
748 it is doing (if the variable `debug' is nonzero). If linked with the
749 main program in `iregex.c', you can enter patterns and strings
750 interactively. And if linked with the main program in `main.c' and
751 the other test files, you can run the already-written tests. */
752
753 # ifdef DEBUG
754
755 # ifndef DEFINED_ONCE
756
757 /* We use standard I/O for debugging. */
758 # include <stdio.h>
759
760 /* It is useful to test things that ``must'' be true when debugging. */
761 # include <assert.h>
762
763 static int debug;
764
765 # define DEBUG_STATEMENT(e) e
766 # define DEBUG_PRINT1(x) if (debug) printf (x)
767 # define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2)
768 # define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3)
769 # define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4)
770 # endif /* not DEFINED_ONCE */
771
772 # define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
773 if (debug) PREFIX(print_partial_compiled_pattern) (s, e)
774 # define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
775 if (debug) PREFIX(print_double_string) (w, s1, sz1, s2, sz2)
776
777
778 /* Print the fastmap in human-readable form. */
779
780 # ifndef DEFINED_ONCE
781 void
782 print_fastmap (char *fastmap)
783 {
784 unsigned was_a_range = 0;
785 unsigned i = 0;
786
787 while (i < (1 << BYTEWIDTH))
788 {
789 if (fastmap[i++])
790 {
791 was_a_range = 0;
792 putchar (i - 1);
793 while (i < (1 << BYTEWIDTH) && fastmap[i])
794 {
795 was_a_range = 1;
796 i++;
797 }
798 if (was_a_range)
799 {
800 printf ("-");
801 putchar (i - 1);
802 }
803 }
804 }
805 putchar ('\n');
806 }
807 # endif /* not DEFINED_ONCE */
808
809
810 /* Print a compiled pattern string in human-readable form, starting at
811 the START pointer into it and ending just before the pointer END. */
812
813 void
814 PREFIX(print_partial_compiled_pattern) (UCHAR_T *start, UCHAR_T *end)
815 {
816 int mcnt, mcnt2;
817 UCHAR_T *p1;
818 UCHAR_T *p = start;
819 UCHAR_T *pend = end;
820
821 if (start == NULL)
822 {
823 printf ("(null)\n");
824 return;
825 }
826
827 /* Loop over pattern commands. */
828 while (p < pend)
829 {
830 # ifdef _LIBC
831 printf ("%td:\t", p - start);
832 # else
833 printf ("%ld:\t", (long int) (p - start));
834 # endif
835
836 switch ((re_opcode_t) *p++)
837 {
838 case no_op:
839 printf ("/no_op");
840 break;
841
842 case exactn:
843 mcnt = *p++;
844 printf ("/exactn/%d", mcnt);
845 do
846 {
847 putchar ('/');
848 PUT_CHAR (*p++);
849 }
850 while (--mcnt);
851 break;
852
853 # ifdef MBS_SUPPORT
854 case exactn_bin:
855 mcnt = *p++;
856 printf ("/exactn_bin/%d", mcnt);
857 do
858 {
859 printf("/%lx", (long int) *p++);
860 }
861 while (--mcnt);
862 break;
863 # endif /* MBS_SUPPORT */
864
865 case start_memory:
866 mcnt = *p++;
867 printf ("/start_memory/%d/%ld", mcnt, (long int) *p++);
868 break;
869
870 case stop_memory:
871 mcnt = *p++;
872 printf ("/stop_memory/%d/%ld", mcnt, (long int) *p++);
873 break;
874
875 case duplicate:
876 printf ("/duplicate/%ld", (long int) *p++);
877 break;
878
879 case anychar:
880 printf ("/anychar");
881 break;
882
883 case charset:
884 case charset_not:
885 {
886 # ifdef WCHAR
887 int i, length;
888 wchar_t *workp = p;
889 printf ("/charset [%s",
890 (re_opcode_t) *(workp - 1) == charset_not ? "^" : "");
891 p += 5;
892 length = *workp++; /* the length of char_classes */
893 for (i=0 ; i<length ; i++)
894 printf("[:%lx:]", (long int) *p++);
895 length = *workp++; /* the length of collating_symbol */
896 for (i=0 ; i<length ;)
897 {
898 printf("[.");
899 while(*p != 0)
900 PUT_CHAR((i++,*p++));
901 i++,p++;
902 printf(".]");
903 }
904 length = *workp++; /* the length of equivalence_class */
905 for (i=0 ; i<length ;)
906 {
907 printf("[=");
908 while(*p != 0)
909 PUT_CHAR((i++,*p++));
910 i++,p++;
911 printf("=]");
912 }
913 length = *workp++; /* the length of char_range */
914 for (i=0 ; i<length ; i++)
915 {
916 wchar_t range_start = *p++;
917 wchar_t range_end = *p++;
918 printf("%C-%C", range_start, range_end);
919 }
920 length = *workp++; /* the length of char */
921 for (i=0 ; i<length ; i++)
922 printf("%C", *p++);
923 putchar (']');
924 # else
925 register int c, last = -100;
926 register int in_range = 0;
927
928 printf ("/charset [%s",
929 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
930
931 assert (p + *p < pend);
932
933 for (c = 0; c < 256; c++)
934 if (c / 8 < *p
935 && (p[1 + (c/8)] & (1 << (c % 8))))
936 {
937 /* Are we starting a range? */
938 if (last + 1 == c && ! in_range)
939 {
940 putchar ('-');
941 in_range = 1;
942 }
943 /* Have we broken a range? */
944 else if (last + 1 != c && in_range)
945 {
946 putchar (last);
947 in_range = 0;
948 }
949
950 if (! in_range)
951 putchar (c);
952
953 last = c;
954 }
955
956 if (in_range)
957 putchar (last);
958
959 putchar (']');
960
961 p += 1 + *p;
962 # endif /* WCHAR */
963 }
964 break;
965
966 case begline:
967 printf ("/begline");
968 break;
969
970 case endline:
971 printf ("/endline");
972 break;
973
974 case on_failure_jump:
975 PREFIX(extract_number_and_incr) (&mcnt, &p);
976 # ifdef _LIBC
977 printf ("/on_failure_jump to %td", p + mcnt - start);
978 # else
979 printf ("/on_failure_jump to %ld", (long int) (p + mcnt - start));
980 # endif
981 break;
982
983 case on_failure_keep_string_jump:
984 PREFIX(extract_number_and_incr) (&mcnt, &p);
985 # ifdef _LIBC
986 printf ("/on_failure_keep_string_jump to %td", p + mcnt - start);
987 # else
988 printf ("/on_failure_keep_string_jump to %ld",
989 (long int) (p + mcnt - start));
990 # endif
991 break;
992
993 case dummy_failure_jump:
994 PREFIX(extract_number_and_incr) (&mcnt, &p);
995 # ifdef _LIBC
996 printf ("/dummy_failure_jump to %td", p + mcnt - start);
997 # else
998 printf ("/dummy_failure_jump to %ld", (long int) (p + mcnt - start));
999 # endif
1000 break;
1001
1002 case push_dummy_failure:
1003 printf ("/push_dummy_failure");
1004 break;
1005
1006 case maybe_pop_jump:
1007 PREFIX(extract_number_and_incr) (&mcnt, &p);
1008 # ifdef _LIBC
1009 printf ("/maybe_pop_jump to %td", p + mcnt - start);
1010 # else
1011 printf ("/maybe_pop_jump to %ld", (long int) (p + mcnt - start));
1012 # endif
1013 break;
1014
1015 case pop_failure_jump:
1016 PREFIX(extract_number_and_incr) (&mcnt, &p);
1017 # ifdef _LIBC
1018 printf ("/pop_failure_jump to %td", p + mcnt - start);
1019 # else
1020 printf ("/pop_failure_jump to %ld", (long int) (p + mcnt - start));
1021 # endif
1022 break;
1023
1024 case jump_past_alt:
1025 PREFIX(extract_number_and_incr) (&mcnt, &p);
1026 # ifdef _LIBC
1027 printf ("/jump_past_alt to %td", p + mcnt - start);
1028 # else
1029 printf ("/jump_past_alt to %ld", (long int) (p + mcnt - start));
1030 # endif
1031 break;
1032
1033 case jump:
1034 PREFIX(extract_number_and_incr) (&mcnt, &p);
1035 # ifdef _LIBC
1036 printf ("/jump to %td", p + mcnt - start);
1037 # else
1038 printf ("/jump to %ld", (long int) (p + mcnt - start));
1039 # endif
1040 break;
1041
1042 case succeed_n:
1043 PREFIX(extract_number_and_incr) (&mcnt, &p);
1044 p1 = p + mcnt;
1045 PREFIX(extract_number_and_incr) (&mcnt2, &p);
1046 # ifdef _LIBC
1047 printf ("/succeed_n to %td, %d times", p1 - start, mcnt2);
1048 # else
1049 printf ("/succeed_n to %ld, %d times",
1050 (long int) (p1 - start), mcnt2);
1051 # endif
1052 break;
1053
1054 case jump_n:
1055 PREFIX(extract_number_and_incr) (&mcnt, &p);
1056 p1 = p + mcnt;
1057 PREFIX(extract_number_and_incr) (&mcnt2, &p);
1058 printf ("/jump_n to %d, %d times", p1 - start, mcnt2);
1059 break;
1060
1061 case set_number_at:
1062 PREFIX(extract_number_and_incr) (&mcnt, &p);
1063 p1 = p + mcnt;
1064 PREFIX(extract_number_and_incr) (&mcnt2, &p);
1065 # ifdef _LIBC
1066 printf ("/set_number_at location %td to %d", p1 - start, mcnt2);
1067 # else
1068 printf ("/set_number_at location %ld to %d",
1069 (long int) (p1 - start), mcnt2);
1070 # endif
1071 break;
1072
1073 case wordbound:
1074 printf ("/wordbound");
1075 break;
1076
1077 case notwordbound:
1078 printf ("/notwordbound");
1079 break;
1080
1081 case wordbeg:
1082 printf ("/wordbeg");
1083 break;
1084
1085 case wordend:
1086 printf ("/wordend");
1087 break;
1088
1089 # ifdef emacs
1090 case before_dot:
1091 printf ("/before_dot");
1092 break;
1093
1094 case at_dot:
1095 printf ("/at_dot");
1096 break;
1097
1098 case after_dot:
1099 printf ("/after_dot");
1100 break;
1101
1102 case syntaxspec:
1103 printf ("/syntaxspec");
1104 mcnt = *p++;
1105 printf ("/%d", mcnt);
1106 break;
1107
1108 case notsyntaxspec:
1109 printf ("/notsyntaxspec");
1110 mcnt = *p++;
1111 printf ("/%d", mcnt);
1112 break;
1113 # endif /* emacs */
1114
1115 case wordchar:
1116 printf ("/wordchar");
1117 break;
1118
1119 case notwordchar:
1120 printf ("/notwordchar");
1121 break;
1122
1123 case begbuf:
1124 printf ("/begbuf");
1125 break;
1126
1127 case endbuf:
1128 printf ("/endbuf");
1129 break;
1130
1131 default:
1132 printf ("?%ld", (long int) *(p-1));
1133 }
1134
1135 putchar ('\n');
1136 }
1137
1138 # ifdef _LIBC
1139 printf ("%td:\tend of pattern.\n", p - start);
1140 # else
1141 printf ("%ld:\tend of pattern.\n", (long int) (p - start));
1142 # endif
1143 }
1144
1145
1146 void
1147 PREFIX(print_compiled_pattern) (struct re_pattern_buffer *bufp)
1148 {
1149 UCHAR_T *buffer = (UCHAR_T*) bufp->buffer;
1150
1151 PREFIX(print_partial_compiled_pattern) (buffer, buffer
1152 + bufp->used / sizeof(UCHAR_T));
1153 printf ("%ld bytes used/%ld bytes allocated.\n",
1154 bufp->used, bufp->allocated);
1155
1156 if (bufp->fastmap_accurate && bufp->fastmap)
1157 {
1158 printf ("fastmap: ");
1159 print_fastmap (bufp->fastmap);
1160 }
1161
1162 # ifdef _LIBC
1163 printf ("re_nsub: %Zd\t", bufp->re_nsub);
1164 # else
1165 printf ("re_nsub: %ld\t", (long int) bufp->re_nsub);
1166 # endif
1167 printf ("regs_alloc: %d\t", bufp->regs_allocated);
1168 printf ("can_be_null: %d\t", bufp->can_be_null);
1169 printf ("newline_anchor: %d\n", bufp->newline_anchor);
1170 printf ("no_sub: %d\t", bufp->no_sub);
1171 printf ("not_bol: %d\t", bufp->not_bol);
1172 printf ("not_eol: %d\t", bufp->not_eol);
1173 printf ("syntax: %lx\n", bufp->syntax);
1174 /* Perhaps we should print the translate table? */
1175 }
1176
1177
1178 void
1179 PREFIX(print_double_string) (const CHAR_T *where, const CHAR_T *string1,
1180 int size1, const CHAR_T *string2, int size2)
1181 {
1182 int this_char;
1183
1184 if (where == NULL)
1185 printf ("(null)");
1186 else
1187 {
1188 int cnt;
1189
1190 if (FIRST_STRING_P (where))
1191 {
1192 for (this_char = where - string1; this_char < size1; this_char++)
1193 PUT_CHAR (string1[this_char]);
1194
1195 where = string2;
1196 }
1197
1198 cnt = 0;
1199 for (this_char = where - string2; this_char < size2; this_char++)
1200 {
1201 PUT_CHAR (string2[this_char]);
1202 if (++cnt > 100)
1203 {
1204 fputs ("...", stdout);
1205 break;
1206 }
1207 }
1208 }
1209 }
1210
# ifndef DEFINED_ONCE
/* Write the single character C to the standard error stream.  */
void
printchar (int c)
{
  fputc (c, stderr);
}
# endif
1218
1219 # else /* not DEBUG */
1220
1221 # ifndef DEFINED_ONCE
1222 # undef assert
1223 # define assert(e)
1224
1225 # define DEBUG_STATEMENT(e)
1226 # define DEBUG_PRINT1(x)
1227 # define DEBUG_PRINT2(x1, x2)
1228 # define DEBUG_PRINT3(x1, x2, x3)
1229 # define DEBUG_PRINT4(x1, x2, x3, x4)
1230 # endif /* not DEFINED_ONCE */
1231 # define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
1232 # define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
1233
1234 # endif /* not DEBUG */
1235
1236
1237
1239 # ifdef WCHAR
1240 /* This convert a multibyte string to a wide character string.
1241 And write their correspondances to offset_buffer(see below)
1242 and write whether each wchar_t is binary data to is_binary.
1243 This assume invalid multibyte sequences as binary data.
1244 We assume offset_buffer and is_binary is already allocated
1245 enough space. */
1246
1247 static size_t convert_mbs_to_wcs (CHAR_T *dest, const unsigned char* src,
1248 size_t len, int *offset_buffer,
1249 char *is_binary);
1250 static size_t
1251 convert_mbs_to_wcs (CHAR_T *dest, const unsigned char*src, size_t len,
1252 int *offset_buffer, char *is_binary)
1253 /* It hold correspondances between src(char string) and
1254 dest(wchar_t string) for optimization.
1255 e.g. src = "xxxyzz"
1256 dest = {'X', 'Y', 'Z'}
1257 (each "xxx", "y" and "zz" represent one multibyte character
1258 corresponding to 'X', 'Y' and 'Z'.)
1259 offset_buffer = {0, 0+3("xxx"), 0+3+1("y"), 0+3+1+2("zz")}
1260 = {0, 3, 4, 6}
1261 */
1262 {
1263 wchar_t *pdest = dest;
1264 const unsigned char *psrc = src;
1265 size_t wc_count = 0;
1266
1267 mbstate_t mbs;
1268 int i, consumed;
1269 size_t mb_remain = len;
1270 size_t mb_count = 0;
1271
1272 /* Initialize the conversion state. */
1273 memset (&mbs, 0, sizeof (mbstate_t));
1274
1275 offset_buffer[0] = 0;
1276 for( ; mb_remain > 0 ; ++wc_count, ++pdest, mb_remain -= consumed,
1277 psrc += consumed)
1278 {
1279 #ifdef _LIBC
1280 consumed = __mbrtowc (pdest, psrc, mb_remain, &mbs);
1281 #else
1282 consumed = mbrtowc (pdest, psrc, mb_remain, &mbs);
1283 #endif
1284
1285 if (consumed <= 0)
1286 /* failed to convert. maybe src contains binary data.
1287 So we consume 1 byte manualy. */
1288 {
1289 *pdest = *psrc;
1290 consumed = 1;
1291 is_binary[wc_count] = TRUE;
1292 }
1293 else
1294 is_binary[wc_count] = FALSE;
1295 /* In sjis encoding, we use yen sign as escape character in
1296 place of reverse solidus. So we convert 0x5c(yen sign in
1297 sjis) to not 0xa5(yen sign in UCS2) but 0x5c(reverse
1298 solidus in UCS2). */
1299 if (consumed == 1 && (int) *psrc == 0x5c && (int) *pdest == 0xa5)
1300 *pdest = (wchar_t) *psrc;
1301
1302 offset_buffer[wc_count + 1] = mb_count += consumed;
1303 }
1304
1305 /* Fill remain of the buffer with sentinel. */
1306 for (i = wc_count + 1 ; i <= len ; i++)
1307 offset_buffer[i] = mb_count + 1;
1308
1309 return wc_count;
1310 }
1311
1312 # endif /* WCHAR */
1313
1314 #else /* not INSIDE_RECURSION */
1315
1316 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1317 also be assigned to arbitrarily: each pattern buffer stores its own
1318 syntax, so it can be changed between regex compilations. */
1319 /* This has no initializer because initialized variables in Emacs
1320 become read-only after dumping. */
1321 reg_syntax_t re_syntax_options;
1322
1323
1324 /* Specify the precise syntax of regexps for compilation. This provides
1325 for compatibility for various utilities which historically have
1326 different, incompatible syntaxes.
1327
1328 The argument SYNTAX is a bit mask comprised of the various bits
1329 defined in regex.h. We return the old syntax. */
1330
1331 reg_syntax_t
1332 re_set_syntax (reg_syntax_t syntax)
1333 {
1334 reg_syntax_t ret = re_syntax_options;
1335
1336 re_syntax_options = syntax;
1337 # ifdef DEBUG
1338 if (syntax & RE_DEBUG)
1339 debug = 1;
1340 else if (debug) /* was on but now is not */
1341 debug = 0;
1342 # endif /* DEBUG */
1343 return ret;
1344 }
1345 # ifdef _LIBC
1346 weak_alias (__re_set_syntax, re_set_syntax)
1347 # endif
1348
1349 /* This table gives an error message for each of the error codes listed
1351 in regex.h. Obviously the order here has to be same as there.
1352 POSIX doesn't require that we do anything for REG_NOERROR,
1353 but why not be nice? */
1354
1355 static const char *re_error_msgid[] =
1356 {
1357 gettext_noop ("Success"), /* REG_NOERROR */
1358 gettext_noop ("No match"), /* REG_NOMATCH */
1359 gettext_noop ("Invalid regular expression"), /* REG_BADPAT */
1360 gettext_noop ("Invalid collation character"), /* REG_ECOLLATE */
1361 gettext_noop ("Invalid character class name"), /* REG_ECTYPE */
1362 gettext_noop ("Trailing backslash"), /* REG_EESCAPE */
1363 gettext_noop ("Invalid back reference"), /* REG_ESUBREG */
1364 gettext_noop ("Unmatched [ or [^"), /* REG_EBRACK */
1365 gettext_noop ("Unmatched ( or \\("), /* REG_EPAREN */
1366 gettext_noop ("Unmatched \\{"), /* REG_EBRACE */
1367 gettext_noop ("Invalid content of \\{\\}"), /* REG_BADBR */
1368 gettext_noop ("Invalid range end"), /* REG_ERANGE */
1369 gettext_noop ("Memory exhausted"), /* REG_ESPACE */
1370 gettext_noop ("Invalid preceding regular expression"), /* REG_BADRPT */
1371 gettext_noop ("Premature end of regular expression"), /* REG_EEND */
1372 gettext_noop ("Regular expression too big"), /* REG_ESIZE */
1373 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
1374 };
1375
1376 #endif /* INSIDE_RECURSION */
1378
#ifndef DEFINED_ONCE
/* Avoiding alloca during matching, to placate r_alloc.  */

/* MATCH_MAY_ALLOCATE is defined unless the searching and matching
   functions must be kept from calling alloca.  On some systems alloca
   is implemented in terms of malloc; when the relocating allocator
   routines are in use, malloc could then trigger a relocation which
   might (if the strings being searched live in the ralloc heap) shift
   the data out from underneath the regexp routines.

   A second reason to avoid allocation: Emacs processes input from X in
   a signal handler, and processing X input may call malloc.  If input
   arrives while a matching routine is itself inside malloc, we are in
   trouble.  Emacs cannot simply block input while matching, because
   interrupts would then go unnoticed.  So Emacs blocks input around
   every regexp call except the matching calls, which it leaves
   unprotected on the assumption that they will not malloc.  */

/* The normal case: allocation during matching is fine.  */
# define MATCH_MAY_ALLOCATE

/* With GNU C we are not REALLY using the C alloca, whatever config.h
   may say, so take no precautions for it.  */
# if defined __GNUC__
#  undef C_ALLOCA
# endif

/* The match routines may not allocate if (1) they would do it with
   malloc and (2) it is not safe for them to use malloc.
   Note that if REL_ALLOC is defined, matching would not use malloc for
   the failure stack, but it would still be used for the register
   vectors; so REL_ALLOC should not affect this.  */
# if defined emacs && (defined C_ALLOCA || defined REGEX_MALLOC)
#  undef MATCH_MAY_ALLOCATE
# endif
#endif /* not DEFINED_ONCE */
1417
1418 #ifdef INSIDE_RECURSION
1420 /* Failure stack declarations and macros; both re_compile_fastmap and
1421 re_match_2 use a failure stack. These have to be macros because of
1422 REGEX_ALLOCATE_STACK. */
1423
1424
/* How many failure-stack slots to allocate initially when matching.
   This is only a starting size, not a hard limit: more space is
   allocated when it is exceeded.  May be predefined by the includer.  */
# if !defined INIT_FAILURE_ALLOC
#  define INIT_FAILURE_ALLOC 5
# endif
1431
1432 /* Roughly the maximum number of failure points on the stack. Would be
1433 exactly that if always used MAX_FAILURE_ITEMS items each time we failed.
1434 This is a variable only so users of regex can assign to it; we never
1435 change it ourselves. */
1436
1437 # ifdef INT_IS_16BIT
1438
1439 # ifndef DEFINED_ONCE
1440 # if defined MATCH_MAY_ALLOCATE
1441 /* 4400 was enough to cause a crash on Alpha OSF/1,
1442 whose default stack limit is 2mb. */
1443 long int re_max_failures = 4000;
1444 # else
1445 long int re_max_failures = 2000;
1446 # endif
1447 # endif
1448
1449 union PREFIX(fail_stack_elt)
1450 {
1451 UCHAR_T *pointer;
1452 long int integer;
1453 };
1454
1455 typedef union PREFIX(fail_stack_elt) PREFIX(fail_stack_elt_t);
1456
1457 typedef struct
1458 {
1459 PREFIX(fail_stack_elt_t) *stack;
1460 unsigned long int size;
1461 unsigned long int avail; /* Offset of next open position. */
1462 } PREFIX(fail_stack_type);
1463
1464 # else /* not INT_IS_16BIT */
1465
1466 # ifndef DEFINED_ONCE
1467 # if defined MATCH_MAY_ALLOCATE
1468 /* 4400 was enough to cause a crash on Alpha OSF/1,
1469 whose default stack limit is 2mb. */
1470 int re_max_failures = 4000;
1471 # else
1472 int re_max_failures = 2000;
1473 # endif
1474 # endif
1475
1476 union PREFIX(fail_stack_elt)
1477 {
1478 UCHAR_T *pointer;
1479 int integer;
1480 };
1481
1482 typedef union PREFIX(fail_stack_elt) PREFIX(fail_stack_elt_t);
1483
1484 typedef struct
1485 {
1486 PREFIX(fail_stack_elt_t) *stack;
1487 unsigned size;
1488 unsigned avail; /* Offset of next open position. */
1489 } PREFIX(fail_stack_type);
1490
1491 # endif /* INT_IS_16BIT */
1492
# ifndef DEFINED_ONCE
/* Predicates on the failure stack.  The first and last expect a
   variable named `fail_stack' in scope; the middle one expects a
   pointer named `fail_stack_ptr'.  */
#  define FAIL_STACK_EMPTY()     (0 == fail_stack.avail)
#  define FAIL_STACK_PTR_EMPTY() (0 == fail_stack_ptr->avail)
#  define FAIL_STACK_FULL()      (fail_stack.size == fail_stack.avail)
# endif
1498
1499
/* Macros that set up and tear down the failure stack held in the
   variable `fail_stack'.  When matching may allocate, INIT_FAIL_STACK
   obtains INIT_FAILURE_ALLOC slots and does `return -2' from the
   enclosing function if that allocation fails; otherwise it only
   resets the fill level.  */

# ifndef MATCH_MAY_ALLOCATE
#  define INIT_FAIL_STACK()						\
  do {									\
    fail_stack.avail = 0;						\
  } while (0)

#  define RESET_FAIL_STACK()
# else
#  define INIT_FAIL_STACK()						\
  do {									\
    fail_stack.stack = (PREFIX(fail_stack_elt_t) *)			\
      REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * sizeof (PREFIX(fail_stack_elt_t))); \
									\
    if (!fail_stack.stack)						\
      return -2;							\
									\
    fail_stack.avail = 0;						\
    fail_stack.size = INIT_FAILURE_ALLOC;				\
  } while (0)

#  define RESET_FAIL_STACK()  REGEX_FREE_STACK (fail_stack.stack)
# endif
1525
1526
/* Double the capacity of the failure stack FS, up to approximately
   `re_max_failures' items.

   Evaluates to 1 on success, and to 0 either when the stack is already
   beyond the limit or when the reallocation returns NULL.

   REGEX_REALLOCATE_STACK requires `destination' be declared.  */

# define DOUBLE_FAIL_STACK(fs)						\
  ((fs).size <= (unsigned) (re_max_failures * MAX_FAILURE_ITEMS)	\
   ? ((fs).stack = (PREFIX(fail_stack_elt_t) *)				\
	REGEX_REALLOCATE_STACK ((fs).stack,				\
	  (fs).size * sizeof (PREFIX(fail_stack_elt_t)),		\
	  ((fs).size * 2) * sizeof (PREFIX(fail_stack_elt_t))),		\
									\
      (fs).stack != NULL						\
      ? ((fs).size *= 2,						\
	 1)								\
      : 0)								\
   : 0)
1546
1547
/* Push the pointer POINTER onto FAIL_STACK, growing the stack first
   when it is full.  Evaluates to 1 when the pointer was pushed and to 0
   when the stack was full and could not be doubled.  Note that the
   fullness test goes through FAIL_STACK_FULL, which looks at the
   variable `fail_stack'.  */
# define PUSH_PATTERN_OP(POINTER, FAIL_STACK)				\
  ((!FAIL_STACK_FULL ()							\
    || DOUBLE_FAIL_STACK (FAIL_STACK))					\
   ? ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER,	\
      1)								\
   : 0)
1557
/* Raw push and pop primitives for the failure stack.  All of them
   operate on the variable `fail_stack' and do no bounds checking, so
   they should probably only be used from within `PUSH_FAILURE_POINT'
   and its counterpart, after room has been ensured.  */

/* Store VALUE in the next free slot as a pointer.  */
# define PUSH_FAILURE_POINTER(value)					\
  fail_stack.stack[fail_stack.avail++].pointer = (UCHAR_T *) (value)

/* Store VALUE in the next free slot as an integer.  */
# define PUSH_FAILURE_INT(value)					\
  fail_stack.stack[fail_stack.avail++].integer = (value)

/* Store the whole fail_stack_elt_t VALUE in the next free slot.  */
# define PUSH_FAILURE_ELT(value)					\
  fail_stack.stack[fail_stack.avail++] = (value)

/* The three POP... operations mirror the three PUSH... operations.
   Each one assumes that `fail_stack' is nonempty.  */
# define POP_FAILURE_POINTER()						\
  fail_stack.stack[--fail_stack.avail].pointer
# define POP_FAILURE_INT()						\
  fail_stack.stack[--fail_stack.avail].integer
# define POP_FAILURE_ELT()						\
  fail_stack.stack[--fail_stack.avail]
1581
/* Failure point ids are pushed and popped only in DEBUG builds; in
   other builds these two macros expand to nothing.  */
# ifndef DEBUG
#  define DEBUG_PUSH(id)
#  define DEBUG_POP(id_addr)
# else
#  define DEBUG_PUSH PUSH_FAILURE_INT
#  define DEBUG_POP(id_addr) *(id_addr) = POP_FAILURE_INT ()
# endif
1590
1591
/* Save on the failure stack everything needed to resume matching from
   this point should a later attempt fail: the start, end and info word
   of every active register, the active register bounds, the pattern
   position PATTERN_PLACE and the string position STRING_PLACE (plus
   the failure id in DEBUG builds).

   Requires variables fail_stack, regstart, regend, reg_info, and
   num_regs_pushed be declared.  DOUBLE_FAIL_STACK requires
   `destination' be declared.

   Does `return FAILURE_CODE' if the stack cannot be grown.  */

# define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code)	\
  do {									\
    /* Needed by the reallocation done in DOUBLE_FAIL_STACK.  */	\
    char *destination;							\
    /* An earlier note asked for int here so that 0 + -1 would not be	\
       computed as unsigned, but int is not guaranteed to be wide	\
       enough for the values involved, hence active_reg_t.  */		\
    active_reg_t pushed_reg;						\
									\
    DEBUG_STATEMENT (failure_id++);					\
    DEBUG_STATEMENT (nfailure_points_pushed++);				\
    DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id);		\
    DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail);\
    DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\
									\
    DEBUG_PRINT2 (" slots needed: %ld\n", NUM_FAILURE_ITEMS);		\
    DEBUG_PRINT2 (" available: %d\n", REMAINING_AVAIL_SLOTS);	\
									\
    /* Grow the stack until everything about to be pushed fits.  */	\
    while (NUM_FAILURE_ITEMS > REMAINING_AVAIL_SLOTS)			\
      {									\
	if (!DOUBLE_FAIL_STACK (fail_stack))				\
	  return failure_code;						\
									\
	DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n",		\
		      (fail_stack).size);				\
	DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\
      }									\
									\
    /* The registers go first, lowest to highest.  */			\
    DEBUG_PRINT1 ("\n");						\
									\
    pushed_reg = lowest_active_reg;					\
    while (pushed_reg <= highest_active_reg)				\
      {									\
	DEBUG_PRINT2 (" Pushing reg: %lu\n", pushed_reg);		\
	DEBUG_STATEMENT (num_regs_pushed++);				\
									\
	DEBUG_PRINT2 (" start: %p\n", regstart[pushed_reg]);		\
	PUSH_FAILURE_POINTER (regstart[pushed_reg]);			\
									\
	DEBUG_PRINT2 (" end: %p\n", regend[pushed_reg]);		\
	PUSH_FAILURE_POINTER (regend[pushed_reg]);			\
									\
	DEBUG_PRINT2 (" info: %p\n ",					\
		      reg_info[pushed_reg].word.pointer);		\
	DEBUG_PRINT2 (" match_null=%d",					\
		      REG_MATCH_NULL_STRING_P (reg_info[pushed_reg]));	\
	DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[pushed_reg]));	\
	DEBUG_PRINT2 (" matched_something=%d",				\
		      MATCHED_SOMETHING (reg_info[pushed_reg]));	\
	DEBUG_PRINT2 (" ever_matched=%d",				\
		      EVER_MATCHED_SOMETHING (reg_info[pushed_reg]));	\
	DEBUG_PRINT1 ("\n");						\
	PUSH_FAILURE_ELT (reg_info[pushed_reg].word);			\
									\
	pushed_reg++;							\
      }									\
									\
    DEBUG_PRINT2 (" Pushing low active reg: %ld\n", lowest_active_reg);\
    PUSH_FAILURE_INT (lowest_active_reg);				\
									\
    DEBUG_PRINT2 (" Pushing high active reg: %ld\n", highest_active_reg);\
    PUSH_FAILURE_INT (highest_active_reg);				\
									\
    DEBUG_PRINT2 (" Pushing pattern %p:\n", pattern_place);		\
    DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend);		\
    PUSH_FAILURE_POINTER (pattern_place);				\
									\
    DEBUG_PRINT2 (" Pushing string %p: `", string_place);		\
    DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2,	\
			       size2);					\
    DEBUG_PRINT1 ("'\n");						\
    PUSH_FAILURE_POINTER (string_place);				\
									\
    DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id);		\
    DEBUG_PUSH (failure_id);						\
  } while (0)
1679
# ifndef DEFINED_ONCE
/* Number of stack items pushed and popped for each register.  */
#  define NUM_REG_ITEMS  3

/* Number of stack items pushed per failure point besides the
   registers.  DEBUG builds push one more, the failure point id.  */
#  ifndef DEBUG
#   define NUM_NONREG_ITEMS 4
#  else
#   define NUM_NONREG_ITEMS 5 /* Includes failure point id.  */
#  endif

/* Upper estimate of the items pushed per failure point.  This used to
   be based on (num_regs - 1), the number of registers this regexp will
   save, but was changed to 5 to avoid stack overflow for a regexp with
   lots of parens.  */
#  define MAX_FAILURE_ITEMS (NUM_NONREG_ITEMS + 5 * NUM_REG_ITEMS)

/* The number of items actually pushed for the current range of active
   registers.  */
#  define NUM_FAILURE_ITEMS						\
  (NUM_NONREG_ITEMS							\
   + NUM_REG_ITEMS * (highest_active_reg - lowest_active_reg + 1))

/* How many items can still be added to the stack without overflowing
   it.  */
#  define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
# endif /* not DEFINED_ONCE */
1708
1709
/* Pops what PUSH_FAILURE_POINT pushes.

   We restore into the parameters, all of which should be lvalues:
     STR -- the saved data position.
     PAT -- the saved pattern position.
     LOW_REG, HIGH_REG -- the lowest and highest active registers.
     REGSTART, REGEND -- arrays of string positions.
     REG_INFO -- array of information about each subexpression.

   A saved string position of NULL leaves STR untouched.

   Also assumes the variables `fail_stack', `set_regs_matched_done' and
   (if debugging), `bufp', `pend', `string1', `size1', `string2', and
   `size2'.  */
# define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info)\
{									\
  DEBUG_STATEMENT (unsigned failure_id;)				\
  active_reg_t this_reg;						\
  const UCHAR_T *string_temp;						\
									\
  assert (!FAIL_STACK_EMPTY ());					\
									\
  /* Remove failure points and point to how many regs pushed.  */	\
  DEBUG_PRINT1 ("POP_FAILURE_POINT:\n");				\
  DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail);	\
  DEBUG_PRINT2 (" size: %d\n", fail_stack.size);	\
									\
  assert (fail_stack.avail >= NUM_NONREG_ITEMS);			\
									\
  DEBUG_POP (&failure_id);						\
  DEBUG_PRINT2 (" Popping failure id: %u\n", failure_id);		\
									\
  /* If the saved string location is NULL, it came from an		\
     on_failure_keep_string_jump opcode, and we want to throw away the	\
     saved NULL, thus retaining our current position in the string.  */	\
  string_temp = POP_FAILURE_POINTER ();					\
  if (string_temp != NULL)						\
    str = (const CHAR_T *) string_temp;					\
									\
  DEBUG_PRINT2 (" Popping string %p: `", str);				\
  DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2);	\
  DEBUG_PRINT1 ("'\n");							\
									\
  pat = (UCHAR_T *) POP_FAILURE_POINTER ();				\
  DEBUG_PRINT2 (" Popping pattern %p:\n", pat);				\
  DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend);			\
									\
  /* Restore register info.  The bounds come off in the reverse of	\
     the order in which they were pushed.  */				\
  high_reg = (active_reg_t) POP_FAILURE_INT ();				\
  DEBUG_PRINT2 (" Popping high active reg: %ld\n", high_reg);		\
									\
  low_reg = (active_reg_t) POP_FAILURE_INT ();				\
  DEBUG_PRINT2 (" Popping low active reg: %ld\n", low_reg);		\
									\
  /* Registers were pushed lowest first, so pop them highest first,	\
     and within each register pop info, end, start.  */			\
  for (this_reg = high_reg; this_reg >= low_reg; this_reg--)		\
    {									\
      DEBUG_PRINT2 (" Popping reg: %ld\n", this_reg);			\
									\
      reg_info[this_reg].word = POP_FAILURE_ELT ();			\
      DEBUG_PRINT2 (" info: %p\n",					\
		    reg_info[this_reg].word.pointer);			\
									\
      regend[this_reg] = (const CHAR_T *) POP_FAILURE_POINTER ();	\
      DEBUG_PRINT2 (" end: %p\n", regend[this_reg]);			\
									\
      regstart[this_reg] = (const CHAR_T *) POP_FAILURE_POINTER ();	\
      DEBUG_PRINT2 (" start: %p\n", regstart[this_reg]);		\
    }									\
									\
  set_regs_matched_done = 0;						\
  DEBUG_STATEMENT (nfailure_points_popped++);				\
} /* POP_FAILURE_POINT */
1790
1791 /* Structure for per-register (a.k.a. per-group) information.
1793 Other register information, such as the
1794 starting and ending positions (which are addresses), and the list of
1795 inner groups (which is a bits list) are maintained in separate
1796 variables.
1797
1798 We are making a (strictly speaking) nonportable assumption here: that
1799 the compiler will pack our bit fields into something that fits into
1800 the type of `word', i.e., is something that fits into one item on the
1801 failure stack. */
1802
1803
1804 /* Declarations and macros for re_match_2. */
1805
1806 typedef union
1807 {
1808 PREFIX(fail_stack_elt_t) word;
1809 struct
1810 {
1811 /* This field is one if this group can match the empty string,
1812 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */
1813 # define MATCH_NULL_UNSET_VALUE 3
1814 unsigned match_null_string_p : 2;
1815 unsigned is_active : 1;
1816 unsigned matched_something : 1;
1817 unsigned ever_matched_something : 1;
1818 } bits;
1819 } PREFIX(register_info_type);
1820
# ifndef DEFINED_ONCE
/* Accessors for the bit fields of a register_info_type value R.  */
#  define REG_MATCH_NULL_STRING_P(R)  ((R).bits.match_null_string_p)
#  define IS_ACTIVE(R)  ((R).bits.is_active)
#  define MATCHED_SOMETHING(R)  ((R).bits.matched_something)
#  define EVER_MATCHED_SOMETHING(R)  ((R).bits.ever_matched_something)


/* Call this when a real character has been matched.  Unless
   `set_regs_matched_done' is already set, it sets that flag and marks
   every register from lowest_active_reg through highest_active_reg as
   having matched something, both now and ever.  */
#  define SET_REGS_MATCHED()						\
  do									\
    {									\
      if (!set_regs_matched_done)					\
	{								\
	  active_reg_t group = lowest_active_reg;			\
	  set_regs_matched_done = 1;					\
	  while (group <= highest_active_reg)				\
	    {								\
	      EVER_MATCHED_SOMETHING (reg_info[group]) = 1;		\
	      MATCHED_SOMETHING (reg_info[group]) = 1;			\
	      group++;							\
	    }								\
	}								\
    }									\
  while (0)
# endif /* not DEFINED_ONCE */
1848
1849 /* Registers are set to a sentinel when they haven't yet matched. */
1850 static CHAR_T PREFIX(reg_unset_dummy);
1851 # define REG_UNSET_VALUE (&PREFIX(reg_unset_dummy))
1852 # define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
1853
1854 /* Subroutine declarations and macros for regex_compile. */
1855 static void PREFIX(store_op1) (re_opcode_t op, UCHAR_T *loc, int arg);
1856 static void PREFIX(store_op2) (re_opcode_t op, UCHAR_T *loc,
1857 int arg1, int arg2);
1858 static void PREFIX(insert_op1) (re_opcode_t op, UCHAR_T *loc,
1859 int arg, UCHAR_T *end);
1860 static void PREFIX(insert_op2) (re_opcode_t op, UCHAR_T *loc,
1861 int arg1, int arg2, UCHAR_T *end);
1862 static boolean PREFIX(at_begline_loc_p) (const CHAR_T *pattern,
1863 const CHAR_T *p,
1864 reg_syntax_t syntax);
1865 static boolean PREFIX(at_endline_loc_p) (const CHAR_T *p,
1866 const CHAR_T *pend,
1867 reg_syntax_t syntax);
1868 # ifdef WCHAR
1869 static reg_errcode_t wcs_compile_range (CHAR_T range_start,
1870 const CHAR_T **p_ptr,
1871 const CHAR_T *pend,
1872 char *translate,
1873 reg_syntax_t syntax,
1874 UCHAR_T *b,
1875 CHAR_T *char_set);
1876 static void insert_space (int num, CHAR_T *loc, CHAR_T *end);
1877 # else /* BYTE */
1878 static reg_errcode_t byte_compile_range (unsigned int range_start,
1879 const char **p_ptr,
1880 const char *pend,
1881 char *translate,
1882 reg_syntax_t syntax,
1883 unsigned char *b);
1884 # endif /* WCHAR */
1885
/* Read the next character of the uncompiled pattern into C, mapping it
   through `translate' when a table is supplied.  The character is
   converted from the possibly signed char of the user's string to an
   unsigned value usable as an array index (in, e.g., `translate').
   Does `return REG_EEND' from the enclosing function when the pattern
   is exhausted (p == pend).  */
/* In the wide-character version only characters <= 0xff are
   translated, because it is impossible to allocate a 4GB array for
   encodings with 4 byte character sets like UCS4.  */
# ifndef PATFETCH
#  ifdef WCHAR
#   define PATFETCH(c)							\
  do {									\
    if (pend == p)							\
      return REG_EEND;							\
    c = (UCHAR_T) *p++;							\
    if (translate && (c <= 0xff))					\
      c = (UCHAR_T) translate[c];					\
  } while (0)
#  else /* BYTE */
#   define PATFETCH(c)							\
  do {									\
    if (pend == p)							\
      return REG_EEND;							\
    c = (unsigned char) *p++;						\
    if (translate)							\
      c = (unsigned char) translate[c];					\
  } while (0)
#  endif /* WCHAR */
# endif
1908
/* Read the next character of the uncompiled pattern into C without any
   translation.  Does `return REG_EEND' from the enclosing function
   when the pattern is exhausted (p == pend).  */
# define PATFETCH_RAW(c)						\
  do {									\
    if (pend == p)							\
      return REG_EEND;							\
    c = (UCHAR_T) *p++;							\
  } while (0)

/* Step the pattern pointer back by one character.  */
# define PATUNFETCH p--
1918
1919
/* Yield translate[D] when a `translate' table is present, and D itself
   otherwise.  The subscript is cast to unsigned char: some data is
   declared as `char *' to avoid warnings when a string constant is
   passed, but a character used as a subscript must be unsigned.  */
/* In the wide-character version only characters <= 0xff are
   translated, because it is impossible to allocate a 4GB array for
   encodings with 4 byte character sets like UCS4.  */

# ifndef TRANSLATE
#  ifdef WCHAR
#   define TRANSLATE(d)							\
  (!(translate && ((UCHAR_T) (d)) <= 0xff)				\
   ? (d) : (char) translate[(unsigned char) (d)])
#  else /* BYTE */
#   define TRANSLATE(d)							\
  (!translate ? (char) (d) : (char) translate[(unsigned char) (d)])
#  endif /* WCHAR */
# endif
1938
1939
/* Macros for outputting the compiled pattern into `buffer'.  */

/* Size used when the buffer comes in unallocated.  */
# define INIT_BUF_SIZE  (sizeof(UCHAR_T) * 32)

/* Keep calling EXTEND_BUFFER until the buffer has room for N more
   elements past `b'.  Note that this expands to a bare `while'
   statement.  */
# ifdef WCHAR
#  define GET_BUFFER_SPACE(n)						\
    while (bufp->allocated						\
	   < ((unsigned long)b - (unsigned long)COMPILED_BUFFER_VAR	\
	      + (n)*sizeof(CHAR_T)))					\
      EXTEND_BUFFER ()
# else /* BYTE */
#  define GET_BUFFER_SPACE(n)						\
    while (bufp->allocated						\
	   < (unsigned long) (b - bufp->buffer + (n)))			\
      EXTEND_BUFFER ()
# endif /* WCHAR */
1956
/* Append one, two or three elements to the compiled pattern at `b'.
   Each macro first asks GET_BUFFER_SPACE for exactly the room it is
   about to use, then stores its arguments in order, converting each to
   UCHAR_T and advancing `b'.  */

/* Append the single element C.  */
# define BUF_PUSH(c)							\
  do {									\
    GET_BUFFER_SPACE (1);						\
    *b++ = (UCHAR_T) (c);						\
  } while (0)


/* Append C1 followed by C2.  */
# define BUF_PUSH_2(c1, c2)						\
  do {									\
    GET_BUFFER_SPACE (2);						\
    *b++ = (UCHAR_T) (c1);						\
    *b++ = (UCHAR_T) (c2);						\
  } while (0)


/* Append C1, C2 and C3, in that order.  */
# define BUF_PUSH_3(c1, c2, c3)						\
  do {									\
    GET_BUFFER_SPACE (3);						\
    *b++ = (UCHAR_T) (c1);						\
    *b++ = (UCHAR_T) (c2);						\
    *b++ = (UCHAR_T) (c3);						\
  } while (0)
1982
/* Jump emitters.  In all four the stored argument is the distance from
   LOC to TO, reduced by (1 + OFFSET_ADDRESS_SIZE) to account for the
   opcode and the offset the jump itself occupies.  */

/* Store a jump with opcode OP at LOC, targeting TO.  */
# define STORE_JUMP(op, loc, to)					\
  PREFIX(store_op1) (op, loc,						\
		     (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)))

/* Likewise, for a jump carrying the extra argument ARG.  */
# define STORE_JUMP2(op, loc, to, arg)					\
  PREFIX(store_op2) (op, loc,						\
		     (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)),	\
		     arg)

/* Like `STORE_JUMP', but for inserting.  Assume `b' is the buffer
   end.  */
# define INSERT_JUMP(op, loc, to)					\
  PREFIX(insert_op1) (op, loc,						\
		      (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)),	\
		      b)

/* Like `STORE_JUMP2', but for inserting.  Assume `b' is the buffer
   end.  */
# define INSERT_JUMP2(op, loc, to, arg)					\
  PREFIX(insert_op2) (op, loc,						\
		      (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)),	\
		      arg, b)
2000
/* MAX_BUF_SIZE is not an arbitrary limit: the arguments which
   represent offsets into the pattern are two bytes long.  So if 2^16
   bytes turns out to be too small, many things would have to change.  */
/* Any other compiler which, like MSC, has an allocation limit below
   2^16 bytes will have to use an approach similar to the one taken
   below for MSC and drop MAX_BUF_SIZE a bit.  Otherwise you may end up
   reallocating to 0 bytes, which is not going to work too well.
   You have been warned!!  */
# ifndef DEFINED_ONCE
#  if !defined _MSC_VER || defined WIN32
#   define MAX_BUF_SIZE (1L << 16)
#   define REALLOC(p,s) realloc ((p), (s))
#  else
/* Microsoft C 16-bit versions limit malloc to approx 65512 bytes.
   The REALLOC define eliminates a flurry of conversion warnings,
   but is not required.  */
#   define MAX_BUF_SIZE 65500L
#   define REALLOC(p,s) realloc ((p), (size_t) (s))
#  endif

/* Helpers for EXTEND_BUFFER, which grows the buffer via realloc and
   resets the pointers that pointed into the old block to point to the
   correct places in the new one; growing past MAX_BUF_SIZE is flagged
   as memory exhausted.  MOVE_BUFFER_POINTER shifts the pointer P by
   `incr', which must be in scope at the point of use.  */
#  if !__BOUNDED_POINTERS__
#   define MOVE_BUFFER_POINTER(P) (P) += incr
#   define ELSE_EXTEND_BUFFER_HIGH_BOUND
#  else
#   define SET_HIGH_BOUND(P) (__ptrhigh (P) = __ptrlow (P) + bufp->allocated)
#   define MOVE_BUFFER_POINTER(P) \
  (__ptrlow (P) += incr, SET_HIGH_BOUND (P), __ptrvalue (P) += incr)
#   define ELSE_EXTEND_BUFFER_HIGH_BOUND	\
  else						\
    {						\
      SET_HIGH_BOUND (b);			\
      SET_HIGH_BOUND (begalt);			\
      if (fixup_alt_jump)			\
	SET_HIGH_BOUND (fixup_alt_jump);	\
      if (laststart)				\
	SET_HIGH_BOUND (laststart);		\
      if (pending_exact)			\
	SET_HIGH_BOUND (pending_exact);		\
    }
#  endif
# endif /* not DEFINED_ONCE */
2046
2047 # ifdef WCHAR
/* WCHAR variant of EXTEND_BUFFER.  Doubles `bufp->allocated' (a byte
   count), clamps it to MAX_BUF_SIZE and rounds it down to a whole number
   of UCHAR_T elements (never fewer than one).  COMPILED_BUFFER_VAR is then
   resized with RETALLOC and `bufp->buffer' is made to alias it.  If the
   block moved, `b', `begalt' and whichever of `fixup_alt_jump',
   `laststart' and `pending_exact' are non-null are shifted by the same
   distance.

   Returns from the ENCLOSING function with REG_ESIZE when
   allocated + sizeof (UCHAR_T) already exceeds MAX_BUF_SIZE, and with
   REG_ESPACE when the reallocation yields NULL.  These are plain
   `return's: nothing owned by the enclosing function is released.  */
2048 # define EXTEND_BUFFER() \
2049 do { \
2050 UCHAR_T *old_buffer = COMPILED_BUFFER_VAR; \
2051 int wchar_count; \
2052 if (bufp->allocated + sizeof(UCHAR_T) > MAX_BUF_SIZE) \
2053 return REG_ESIZE; \
2054 bufp->allocated <<= 1; \
2055 if (bufp->allocated > MAX_BUF_SIZE) \
2056 bufp->allocated = MAX_BUF_SIZE; \
2057 /* How many characters the new buffer can have? */ \
2058 wchar_count = bufp->allocated / sizeof(UCHAR_T); \
2059 if (wchar_count == 0) wchar_count = 1; \
2060 /* Truncate the buffer to CHAR_T align. */ \
2061 bufp->allocated = wchar_count * sizeof(UCHAR_T); \
/* NOTE(review): assuming RETALLOC assigns the result of realloc to its */ \
/* first argument, a failure here overwrites the only pointers to the */ \
/* old block (it leaks) and leaves `allocated' already enlarged -- */ \
/* confirm against the RETALLOC definition. */ \
2062 RETALLOC (COMPILED_BUFFER_VAR, wchar_count, UCHAR_T); \
2063 bufp->buffer = (char*)COMPILED_BUFFER_VAR; \
2064 if (COMPILED_BUFFER_VAR == NULL) \
2065 return REG_ESPACE; \
2066 /* If the buffer moved, move all the pointers into it. */ \
2067 if (old_buffer != COMPILED_BUFFER_VAR) \
2068 { \
2069 PTR_INT_TYPE incr = COMPILED_BUFFER_VAR - old_buffer; \
2070 MOVE_BUFFER_POINTER (b); \
2071 MOVE_BUFFER_POINTER (begalt); \
2072 if (fixup_alt_jump) \
2073 MOVE_BUFFER_POINTER (fixup_alt_jump); \
2074 if (laststart) \
2075 MOVE_BUFFER_POINTER (laststart); \
2076 if (pending_exact) \
2077 MOVE_BUFFER_POINTER (pending_exact); \
2078 } \
2079 ELSE_EXTEND_BUFFER_HIGH_BOUND \
2080 } while (0)
2081 # else /* BYTE */
/* BYTE variant of EXTEND_BUFFER.  Doubles the compiled-pattern buffer
   (clamped to MAX_BUF_SIZE) with REALLOC and, if the block moved, shifts
   `b', `begalt' and whichever of `fixup_alt_jump', `laststart' and
   `pending_exact' are non-null by the same distance.

   Returns from the ENCLOSING function with REG_ESIZE when the buffer is
   already MAX_BUF_SIZE bytes, and with REG_ESPACE when REALLOC fails.

   The new size and the new block are kept in temporaries and only
   committed to BUFP once REALLOC has succeeded.  realloc leaves the old
   block intact on failure, so on REG_ESPACE `bufp->buffer' and
   `bufp->allocated' still describe the old, valid block instead of being
   overwritten with NULL and a size that was never obtained.  */
# define EXTEND_BUFFER() \
  do { \
    UCHAR_T *old_buffer = COMPILED_BUFFER_VAR; \
    UCHAR_T *new_buffer; \
    unsigned long new_allocated; \
    if (bufp->allocated == MAX_BUF_SIZE) \
      return REG_ESIZE; \
    new_allocated = (unsigned long) bufp->allocated << 1; \
    if (new_allocated > MAX_BUF_SIZE) \
      new_allocated = MAX_BUF_SIZE; \
    new_buffer = (UCHAR_T *) REALLOC (COMPILED_BUFFER_VAR, new_allocated); \
    if (new_buffer == NULL) \
      return REG_ESPACE; \
    /* Success: publish the new block and its size together. */ \
    bufp->buffer = new_buffer; \
    bufp->allocated = new_allocated; \
    /* If the buffer moved, move all the pointers into it. */ \
    if (old_buffer != COMPILED_BUFFER_VAR) \
      { \
        PTR_INT_TYPE incr = COMPILED_BUFFER_VAR - old_buffer; \
        MOVE_BUFFER_POINTER (b); \
        MOVE_BUFFER_POINTER (begalt); \
        if (fixup_alt_jump) \
          MOVE_BUFFER_POINTER (fixup_alt_jump); \
        if (laststart) \
          MOVE_BUFFER_POINTER (laststart); \
        if (pending_exact) \
          MOVE_BUFFER_POINTER (pending_exact); \
      } \
    ELSE_EXTEND_BUFFER_HIGH_BOUND \
  } while (0)
2109 # endif /* WCHAR */
2110
2111 # ifndef DEFINED_ONCE
2112 /* Since we have one byte reserved for the register number argument to
2113 {start,stop}_memory, the maximum number of groups we can report
2114 things about is what fits in that byte. */
2115 # define MAX_REGNUM 255
2116
2117 /* But patterns can have more than `MAX_REGNUM' registers. We just
2118 ignore the excess. Group counters are therefore kept in a plain
   `unsigned' (regnum_t) rather than in a byte-sized type.  */
2119 typedef unsigned regnum_t;
2120
2121
2122 /* Macros for the compile stack. */
2123
2124 /* Since offsets can go either forwards or backwards, this type needs to
2125 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
2126 /* int may not be enough when sizeof(int) == 2, hence long. */
2127 typedef long pattern_offset_t;
2128
/* One entry of the compile stack (see compile_stack_type).  The pointer
   state is stored as pattern_offset_t offsets rather than as pointers.
   NOTE(review): presumably so entries stay valid when EXTEND_BUFFER moves
   the buffer, and presumably one entry is pushed per open group -- the
   push/pop code is outside this excerpt, confirm there.  */
2129 typedef struct
2130 {
2131 pattern_offset_t begalt_offset; /* Saved `begalt', as an offset.  */
2132 pattern_offset_t fixup_alt_jump; /* Saved `fixup_alt_jump', as an offset.  */
2133 pattern_offset_t inner_group_offset; /* NOTE(review): use not visible here.  */
2134 pattern_offset_t laststart_offset; /* Saved `laststart', as an offset.  */
2135 regnum_t regnum; /* Register number associated with the entry.  */
2136 } compile_stack_elt_t;
2137
2138
/* Growable stack of compile_stack_elt_t.  regex_compile allocates `stack'
   with INIT_COMPILE_STACK_SIZE elements, records that count in `size' and
   starts with `avail' at 0; FREE_STACK_RETURN frees `stack'.  */
2139 typedef struct
2140 {
2141 compile_stack_elt_t *stack; /* Heap array of `size' elements.  */
2142 unsigned size; /* Number of elements allocated in `stack'.  */
2143 unsigned avail; /* Offset of next open position. */
2144 } compile_stack_type;
2145
2146
/* Number of elements initially allocated for the compile stack.  */
2147 # define INIT_COMPILE_STACK_SIZE 32
2148
/* These macros operate on a variable named `compile_stack' that must be
   in scope at the point of use.  EMPTY: nothing has been pushed.  FULL:
   every allocated slot is in use.  */
2149 # define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
2150 # define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
2151
2152 /* The next available element, i.e. the slot at index `avail' that the
   next push fills in -- not the most recently pushed entry.  Only valid
   while the stack is not full.  */
2153 # define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
2154
2155 # endif /* not DEFINED_ONCE */
2156
/* Set the bit for character C in a list.

   The list is the bitmap that starts at the write pointer `b', which must
   be in scope: C is reduced to an unsigned char, the byte at index
   C / BYTEWIDTH is selected and bit C % BYTEWIDTH of it is set.

   C is evaluated twice, so it must be free of side effects.  Both uses
   are fully parenthesized so that the cast applies to the whole argument;
   with a bare `(unsigned char) c' an argument such as `x - 1' would have
   only `x' converted, and a negative intermediate result would make the
   shift count negative.  */
# ifndef DEFINED_ONCE
#  define SET_LIST_BIT(c) \
  (b[((unsigned char) (c)) / BYTEWIDTH] \
   |= 1 << (((unsigned char) (c)) % BYTEWIDTH))
# endif /* not DEFINED_ONCE */
2163
2164 /* Get the next unsigned number in the uncompiled pattern.

   Fetches characters with PATFETCH into `c' until `p' reaches `pend' or a
   character outside '0'..'9' has been fetched; that terminating
   character is left in `c'.  Digits are accumulated into NUM in decimal.

   A negative NUM on entry is reset to 0 when the first digit arrives, so
   NUM is left unchanged if no digit is read.  Once NUM exceeds
   RE_DUP_MAX, further digits are still consumed but no longer
   accumulated, which bounds the value NUM can reach; the caller is
   expected to range-check the result.

   Uses `p', `pend' and `c' from the enclosing scope, evaluates NUM
   several times (pass a plain variable), and expands to a bare brace
   block rather than a do/while.  */
2165 # define GET_UNSIGNED_NUMBER(num) \
2166 { \
2167 while (p != pend) \
2168 { \
2169 PATFETCH (c); \
2170 if (c < '0' || c > '9') \
2171 break; \
2172 if (num <= RE_DUP_MAX) \
2173 { \
2174 if (num < 0) \
2175 num = 0; \
2176 num = num * 10 + c - '0'; \
2177 } \
2178 } \
2179 }
2180
/* CHAR_CLASS_MAX_LENGTH is the longest character-class name accepted
   between `[:' and `:]' (callers size their name buffer as
   CHAR_CLASS_MAX_LENGTH + 1).  IS_CHAR_CLASS (STRING) is nonzero when
   STRING names a known class.  */
2181 # ifndef DEFINED_ONCE
2182 # if defined _LIBC || WIDE_CHAR_SUPPORT
2183 /* The GNU C library provides support for user-defined character classes
2184 and the functions from ISO C amendment 1. */
2185 # ifdef CHARCLASS_NAME_MAX
2186 # define CHAR_CLASS_MAX_LENGTH CHARCLASS_NAME_MAX
2187 # else
2188 /* This shouldn't happen but some implementations might still have this
2189 problem. Use a reasonable default value. */
2190 # define CHAR_CLASS_MAX_LENGTH 256
2191 # endif
2192
/* With wide-character support the class name is looked up with wctype
   (or __wctype inside libc), so the result is a wctype_t that is zero
   for an unknown name.  */
2193 # ifdef _LIBC
2194 # define IS_CHAR_CLASS(string) __wctype (string)
2195 # else
2196 # define IS_CHAR_CLASS(string) wctype (string)
2197 # endif
2198 # else
2199 # define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */
2200
/* Without wide-character support only these twelve fixed names are
   recognized; STRING is evaluated once per comparison.  */
2201 # define IS_CHAR_CLASS(string) \
2202 (STREQ (string, "alpha") || STREQ (string, "upper") \
2203 || STREQ (string, "lower") || STREQ (string, "digit") \
2204 || STREQ (string, "alnum") || STREQ (string, "xdigit") \
2205 || STREQ (string, "space") || STREQ (string, "print") \
2206 || STREQ (string, "punct") || STREQ (string, "graph") \
2207 || STREQ (string, "cntrl") || STREQ (string, "blank"))
2208 # endif
2209 # endif /* not DEFINED_ONCE */
2210
2211 # ifndef MATCH_MAY_ALLOCATE
2213
2214 /* If we cannot allocate large objects within re_match_2_internal,
2215 we make the fail stack and register vectors global.
2216 The fail stack, we grow to the maximum size when a regexp
2217 is compiled.
2218 The register vectors, we adjust in size each time we
2219 compile a regexp, according to the number of registers it needs.
   Being file-scope statics, these objects are shared by every use of the
   matcher in this translation unit.  */
2220
2221 static PREFIX(fail_stack_type) fail_stack;
2222
2223 /* Size with which the following vectors are currently allocated.
2224 That is so we can make them bigger as needed,
2225 but never make them smaller.
   PREFIX(regex_grow_registers) is the function that resizes all of the
   vectors below and updates this size.
   NOTE(review): this group is guarded by `#ifdef DEFINED_ONCE', whereas
   the other once-only sections nearby use `#ifndef DEFINED_ONCE' --
   confirm the sense of the test is intended.  */
2226 # ifdef DEFINED_ONCE
2227 static int regs_allocated_size;
2228
2229 static const char ** regstart, ** regend;
2230 static const char ** old_regstart, ** old_regend;
2231 static const char **best_regstart, **best_regend;
2232 static const char **reg_dummy;
2233 # endif /* DEFINED_ONCE */
2234
/* Per-register info vectors; one pair exists for each PREFIX
   instantiation of this file.  */
2235 static PREFIX(register_info_type) *PREFIX(reg_info);
2236 static PREFIX(register_info_type) *PREFIX(reg_info_dummy);
2237
2238 /* Make the register vectors big enough for NUM_REGS registers,
2239 but don't make them smaller. */
2240
2241 static void
2242 PREFIX(regex_grow_registers) (int num_regs)
2243 {
2244 if (num_regs > regs_allocated_size)
2245 {
2246 RETALLOC_IF (regstart, num_regs, const char *);
2247 RETALLOC_IF (regend, num_regs, const char *);
2248 RETALLOC_IF (old_regstart, num_regs, const char *);
2249 RETALLOC_IF (old_regend, num_regs, const char *);
2250 RETALLOC_IF (best_regstart, num_regs, const char *);
2251 RETALLOC_IF (best_regend, num_regs, const char *);
2252 RETALLOC_IF (PREFIX(reg_info), num_regs, PREFIX(register_info_type));
2253 RETALLOC_IF (reg_dummy, num_regs, const char *);
2254 RETALLOC_IF (PREFIX(reg_info_dummy), num_regs, PREFIX(register_info_type));
2255
2256 regs_allocated_size = num_regs;
2257 }
2258 }
2259
2260 # endif /* not MATCH_MAY_ALLOCATE */
2261
2262 # ifndef DEFINED_ONCE
2264 static boolean group_in_compile_stack (compile_stack_type compile_stack,
2265 regnum_t regnum);
2266 # endif /* not DEFINED_ONCE */
2267
2268 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
2269 Returns one of error codes defined in `regex.h', or zero for success.
2270
2271 Assumes the `allocated' (and perhaps `buffer') and `translate'
2272 fields are set in BUFP on entry.
2273
2274 If it succeeds, results are put in BUFP (if it returns an error, the
2275 contents of BUFP are undefined):
2276 `buffer' is the compiled pattern;
2277 `syntax' is set to SYNTAX;
2278 `used' is set to the length of the compiled pattern;
2279 `fastmap_accurate' is zero;
2280 `re_nsub' is the number of subexpressions in PATTERN;
2281 `not_bol' and `not_eol' are zero;
2282
2283 The `fastmap' and `newline_anchor' fields are neither
2284 examined nor set. */
2285
2286 /* Return VALUE from the enclosing function, first freeing storage we
   allocated: the compile stack and, in the WCHAR build, also `pattern',
   `mbs_offset' and `is_binary'.  All of these names must be in scope at
   the point of use.  The frees and VALUE are chained with the comma
   operator inside a single `return' statement, so the macro expands to
   exactly one statement (the caller supplies the semicolon) and is safe
   as the body of an unbraced `if'.  */
2287 # ifdef WCHAR
2288 # define FREE_STACK_RETURN(value) \
2289 return (free(pattern), free(mbs_offset), free(is_binary), free (compile_stack.stack), value)
2290 # else
2291 # define FREE_STACK_RETURN(value) \
2292 return (free (compile_stack.stack), value)
2293 # endif /* WCHAR */
2294
2295 static reg_errcode_t
2296 PREFIX(regex_compile) (const char *ARG_PREFIX(pattern),
2297 size_t ARG_PREFIX(size), reg_syntax_t syntax,
2298 struct re_pattern_buffer *bufp)
2299 {
2300 /* We fetch characters from PATTERN here. Even though PATTERN is
2301 `char *' (i.e., signed), we declare these variables as unsigned, so
2302 they can be reliably used as array indices. */
2303 register UCHAR_T c, c1;
2304
2305 #ifdef WCHAR
2306 /* A temporary space to keep wchar_t pattern and compiled pattern. */
2307 CHAR_T *pattern, *COMPILED_BUFFER_VAR;
2308 size_t size;
2309 /* Offset buffer for optimization. See convert_mbs_to_wcs. */
2310 int *mbs_offset = NULL;
2311 /* Holds whether each wchar_t is binary data or not. */
2312 char *is_binary = NULL;
2313 /* A flag whether exactn is handling binary data or not. */
2314 char is_exactn_bin = FALSE;
2315 #endif /* WCHAR */
2316
2317 /* A random temporary spot in PATTERN. */
2318 const CHAR_T *p1;
2319
2320 /* Points to the end of the buffer, where we should append. */
2321 register UCHAR_T *b;
2322
2323 /* Keeps track of unclosed groups. */
2324 compile_stack_type compile_stack;
2325
2326 /* Points to the current (ending) position in the pattern. */
2327 #ifdef WCHAR
2328 const CHAR_T *p;
2329 const CHAR_T *pend;
2330 #else /* BYTE */
2331 const CHAR_T *p = pattern;
2332 const CHAR_T *pend = pattern + size;
2333 #endif /* WCHAR */
2334
2335 /* How to translate the characters in the pattern. */
2336 RE_TRANSLATE_TYPE translate = bufp->translate;
2337
2338 /* Address of the count-byte of the most recently inserted `exactn'
2339 command. This makes it possible to tell if a new exact-match
2340 character can be added to that command or if the character requires
2341 a new `exactn' command. */
2342 UCHAR_T *pending_exact = 0;
2343
2344 /* Address of start of the most recently finished expression.
2345 This tells, e.g., postfix * where to find the start of its
2346 operand. Reset at the beginning of groups and alternatives. */
2347 UCHAR_T *laststart = 0;
2348
2349 /* Address of beginning of regexp, or inside of last group. */
2350 UCHAR_T *begalt;
2351
2352 /* Address of the place where a forward jump should go to the end of
2353 the containing expression. Each alternative of an `or' -- except the
2354 last -- ends with a forward jump of this sort. */
2355 UCHAR_T *fixup_alt_jump = 0;
2356
2357 /* Counts open-groups as they are encountered. Remembered for the
2358 matching close-group on the compile stack, so the same register
2359 number is put in the stop_memory as the start_memory. */
2360 regnum_t regnum = 0;
2361
2362 #ifdef WCHAR
2363 /* Initialize the wchar_t PATTERN and offset_buffer. */
2364 p = pend = pattern = TALLOC(csize + 1, CHAR_T);
2365 mbs_offset = TALLOC(csize + 1, int);
2366 is_binary = TALLOC(csize + 1, char);
2367 if (pattern == NULL || mbs_offset == NULL || is_binary == NULL)
2368 {
2369 free(pattern);
2370 free(mbs_offset);
2371 free(is_binary);
2372 return REG_ESPACE;
2373 }
2374 pattern[csize] = L'\0'; /* sentinel */
2375 size = convert_mbs_to_wcs(pattern, cpattern, csize, mbs_offset, is_binary);
2376 pend = p + size;
2377 if (size < 0)
2378 {
2379 free(pattern);
2380 free(mbs_offset);
2381 free(is_binary);
2382 return REG_BADPAT;
2383 }
2384 #endif
2385
2386 #ifdef DEBUG
2387 DEBUG_PRINT1 ("\nCompiling pattern: ");
2388 if (debug)
2389 {
2390 unsigned debug_count;
2391
2392 for (debug_count = 0; debug_count < size; debug_count++)
2393 PUT_CHAR (pattern[debug_count]);
2394 putchar ('\n');
2395 }
2396 #endif /* DEBUG */
2397
2398 /* Initialize the compile stack. */
2399 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
2400 if (compile_stack.stack == NULL)
2401 {
2402 #ifdef WCHAR
2403 free(pattern);
2404 free(mbs_offset);
2405 free(is_binary);
2406 #endif
2407 return REG_ESPACE;
2408 }
2409
2410 compile_stack.size = INIT_COMPILE_STACK_SIZE;
2411 compile_stack.avail = 0;
2412
2413 /* Initialize the pattern buffer. */
2414 bufp->syntax = syntax;
2415 bufp->fastmap_accurate = 0;
2416 bufp->not_bol = bufp->not_eol = 0;
2417
2418 /* Set `used' to zero, so that if we return an error, the pattern
2419 printer (for debugging) will think there's no pattern. We reset it
2420 at the end. */
2421 bufp->used = 0;
2422
2423 /* Always count groups, whether or not bufp->no_sub is set. */
2424 bufp->re_nsub = 0;
2425
2426 #if !defined emacs && !defined SYNTAX_TABLE
2427 /* Initialize the syntax table. */
2428 init_syntax_once ();
2429 #endif
2430
2431 if (bufp->allocated == 0)
2432 {
2433 if (bufp->buffer)
2434 { /* If zero allocated, but buffer is non-null, try to realloc
2435 enough space. This loses if buffer's address is bogus, but
2436 that is the user's responsibility. */
2437 #ifdef WCHAR
2438 /* Free bufp->buffer and allocate an array for wchar_t pattern
2439 buffer. */
2440 free(bufp->buffer);
2441 COMPILED_BUFFER_VAR = TALLOC (INIT_BUF_SIZE/sizeof(UCHAR_T),
2442 UCHAR_T);
2443 #else
2444 RETALLOC (COMPILED_BUFFER_VAR, INIT_BUF_SIZE, UCHAR_T);
2445 #endif /* WCHAR */
2446 }
2447 else
2448 { /* Caller did not allocate a buffer. Do it for them. */
2449 COMPILED_BUFFER_VAR = TALLOC (INIT_BUF_SIZE / sizeof(UCHAR_T),
2450 UCHAR_T);
2451 }
2452
2453 if (!COMPILED_BUFFER_VAR) FREE_STACK_RETURN (REG_ESPACE);
2454 #ifdef WCHAR
2455 bufp->buffer = (char*)COMPILED_BUFFER_VAR;
2456 #endif /* WCHAR */
2457 bufp->allocated = INIT_BUF_SIZE;
2458 }
2459 #ifdef WCHAR
2460 else
2461 COMPILED_BUFFER_VAR = (UCHAR_T*) bufp->buffer;
2462 #endif
2463
2464 begalt = b = COMPILED_BUFFER_VAR;
2465
2466 /* Loop through the uncompiled pattern until we're at the end. */
2467 while (p != pend)
2468 {
2469 PATFETCH (c);
2470
2471 switch (c)
2472 {
2473 case '^':
2474 {
2475 if ( /* If at start of pattern, it's an operator. */
2476 p == pattern + 1
2477 /* If context independent, it's an operator. */
2478 || syntax & RE_CONTEXT_INDEP_ANCHORS
2479 /* Otherwise, depends on what's come before. */
2480 || PREFIX(at_begline_loc_p) (pattern, p, syntax))
2481 BUF_PUSH (begline);
2482 else
2483 goto normal_char;
2484 }
2485 break;
2486
2487
2488 case '$':
2489 {
2490 if ( /* If at end of pattern, it's an operator. */
2491 p == pend
2492 /* If context independent, it's an operator. */
2493 || syntax & RE_CONTEXT_INDEP_ANCHORS
2494 /* Otherwise, depends on what's next. */
2495 || PREFIX(at_endline_loc_p) (p, pend, syntax))
2496 BUF_PUSH (endline);
2497 else
2498 goto normal_char;
2499 }
2500 break;
2501
2502
2503 case '+':
2504 case '?':
2505 if ((syntax & RE_BK_PLUS_QM)
2506 || (syntax & RE_LIMITED_OPS))
2507 goto normal_char;
2508 /* Fall through. */
2509 handle_plus:
2510 case '*':
2511 /* If there is no previous pattern... */
2512 if (!laststart)
2513 {
2514 if (syntax & RE_CONTEXT_INVALID_OPS)
2515 FREE_STACK_RETURN (REG_BADRPT);
2516 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
2517 goto normal_char;
2518 }
2519
2520 {
2521 /* Are we optimizing this jump? */
2522 boolean keep_string_p = false;
2523
2524 /* 1 means zero (many) matches is allowed. */
2525 char zero_times_ok = 0, many_times_ok = 0;
2526
2527 /* If there is a sequence of repetition chars, collapse it
2528 down to just one (the right one). We can't combine
2529 interval operators with these because of, e.g., `a{2}*',
2530 which should only match an even number of `a's. */
2531
2532 for (;;)
2533 {
2534 zero_times_ok |= c != '+';
2535 many_times_ok |= c != '?';
2536
2537 if (p == pend)
2538 break;
2539
2540 PATFETCH (c);
2541
2542 if (c == '*'
2543 || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?')))
2544 ;
2545
2546 else if (syntax & RE_BK_PLUS_QM && c == '\\')
2547 {
2548 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2549
2550 PATFETCH (c1);
2551 if (!(c1 == '+' || c1 == '?'))
2552 {
2553 PATUNFETCH;
2554 PATUNFETCH;
2555 break;
2556 }
2557
2558 c = c1;
2559 }
2560 else
2561 {
2562 PATUNFETCH;
2563 break;
2564 }
2565
2566 /* If we get here, we found another repeat character. */
2567 }
2568
2569 /* Star, etc. applied to an empty pattern is equivalent
2570 to an empty pattern. */
2571 if (!laststart)
2572 break;
2573
2574 /* Now we know whether or not zero matches is allowed
2575 and also whether or not two or more matches is allowed. */
2576 if (many_times_ok)
2577 { /* More than one repetition is allowed, so put in at the
2578 end a backward relative jump from `b' to before the next
2579 jump we're going to put in below (which jumps from
2580 laststart to after this jump).
2581
2582 But if we are at the `*' in the exact sequence `.*\n',
2583 insert an unconditional jump backwards to the .,
2584 instead of the beginning of the loop. This way we only
2585 push a failure point once, instead of every time
2586 through the loop. */
2587 assert (p - 1 > pattern);
2588
2589 /* Allocate the space for the jump. */
2590 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
2591
2592 /* We know we are not at the first character of the pattern,
2593 because laststart was nonzero. And we've already
2594 incremented `p', by the way, to be the character after
2595 the `*'. Do we have to do something analogous here
2596 for null bytes, because of RE_DOT_NOT_NULL? */
2597 if (TRANSLATE (*(p - 2)) == TRANSLATE ('.')
2598 && zero_times_ok
2599 && p < pend && TRANSLATE (*p) == TRANSLATE ('\n')
2600 && !(syntax & RE_DOT_NEWLINE))
2601 { /* We have .*\n. */
2602 STORE_JUMP (jump, b, laststart);
2603 keep_string_p = true;
2604 }
2605 else
2606 /* Anything else. */
2607 STORE_JUMP (maybe_pop_jump, b, laststart -
2608 (1 + OFFSET_ADDRESS_SIZE));
2609
2610 /* We've added more stuff to the buffer. */
2611 b += 1 + OFFSET_ADDRESS_SIZE;
2612 }
2613
2614 /* On failure, jump from laststart to b + 3, which will be the
2615 end of the buffer after this jump is inserted. */
2616 /* ifdef WCHAR, 'b + 1 + OFFSET_ADDRESS_SIZE' instead of
2617 'b + 3'. */
2618 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
2619 INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump
2620 : on_failure_jump,
2621 laststart, b + 1 + OFFSET_ADDRESS_SIZE);
2622 pending_exact = 0;
2623 b += 1 + OFFSET_ADDRESS_SIZE;
2624
2625 if (!zero_times_ok)
2626 {
2627 /* At least one repetition is required, so insert a
2628 `dummy_failure_jump' before the initial
2629 `on_failure_jump' instruction of the loop. This
2630 effects a skip over that instruction the first time
2631 we hit that loop. */
2632 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
2633 INSERT_JUMP (dummy_failure_jump, laststart, laststart +
2634 2 + 2 * OFFSET_ADDRESS_SIZE);
2635 b += 1 + OFFSET_ADDRESS_SIZE;
2636 }
2637 }
2638 break;
2639
2640
2641 case '.':
2642 laststart = b;
2643 BUF_PUSH (anychar);
2644 break;
2645
2646
2647 case '[':
2648 {
2649 boolean had_char_class = false;
2650 #ifdef WCHAR
2651 CHAR_T range_start = 0xffffffff;
2652 #else
2653 unsigned int range_start = 0xffffffff;
2654 #endif
2655 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2656
2657 #ifdef WCHAR
2658 /* We assume a charset(_not) structure as a wchar_t array.
2659 charset[0] = (re_opcode_t) charset(_not)
2660 charset[1] = l (= length of char_classes)
2661 charset[2] = m (= length of collating_symbols)
2662 charset[3] = n (= length of equivalence_classes)
2663 charset[4] = o (= length of char_ranges)
2664 charset[5] = p (= length of chars)
2665
2666 charset[6] = char_class (wctype_t)
2667 charset[6+CHAR_CLASS_SIZE] = char_class (wctype_t)
2668 ...
2669 charset[l+5] = char_class (wctype_t)
2670
2671 charset[l+6] = collating_symbol (wchar_t)
2672 ...
2673 charset[l+m+5] = collating_symbol (wchar_t)
2674 ifdef _LIBC we use the index if
2675 _NL_COLLATE_SYMB_EXTRAMB instead of
2676 wchar_t string.
2677
2678 charset[l+m+6] = equivalence_classes (wchar_t)
2679 ...
2680 charset[l+m+n+5] = equivalence_classes (wchar_t)
2681 ifdef _LIBC we use the index in
2682 _NL_COLLATE_WEIGHT instead of
2683 wchar_t string.
2684
2685 charset[l+m+n+6] = range_start
2686 charset[l+m+n+7] = range_end
2687 ...
2688 charset[l+m+n+2o+4] = range_start
2689 charset[l+m+n+2o+5] = range_end
2690 ifdef _LIBC we use the value looked up
2691 in _NL_COLLATE_COLLSEQ instead of
2692 wchar_t character.
2693
2694 charset[l+m+n+2o+6] = char
2695 ...
2696 charset[l+m+n+2o+p+5] = char
2697
2698 */
2699
2700 /* We need at least 6 spaces: the opcode, the length of
2701 char_classes, the length of collating_symbols, the length of
2702 equivalence_classes, the length of char_ranges, the length of
2703 chars. */
2704 GET_BUFFER_SPACE (6);
2705
2706 /* Save b as laststart. And We use laststart as the pointer
2707 to the first element of the charset here.
2708 In other words, laststart[i] indicates charset[i]. */
2709 laststart = b;
2710
2711 /* We test `*p == '^' twice, instead of using an if
2712 statement, so we only need one BUF_PUSH. */
2713 BUF_PUSH (*p == '^' ? charset_not : charset);
2714 if (*p == '^')
2715 p++;
2716
2717 /* Push the length of char_classes, the length of
2718 collating_symbols, the length of equivalence_classes, the
2719 length of char_ranges and the length of chars. */
2720 BUF_PUSH_3 (0, 0, 0);
2721 BUF_PUSH_2 (0, 0);
2722
2723 /* Remember the first position in the bracket expression. */
2724 p1 = p;
2725
2726 /* charset_not matches newline according to a syntax bit. */
2727 if ((re_opcode_t) b[-6] == charset_not
2728 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2729 {
2730 BUF_PUSH('\n');
2731 laststart[5]++; /* Update the length of characters */
2732 }
2733
2734 /* Read in characters and ranges, setting map bits. */
2735 for (;;)
2736 {
2737 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2738
2739 PATFETCH (c);
2740
2741 /* \ might escape characters inside [...] and [^...]. */
2742 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2743 {
2744 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2745
2746 PATFETCH (c1);
2747 BUF_PUSH(c1);
2748 laststart[5]++; /* Update the length of chars */
2749 range_start = c1;
2750 continue;
2751 }
2752
2753 /* Could be the end of the bracket expression. If it's
2754 not (i.e., when the bracket expression is `[]' so
2755 far), the ']' character bit gets set way below. */
2756 if (c == ']' && p != p1 + 1)
2757 break;
2758
2759 /* Look ahead to see if it's a range when the last thing
2760 was a character class. */
2761 if (had_char_class && c == '-' && *p != ']')
2762 FREE_STACK_RETURN (REG_ERANGE);
2763
2764 /* Look ahead to see if it's a range when the last thing
2765 was a character: if this is a hyphen not at the
2766 beginning or the end of a list, then it's the range
2767 operator. */
2768 if (c == '-'
2769 && !(p - 2 >= pattern && p[-2] == '[')
2770 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
2771 && *p != ']')
2772 {
2773 reg_errcode_t ret;
2774 /* Allocate the space for range_start and range_end. */
2775 GET_BUFFER_SPACE (2);
2776 /* Update the pointer to indicate end of buffer. */
2777 b += 2;
2778 ret = wcs_compile_range (range_start, &p, pend, translate,
2779 syntax, b, laststart);
2780 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
2781 range_start = 0xffffffff;
2782 }
2783 else if (p[0] == '-' && p[1] != ']')
2784 { /* This handles ranges made up of characters only. */
2785 reg_errcode_t ret;
2786
2787 /* Move past the `-'. */
2788 PATFETCH (c1);
2789 /* Allocate the space for range_start and range_end. */
2790 GET_BUFFER_SPACE (2);
2791 /* Update the pointer to indicate end of buffer. */
2792 b += 2;
2793 ret = wcs_compile_range (c, &p, pend, translate, syntax, b,
2794 laststart);
2795 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
2796 range_start = 0xffffffff;
2797 }
2798
2799 /* See if we're at the beginning of a possible character
2800 class. */
2801 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
2802 { /* Leave room for the null. */
2803 char str[CHAR_CLASS_MAX_LENGTH + 1];
2804
2805 PATFETCH (c);
2806 c1 = 0;
2807
2808 /* If pattern is `[[:'. */
2809 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2810
2811 for (;;)
2812 {
2813 PATFETCH (c);
2814 if ((c == ':' && *p == ']') || p == pend)
2815 break;
2816 if (c1 < CHAR_CLASS_MAX_LENGTH)
2817 str[c1++] = c;
2818 else
2819 /* This is in any case an invalid class name. */
2820 str[0] = '\0';
2821 }
2822 str[c1] = '\0';
2823
2824 /* If isn't a word bracketed by `[:' and `:]':
2825 undo the ending character, the letters, and leave
2826 the leading `:' and `[' (but store them as character). */
2827 if (c == ':' && *p == ']')
2828 {
2829 wctype_t wt;
2830 uintptr_t alignedp;
2831
2832 /* Query the character class as wctype_t. */
2833 wt = IS_CHAR_CLASS (str);
2834 if (wt == 0)
2835 FREE_STACK_RETURN (REG_ECTYPE);
2836
2837 /* Throw away the ] at the end of the character
2838 class. */
2839 PATFETCH (c);
2840
2841 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2842
2843 /* Allocate the space for character class. */
2844 GET_BUFFER_SPACE(CHAR_CLASS_SIZE);
2845 /* Update the pointer to indicate end of buffer. */
2846 b += CHAR_CLASS_SIZE;
2847 /* Move data which follow character classes
2848 not to violate the data. */
2849 insert_space(CHAR_CLASS_SIZE,
2850 laststart + 6 + laststart[1],
2851 b - 1);
2852 alignedp = ((uintptr_t)(laststart + 6 + laststart[1])
2853 + __alignof__(wctype_t) - 1)
2854 & ~(uintptr_t)(__alignof__(wctype_t) - 1);
2855 /* Store the character class. */
2856 *((wctype_t*)alignedp) = wt;
2857 /* Update length of char_classes */
2858 laststart[1] += CHAR_CLASS_SIZE;
2859
2860 had_char_class = true;
2861 }
2862 else
2863 {
2864 c1++;
2865 while (c1--)
2866 PATUNFETCH;
2867 BUF_PUSH ('[');
2868 BUF_PUSH (':');
2869 laststart[5] += 2; /* Update the length of characters */
2870 range_start = ':';
2871 had_char_class = false;
2872 }
2873 }
2874 else if (syntax & RE_CHAR_CLASSES && c == '[' && (*p == '='
2875 || *p == '.'))
2876 {
2877 CHAR_T str[128]; /* Should be large enough. */
2878 CHAR_T delim = *p; /* '=' or '.' */
2879 # ifdef _LIBC
2880 uint32_t nrules =
2881 _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2882 # endif
2883 PATFETCH (c);
2884 c1 = 0;
2885
2886 /* If pattern is `[[=' or '[[.'. */
2887 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2888
2889 for (;;)
2890 {
2891 PATFETCH (c);
2892 if ((c == delim && *p == ']') || p == pend)
2893 break;
2894 if (c1 < sizeof (str) - 1)
2895 str[c1++] = c;
2896 else
2897 /* This is in any case an invalid class name. */
2898 str[0] = '\0';
2899 }
2900 str[c1] = '\0';
2901
2902 if (c == delim && *p == ']' && str[0] != '\0')
2903 {
2904 unsigned int i, offset;
2905 /* If we have no collation data we use the default
2906 collation in which each character is in a class
2907 by itself. It also means that ASCII is the
2908 character set and therefore we cannot have character
2909 with more than one byte in the multibyte
2910 representation. */
2911
2912 /* If not defined _LIBC, we push the name and
2913 `\0' for the sake of matching performance. */
2914 int datasize = c1 + 1;
2915
2916 # ifdef _LIBC
2917 int32_t idx = 0;
2918 if (nrules == 0)
2919 # endif
2920 {
2921 if (c1 != 1)
2922 FREE_STACK_RETURN (REG_ECOLLATE);
2923 }
2924 # ifdef _LIBC
2925 else
2926 {
2927 const int32_t *table;
2928 const int32_t *weights;
2929 const int32_t *extra;
2930 const int32_t *indirect;
2931 wint_t *cp;
2932
2933 /* This #include defines a local function! */
2934 # include <locale/weightwc.h>
2935
2936 if(delim == '=')
2937 {
2938 /* We push the index for equivalence class. */
2939 cp = (wint_t*)str;
2940
2941 table = (const int32_t *)
2942 _NL_CURRENT (LC_COLLATE,
2943 _NL_COLLATE_TABLEWC);
2944 weights = (const int32_t *)
2945 _NL_CURRENT (LC_COLLATE,
2946 _NL_COLLATE_WEIGHTWC);
2947 extra = (const int32_t *)
2948 _NL_CURRENT (LC_COLLATE,
2949 _NL_COLLATE_EXTRAWC);
2950 indirect = (const int32_t *)
2951 _NL_CURRENT (LC_COLLATE,
2952 _NL_COLLATE_INDIRECTWC);
2953
2954 idx = findidx ((const wint_t**)&cp);
2955 if (idx == 0 || cp < (wint_t*) str + c1)
2956 /* This is no valid character. */
2957 FREE_STACK_RETURN (REG_ECOLLATE);
2958
2959 str[0] = (wchar_t)idx;
2960 }
2961 else /* delim == '.' */
2962 {
2963 /* We push collation sequence value
2964 for collating symbol. */
2965 int32_t table_size;
2966 const int32_t *symb_table;
2967 const unsigned char *extra;
2968 int32_t idx;
2969 int32_t elem;
2970 int32_t second;
2971 int32_t hash;
2972 char char_str[c1];
2973
2974 /* We have to convert the name to a single-byte
2975 string. This is possible since the names
2976 consist of ASCII characters and the internal
2977 representation is UCS4. */
2978 for (i = 0; i < c1; ++i)
2979 char_str[i] = str[i];
2980
2981 table_size =
2982 _NL_CURRENT_WORD (LC_COLLATE,
2983 _NL_COLLATE_SYMB_HASH_SIZEMB);
2984 symb_table = (const int32_t *)
2985 _NL_CURRENT (LC_COLLATE,
2986 _NL_COLLATE_SYMB_TABLEMB);
2987 extra = (const unsigned char *)
2988 _NL_CURRENT (LC_COLLATE,
2989 _NL_COLLATE_SYMB_EXTRAMB);
2990
2991 /* Locate the character in the hashing table. */
2992 hash = elem_hash (char_str, c1);
2993
2994 idx = 0;
2995 elem = hash % table_size;
2996 second = hash % (table_size - 2);
2997 while (symb_table[2 * elem] != 0)
2998 {
2999 /* First compare the hashing value. */
3000 if (symb_table[2 * elem] == hash
3001 && c1 == extra[symb_table[2 * elem + 1]]
3002 && memcmp (char_str,
3003 &extra[symb_table[2 * elem + 1]
3004 + 1], c1) == 0)
3005 {
3006 /* Yep, this is the entry. */
3007 idx = symb_table[2 * elem + 1];
3008 idx += 1 + extra[idx];
3009 break;
3010 }
3011
3012 /* Next entry. */
3013 elem += second;
3014 }
3015
3016 if (symb_table[2 * elem] != 0)
3017 {
3018 /* Compute the index of the byte sequence
3019 in the table. */
3020 idx += 1 + extra[idx];
3021 /* Adjust for the alignment. */
3022 idx = (idx + 3) & ~3;
3023
3024 str[0] = (wchar_t) idx + 4;
3025 }
3026 else if (symb_table[2 * elem] == 0 && c1 == 1)
3027 {
3028 /* No valid character. Match it as a
3029 single byte character. */
3030 had_char_class = false;
3031 BUF_PUSH(str[0]);
3032 /* Update the length of characters */
3033 laststart[5]++;
3034 range_start = str[0];
3035
3036 /* Throw away the ] at the end of the
3037 collating symbol. */
3038 PATFETCH (c);
3039 /* exit from the switch block. */
3040 continue;
3041 }
3042 else
3043 FREE_STACK_RETURN (REG_ECOLLATE);
3044 }
3045 datasize = 1;
3046 }
3047 # endif
3048 /* Throw away the ] at the end of the equivalence
3049 class (or collating symbol). */
3050 PATFETCH (c);
3051
3052 /* Allocate the space for the equivalence class
3053 (or collating symbol) (and '\0' if needed). */
3054 GET_BUFFER_SPACE(datasize);
3055 /* Update the pointer to indicate end of buffer. */
3056 b += datasize;
3057
3058 if (delim == '=')
3059 { /* equivalence class */
3060 /* Calculate the offset of char_ranges,
3061 which is next to equivalence_classes. */
3062 offset = laststart[1] + laststart[2]
3063 + laststart[3] +6;
3064 /* Insert space. */
3065 insert_space(datasize, laststart + offset, b - 1);
3066
3067 /* Write the equivalence_class and \0. */
3068 for (i = 0 ; i < datasize ; i++)
3069 laststart[offset + i] = str[i];
3070
3071 /* Update the length of equivalence_classes. */
3072 laststart[3] += datasize;
3073 had_char_class = true;
3074 }
3075 else /* delim == '.' */
3076 { /* collating symbol */
3077 /* Calculate the offset of the equivalence_classes,
3078 which is next to collating_symbols. */
3079 offset = laststart[1] + laststart[2] + 6;
3080 /* Insert space and write the collating_symbol
3081 and \0. */
3082 insert_space(datasize, laststart + offset, b-1);
3083 for (i = 0 ; i < datasize ; i++)
3084 laststart[offset + i] = str[i];
3085
3086 /* In re_match_2_internal if range_start < -1, we
3087 assume -range_start is the offset of the
3088 collating symbol which is specified as
3089 the character of the range start. So we assign
3090 -(laststart[1] + laststart[2] + 6) to
3091 range_start. */
3092 range_start = -(laststart[1] + laststart[2] + 6);
3093 /* Update the length of collating_symbol. */
3094 laststart[2] += datasize;
3095 had_char_class = false;
3096 }
3097 }
3098 else
3099 {
3100 c1++;
3101 while (c1--)
3102 PATUNFETCH;
3103 BUF_PUSH ('[');
3104 BUF_PUSH (delim);
3105 laststart[5] += 2; /* Update the length of characters */
3106 range_start = delim;
3107 had_char_class = false;
3108 }
3109 }
3110 else
3111 {
3112 had_char_class = false;
3113 BUF_PUSH(c);
3114 laststart[5]++; /* Update the length of characters */
3115 range_start = c;
3116 }
3117 }
3118
3119 #else /* BYTE */
3120 /* Ensure that we have enough space to push a charset: the
3121 opcode, the length count, and the bitset; 34 bytes in all. */
3122 GET_BUFFER_SPACE (34);
3123
3124 laststart = b;
3125
3126 /* We test `*p == '^' twice, instead of using an if
3127 statement, so we only need one BUF_PUSH. */
3128 BUF_PUSH (*p == '^' ? charset_not : charset);
3129 if (*p == '^')
3130 p++;
3131
3132 /* Remember the first position in the bracket expression. */
3133 p1 = p;
3134
3135 /* Push the number of bytes in the bitmap. */
3136 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
3137
3138 /* Clear the whole map. */
3139 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH);
3140
3141 /* charset_not matches newline according to a syntax bit. */
3142 if ((re_opcode_t) b[-2] == charset_not
3143 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
3144 SET_LIST_BIT ('\n');
3145
3146 /* Read in characters and ranges, setting map bits. */
3147 for (;;)
3148 {
3149 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3150
3151 PATFETCH (c);
3152
3153 /* \ might escape characters inside [...] and [^...]. */
3154 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
3155 {
3156 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
3157
3158 PATFETCH (c1);
3159 SET_LIST_BIT (c1);
3160 range_start = c1;
3161 continue;
3162 }
3163
3164 /* Could be the end of the bracket expression. If it's
3165 not (i.e., when the bracket expression is `[]' so
3166 far), the ']' character bit gets set way below. */
3167 if (c == ']' && p != p1 + 1)
3168 break;
3169
3170 /* Look ahead to see if it's a range when the last thing
3171 was a character class. */
3172 if (had_char_class && c == '-' && *p != ']')
3173 FREE_STACK_RETURN (REG_ERANGE);
3174
3175 /* Look ahead to see if it's a range when the last thing
3176 was a character: if this is a hyphen not at the
3177 beginning or the end of a list, then it's the range
3178 operator. */
3179 if (c == '-'
3180 && !(p - 2 >= pattern && p[-2] == '[')
3181 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
3182 && *p != ']')
3183 {
3184 reg_errcode_t ret
3185 = byte_compile_range (range_start, &p, pend, translate,
3186 syntax, b);
3187 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
3188 range_start = 0xffffffff;
3189 }
3190
3191 else if (p[0] == '-' && p[1] != ']')
3192 { /* This handles ranges made up of characters only. */
3193 reg_errcode_t ret;
3194
3195 /* Move past the `-'. */
3196 PATFETCH (c1);
3197
3198 ret = byte_compile_range (c, &p, pend, translate, syntax, b);
3199 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
3200 range_start = 0xffffffff;
3201 }
3202
3203 /* See if we're at the beginning of a possible character
3204 class. */
3205
3206 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
3207 { /* Leave room for the null. */
3208 char str[CHAR_CLASS_MAX_LENGTH + 1];
3209
3210 PATFETCH (c);
3211 c1 = 0;
3212
3213 /* If pattern is `[[:'. */
3214 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3215
3216 for (;;)
3217 {
3218 PATFETCH (c);
3219 if ((c == ':' && *p == ']') || p == pend)
3220 break;
3221 if (c1 < CHAR_CLASS_MAX_LENGTH)
3222 str[c1++] = c;
3223 else
3224 /* This is in any case an invalid class name. */
3225 str[0] = '\0';
3226 }
3227 str[c1] = '\0';
3228
3229 /* If isn't a word bracketed by `[:' and `:]':
3230 undo the ending character, the letters, and leave
3231 the leading `:' and `[' (but set bits for them). */
3232 if (c == ':' && *p == ']')
3233 {
3234 # if defined _LIBC || WIDE_CHAR_SUPPORT
3235 boolean is_lower = STREQ (str, "lower");
3236 boolean is_upper = STREQ (str, "upper");
3237 wctype_t wt;
3238 int ch;
3239
3240 wt = IS_CHAR_CLASS (str);
3241 if (wt == 0)
3242 FREE_STACK_RETURN (REG_ECTYPE);
3243
3244 /* Throw away the ] at the end of the character
3245 class. */
3246 PATFETCH (c);
3247
3248 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3249
3250 for (ch = 0; ch < 1 << BYTEWIDTH; ++ch)
3251 {
3252 # ifdef _LIBC
3253 if (__iswctype (__btowc (ch), wt))
3254 SET_LIST_BIT (ch);
3255 # else
3256 if (iswctype (btowc (ch), wt))
3257 SET_LIST_BIT (ch);
3258 # endif
3259
3260 if (translate && (is_upper || is_lower)
3261 && (ISUPPER (ch) || ISLOWER (ch)))
3262 SET_LIST_BIT (ch);
3263 }
3264
3265 had_char_class = true;
3266 # else
3267 int ch;
3268 boolean is_alnum = STREQ (str, "alnum");
3269 boolean is_alpha = STREQ (str, "alpha");
3270 boolean is_blank = STREQ (str, "blank");
3271 boolean is_cntrl = STREQ (str, "cntrl");
3272 boolean is_digit = STREQ (str, "digit");
3273 boolean is_graph = STREQ (str, "graph");
3274 boolean is_lower = STREQ (str, "lower");
3275 boolean is_print = STREQ (str, "print");
3276 boolean is_punct = STREQ (str, "punct");
3277 boolean is_space = STREQ (str, "space");
3278 boolean is_upper = STREQ (str, "upper");
3279 boolean is_xdigit = STREQ (str, "xdigit");
3280
3281 if (!IS_CHAR_CLASS (str))
3282 FREE_STACK_RETURN (REG_ECTYPE);
3283
3284 /* Throw away the ] at the end of the character
3285 class. */
3286 PATFETCH (c);
3287
3288 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3289
3290 for (ch = 0; ch < 1 << BYTEWIDTH; ch++)
3291 {
3292 /* This was split into 3 if's to
3293 avoid an arbitrary limit in some compiler. */
3294 if ( (is_alnum && ISALNUM (ch))
3295 || (is_alpha && ISALPHA (ch))
3296 || (is_blank && ISBLANK (ch))
3297 || (is_cntrl && ISCNTRL (ch)))
3298 SET_LIST_BIT (ch);
3299 if ( (is_digit && ISDIGIT (ch))
3300 || (is_graph && ISGRAPH (ch))
3301 || (is_lower && ISLOWER (ch))
3302 || (is_print && ISPRINT (ch)))
3303 SET_LIST_BIT (ch);
3304 if ( (is_punct && ISPUNCT (ch))
3305 || (is_space && ISSPACE (ch))
3306 || (is_upper && ISUPPER (ch))
3307 || (is_xdigit && ISXDIGIT (ch)))
3308 SET_LIST_BIT (ch);
3309 if ( translate && (is_upper || is_lower)
3310 && (ISUPPER (ch) || ISLOWER (ch)))
3311 SET_LIST_BIT (ch);
3312 }
3313 had_char_class = true;
3314 # endif /* libc || wctype.h */
3315 }
3316 else
3317 {
3318 c1++;
3319 while (c1--)
3320 PATUNFETCH;
3321 SET_LIST_BIT ('[');
3322 SET_LIST_BIT (':');
3323 range_start = ':';
3324 had_char_class = false;
3325 }
3326 }
3327 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == '=')
3328 {
3329 unsigned char str[MB_LEN_MAX + 1];
3330 # ifdef _LIBC
3331 uint32_t nrules =
3332 _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3333 # endif
3334
3335 PATFETCH (c);
3336 c1 = 0;
3337
3338 /* If pattern is `[[='. */
3339 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3340
3341 for (;;)
3342 {
3343 PATFETCH (c);
3344 if ((c == '=' && *p == ']') || p == pend)
3345 break;
3346 if (c1 < MB_LEN_MAX)
3347 str[c1++] = c;
3348 else
3349 /* This is in any case an invalid class name. */
3350 str[0] = '\0';
3351 }
3352 str[c1] = '\0';
3353
3354 if (c == '=' && *p == ']' && str[0] != '\0')
3355 {
3356 /* If we have no collation data we use the default
3357 collation in which each character is in a class
3358 by itself. It also means that ASCII is the
3359 character set and therefore we cannot have character
3360 with more than one byte in the multibyte
3361 representation. */
3362 # ifdef _LIBC
3363 if (nrules == 0)
3364 # endif
3365 {
3366 if (c1 != 1)
3367 FREE_STACK_RETURN (REG_ECOLLATE);
3368
3369 /* Throw away the ] at the end of the equivalence
3370 class. */
3371 PATFETCH (c);
3372
3373 /* Set the bit for the character. */
3374 SET_LIST_BIT (str[0]);
3375 }
3376 # ifdef _LIBC
3377 else
3378 {
3379 /* Try to match the byte sequence in `str' against
3380 those known to the collate implementation.
3381 First find out whether the bytes in `str' are
3382 actually from exactly one character. */
3383 const int32_t *table;
3384 const unsigned char *weights;
3385 const unsigned char *extra;
3386 const int32_t *indirect;
3387 int32_t idx;
3388 const unsigned char *cp = str;
3389 int ch;
3390
3391 /* This #include defines a local function! */
3392 # include <locale/weight.h>
3393
3394 table = (const int32_t *)
3395 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3396 weights = (const unsigned char *)
3397 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3398 extra = (const unsigned char *)
3399 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3400 indirect = (const int32_t *)
3401 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3402
3403 idx = findidx (&cp);
3404 if (idx == 0 || cp < str + c1)
3405 /* This is no valid character. */
3406 FREE_STACK_RETURN (REG_ECOLLATE);
3407
3408 /* Throw away the ] at the end of the equivalence
3409 class. */
3410 PATFETCH (c);
3411
3412 /* Now we have to go through the whole table
3413 and find all characters which have the same
3414 first level weight.
3415
3416 XXX Note that this is not entirely correct.
3417 we would have to match multibyte sequences
3418 but this is not possible with the current
3419 implementation. */
3420 for (ch = 1; ch < 256; ++ch)
3421 /* XXX This test would have to be changed if we
3422 would allow matching multibyte sequences. */
3423 if (table[ch] > 0)
3424 {
3425 int32_t idx2 = table[ch];
3426 size_t len = weights[idx2];
3427
3428 /* Test whether the lengths match. */
3429 if (weights[idx] == len)
3430 {
3431 /* They do. Now compare the bytes of
3432 the weight. */
3433 size_t cnt = 0;
3434
3435 while (cnt < len
3436 && (weights[idx + 1 + cnt]
3437 == weights[idx2 + 1 + cnt]))
3438 ++cnt;
3439
3440 if (cnt == len)
3441 /* They match. Mark the character as
3442 acceptable. */
3443 SET_LIST_BIT (ch);
3444 }
3445 }
3446 }
3447 # endif
3448 had_char_class = true;
3449 }
3450 else
3451 {
3452 c1++;
3453 while (c1--)
3454 PATUNFETCH;
3455 SET_LIST_BIT ('[');
3456 SET_LIST_BIT ('=');
3457 range_start = '=';
3458 had_char_class = false;
3459 }
3460 }
3461 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == '.')
3462 {
3463 unsigned char str[128]; /* Should be large enough. */
3464 # ifdef _LIBC
3465 uint32_t nrules =
3466 _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3467 # endif
3468
3469 PATFETCH (c);
3470 c1 = 0;
3471
3472 /* If pattern is `[[.'. */
3473 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3474
3475 for (;;)
3476 {
3477 PATFETCH (c);
3478 if ((c == '.' && *p == ']') || p == pend)
3479 break;
3480 if (c1 < sizeof (str))
3481 str[c1++] = c;
3482 else
3483 /* This is in any case an invalid class name. */
3484 str[0] = '\0';
3485 }
3486 str[c1] = '\0';
3487
3488 if (c == '.' && *p == ']' && str[0] != '\0')
3489 {
3490 /* If we have no collation data we use the default
3491 collation in which each character is the name
3492 for its own class which contains only the one
3493 character. It also means that ASCII is the
3494 character set and therefore we cannot have character
3495 with more than one byte in the multibyte
3496 representation. */
3497 # ifdef _LIBC
3498 if (nrules == 0)
3499 # endif
3500 {
3501 if (c1 != 1)
3502 FREE_STACK_RETURN (REG_ECOLLATE);
3503
3504 /* Throw away the ] at the end of the collating
3505 symbol. */
3506 PATFETCH (c);
3507
3508 /* Set the bit for the character. */
3509 SET_LIST_BIT (str[0]);
3510 range_start = ((const unsigned char *) str)[0];
3511 }
3512 # ifdef _LIBC
3513 else
3514 {
3515 /* Try to match the byte sequence in `str' against
3516 those known to the collate implementation.
3517 First find out whether the bytes in `str' are
3518 actually from exactly one character. */
3519 int32_t table_size;
3520 const int32_t *symb_table;
3521 const unsigned char *extra;
3522 int32_t idx;
3523 int32_t elem;
3524 int32_t second;
3525 int32_t hash;
3526
3527 table_size =
3528 _NL_CURRENT_WORD (LC_COLLATE,
3529 _NL_COLLATE_SYMB_HASH_SIZEMB);
3530 symb_table = (const int32_t *)
3531 _NL_CURRENT (LC_COLLATE,
3532 _NL_COLLATE_SYMB_TABLEMB);
3533 extra = (const unsigned char *)
3534 _NL_CURRENT (LC_COLLATE,
3535 _NL_COLLATE_SYMB_EXTRAMB);
3536
3537 /* Locate the character in the hashing table. */
3538 hash = elem_hash (str, c1);
3539
3540 idx = 0;
3541 elem = hash % table_size;
3542 second = hash % (table_size - 2);
3543 while (symb_table[2 * elem] != 0)
3544 {
3545 /* First compare the hashing value. */
3546 if (symb_table[2 * elem] == hash
3547 && c1 == extra[symb_table[2 * elem + 1]]
3548 && memcmp (str,
3549 &extra[symb_table[2 * elem + 1]
3550 + 1],
3551 c1) == 0)
3552 {
3553 /* Yep, this is the entry. */
3554 idx = symb_table[2 * elem + 1];
3555 idx += 1 + extra[idx];
3556 break;
3557 }
3558
3559 /* Next entry. */
3560 elem += second;
3561 }
3562
3563 if (symb_table[2 * elem] == 0)
3564 /* This is no valid character. */
3565 FREE_STACK_RETURN (REG_ECOLLATE);
3566
3567 /* Throw away the ] at the end of the collating
3568 symbol. */
3569 PATFETCH (c);
3570
3571 /* Now add the multibyte character(s) we found
3572 to the accept list.
3573
3574 XXX Note that this is not entirely correct.
3575 we would have to match multibyte sequences
3576 but this is not possible with the current
3577 implementation. Also, we have to match
3578 collating symbols, which expand to more than
3579 one byte, as a whole and not allow the
3580 individual bytes. */
3581 c1 = extra[idx++];
3582 if (c1 == 1)
3583 range_start = extra[idx];
3584 while (c1-- > 0)
3585 {
3586 SET_LIST_BIT (extra[idx]);
3587 ++idx;
3588 }
3589 }
3590 # endif
3591 had_char_class = false;
3592 }
3593 else
3594 {
3595 c1++;
3596 while (c1--)
3597 PATUNFETCH;
3598 SET_LIST_BIT ('[');
3599 SET_LIST_BIT ('.');
3600 range_start = '.';
3601 had_char_class = false;
3602 }
3603 }
3604 else
3605 {
3606 had_char_class = false;
3607 SET_LIST_BIT (c);
3608 range_start = c;
3609 }
3610 }
3611
3612 /* Discard any (non)matching list bytes that are all 0 at the
3613 end of the map. Decrease the map-length byte too. */
3614 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
3615 b[-1]--;
3616 b += b[-1];
3617 #endif /* WCHAR */
3618 }
3619 break;
3620
3621
3622 case '(':
3623 if (syntax & RE_NO_BK_PARENS)
3624 goto handle_open;
3625 else
3626 goto normal_char;
3627
3628
3629 case ')':
3630 if (syntax & RE_NO_BK_PARENS)
3631 goto handle_close;
3632 else
3633 goto normal_char;
3634
3635
3636 case '\n':
3637 if (syntax & RE_NEWLINE_ALT)
3638 goto handle_alt;
3639 else
3640 goto normal_char;
3641
3642
3643 case '|':
3644 if (syntax & RE_NO_BK_VBAR)
3645 goto handle_alt;
3646 else
3647 goto normal_char;
3648
3649
3650 case '{':
3651 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
3652 goto handle_interval;
3653 else
3654 goto normal_char;
3655
3656
3657 case '\\':
3658 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
3659
3660 /* Do not translate the character after the \, so that we can
3661 distinguish, e.g., \B from \b, even if we normally would
3662 translate, e.g., B to b. */
3663 PATFETCH_RAW (c);
3664
3665 switch (c)
3666 {
3667 case '(':
3668 if (syntax & RE_NO_BK_PARENS)
3669 goto normal_backslash;
3670
3671 handle_open:
3672 bufp->re_nsub++;
3673 regnum++;
3674
3675 if (COMPILE_STACK_FULL)
3676 {
3677 RETALLOC (compile_stack.stack, compile_stack.size << 1,
3678 compile_stack_elt_t);
3679 if (compile_stack.stack == NULL) return REG_ESPACE;
3680
3681 compile_stack.size <<= 1;
3682 }
3683
3684 /* These are the values to restore when we hit end of this
3685 group. They are all relative offsets, so that if the
3686 whole pattern moves because of realloc, they will still
3687 be valid. */
3688 COMPILE_STACK_TOP.begalt_offset = begalt - COMPILED_BUFFER_VAR;
3689 COMPILE_STACK_TOP.fixup_alt_jump
3690 = fixup_alt_jump ? fixup_alt_jump - COMPILED_BUFFER_VAR + 1 : 0;
3691 COMPILE_STACK_TOP.laststart_offset = b - COMPILED_BUFFER_VAR;
3692 COMPILE_STACK_TOP.regnum = regnum;
3693
3694 /* We will eventually replace the 0 with the number of
3695 groups inner to this one. But do not push a
3696 start_memory for groups beyond the last one we can
3697 represent in the compiled pattern. */
3698 if (regnum <= MAX_REGNUM)
3699 {
3700 COMPILE_STACK_TOP.inner_group_offset = b
3701 - COMPILED_BUFFER_VAR + 2;
3702 BUF_PUSH_3 (start_memory, regnum, 0);
3703 }
3704
3705 compile_stack.avail++;
3706
3707 fixup_alt_jump = 0;
3708 laststart = 0;
3709 begalt = b;
3710 /* If we've reached MAX_REGNUM groups, then this open
3711 won't actually generate any code, so we'll have to
3712 clear pending_exact explicitly. */
3713 pending_exact = 0;
3714 break;
3715
3716
3717 case ')':
3718 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
3719
3720 if (COMPILE_STACK_EMPTY)
3721 {
3722 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3723 goto normal_backslash;
3724 else
3725 FREE_STACK_RETURN (REG_ERPAREN);
3726 }
3727
3728 handle_close:
3729 if (fixup_alt_jump)
3730 { /* Push a dummy failure point at the end of the
3731 alternative for a possible future
3732 `pop_failure_jump' to pop. See comments at
3733 `push_dummy_failure' in `re_match_2'. */
3734 BUF_PUSH (push_dummy_failure);
3735
3736 /* We allocated space for this jump when we assigned
3737 to `fixup_alt_jump', in the `handle_alt' case below. */
3738 STORE_JUMP (jump_past_alt, fixup_alt_jump, b - 1);
3739 }
3740
3741 /* See similar code for backslashed left paren above. */
3742 if (COMPILE_STACK_EMPTY)
3743 {
3744 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3745 goto normal_char;
3746 else
3747 FREE_STACK_RETURN (REG_ERPAREN);
3748 }
3749
3750 /* Since we just checked for an empty stack above, this
3751 ``can't happen''. */
3752 assert (compile_stack.avail != 0);
3753 {
3754 /* We don't just want to restore into `regnum', because
3755 later groups should continue to be numbered higher,
3756 as in `(ab)c(de)' -- the second group is #2. */
3757 regnum_t this_group_regnum;
3758
3759 compile_stack.avail--;
3760 begalt = COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.begalt_offset;
3761 fixup_alt_jump
3762 = COMPILE_STACK_TOP.fixup_alt_jump
3763 ? COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.fixup_alt_jump - 1
3764 : 0;
3765 laststart = COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.laststart_offset;
3766 this_group_regnum = COMPILE_STACK_TOP.regnum;
3767 /* If we've reached MAX_REGNUM groups, then this close
3768 won't actually generate any code, so we'll have to
3769 clear pending_exact explicitly. */
3770 pending_exact = 0;
3771
3772 /* We're at the end of the group, so now we know how many
3773 groups were inside this one. */
3774 if (this_group_regnum <= MAX_REGNUM)
3775 {
3776 UCHAR_T *inner_group_loc
3777 = COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.inner_group_offset;
3778
3779 *inner_group_loc = regnum - this_group_regnum;
3780 BUF_PUSH_3 (stop_memory, this_group_regnum,
3781 regnum - this_group_regnum);
3782 }
3783 }
3784 break;
3785
3786
3787 case '|': /* `\|'. */
3788 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
3789 goto normal_backslash;
3790 handle_alt:
3791 if (syntax & RE_LIMITED_OPS)
3792 goto normal_char;
3793
3794 /* Insert before the previous alternative a jump which
3795 jumps to this alternative if the former fails. */
3796 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
3797 INSERT_JUMP (on_failure_jump, begalt,
3798 b + 2 + 2 * OFFSET_ADDRESS_SIZE);
3799 pending_exact = 0;
3800 b += 1 + OFFSET_ADDRESS_SIZE;
3801
3802 /* The alternative before this one has a jump after it
3803 which gets executed if it gets matched. Adjust that
3804 jump so it will jump to this alternative's analogous
3805 jump (put in below, which in turn will jump to the next
3806 (if any) alternative's such jump, etc.). The last such
3807 jump jumps to the correct final destination. A picture:
3808 _____ _____
3809 | | | |
3810 | v | v
3811 a | b | c
3812
3813 If we are at `b', then fixup_alt_jump right now points to a
3814 three-byte space after `a'. We'll put in the jump, set
3815 fixup_alt_jump to right after `b', and leave behind three
3816 bytes which we'll fill in when we get to after `c'. */
3817
3818 if (fixup_alt_jump)
3819 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
3820
3821 /* Mark and leave space for a jump after this alternative,
3822 to be filled in later either by next alternative or
3823 when we know we're at the end of a series of alternatives. */
3824 fixup_alt_jump = b;
3825 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
3826 b += 1 + OFFSET_ADDRESS_SIZE;
3827
3828 laststart = 0;
3829 begalt = b;
3830 break;
3831
3832
3833 case '{':
3834 /* If \{ is a literal. */
3835 if (!(syntax & RE_INTERVALS)
3836 /* If we're at `\{' and it's not the open-interval
3837 operator. */
3838 || (syntax & RE_NO_BK_BRACES))
3839 goto normal_backslash;
3840
3841 handle_interval:
3842 {
3843 /* If got here, then the syntax allows intervals. */
3844
3845 /* At least (most) this many matches must be made. */
3846 int lower_bound = -1, upper_bound = -1;
3847
3848 /* Place in the uncompiled pattern (i.e., just after
3849 the '{') to go back to if the interval is invalid. */
3850 const CHAR_T *beg_interval = p;
3851
3852 if (p == pend)
3853 goto invalid_interval;
3854
3855 GET_UNSIGNED_NUMBER (lower_bound);
3856
3857 if (c == ',')
3858 {
3859 GET_UNSIGNED_NUMBER (upper_bound);
3860 if (upper_bound < 0)
3861 upper_bound = RE_DUP_MAX;
3862 }
3863 else
3864 /* Interval such as `{1}' => match exactly once. */
3865 upper_bound = lower_bound;
3866
3867 if (! (0 <= lower_bound && lower_bound <= upper_bound))
3868 goto invalid_interval;
3869
3870 if (!(syntax & RE_NO_BK_BRACES))
3871 {
3872 if (c != '\\' || p == pend)
3873 goto invalid_interval;
3874 PATFETCH (c);
3875 }
3876
3877 if (c != '}')
3878 goto invalid_interval;
3879
3880 /* If it's invalid to have no preceding re. */
3881 if (!laststart)
3882 {
3883 if (syntax & RE_CONTEXT_INVALID_OPS
3884 && !(syntax & RE_INVALID_INTERVAL_ORD))
3885 FREE_STACK_RETURN (REG_BADRPT);
3886 else if (syntax & RE_CONTEXT_INDEP_OPS)
3887 laststart = b;
3888 else
3889 goto unfetch_interval;
3890 }
3891
3892 /* We just parsed a valid interval. */
3893
3894 if (RE_DUP_MAX < upper_bound)
3895 FREE_STACK_RETURN (REG_BADBR);
3896
3897 /* If the upper bound is zero, don't want to succeed at
3898 all; jump from `laststart' to `b + 3', which will be
3899 the end of the buffer after we insert the jump. */
3900 /* ifdef WCHAR, 'b + 1 + OFFSET_ADDRESS_SIZE'
3901 instead of 'b + 3'. */
3902 if (upper_bound == 0)
3903 {
3904 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
3905 INSERT_JUMP (jump, laststart, b + 1
3906 + OFFSET_ADDRESS_SIZE);
3907 b += 1 + OFFSET_ADDRESS_SIZE;
3908 }
3909
3910 /* Otherwise, we have a nontrivial interval. When
3911 we're all done, the pattern will look like:
3912 set_number_at <jump count> <upper bound>
3913 set_number_at <succeed_n count> <lower bound>
3914 succeed_n <after jump addr> <succeed_n count>
3915 <body of loop>
3916 jump_n <succeed_n addr> <jump count>
3917 (The upper bound and `jump_n' are omitted if
3918 `upper_bound' is 1, though.) */
3919 else
3920 { /* If the upper bound is > 1, we need to insert
3921 more at the end of the loop. */
3922 unsigned nbytes = 2 + 4 * OFFSET_ADDRESS_SIZE +
3923 (upper_bound > 1) * (2 + 4 * OFFSET_ADDRESS_SIZE);
3924
3925 GET_BUFFER_SPACE (nbytes);
3926
3927 /* Initialize lower bound of the `succeed_n', even
3928 though it will be set during matching by its
3929 attendant `set_number_at' (inserted next),
3930 because `re_compile_fastmap' needs to know.
3931 Jump to the `jump_n' we might insert below. */
3932 INSERT_JUMP2 (succeed_n, laststart,
3933 b + 1 + 2 * OFFSET_ADDRESS_SIZE
3934 + (upper_bound > 1) * (1 + 2 * OFFSET_ADDRESS_SIZE)
3935 , lower_bound);
3936 b += 1 + 2 * OFFSET_ADDRESS_SIZE;
3937
3938 /* Code to initialize the lower bound. Insert
3939 before the `succeed_n'. The `5' is the last two
3940 bytes of this `set_number_at', plus 3 bytes of
3941 the following `succeed_n'. */
3942 /* ifdef WCHAR, The '1+2*OFFSET_ADDRESS_SIZE'
3943 is the 'set_number_at', plus '1+OFFSET_ADDRESS_SIZE'
3944 of the following `succeed_n'. */
3945 PREFIX(insert_op2) (set_number_at, laststart, 1
3946 + 2 * OFFSET_ADDRESS_SIZE, lower_bound, b);
3947 b += 1 + 2 * OFFSET_ADDRESS_SIZE;
3948
3949 if (upper_bound > 1)
3950 { /* More than one repetition is allowed, so
3951 append a backward jump to the `succeed_n'
3952 that starts this interval.
3953
3954 When we've reached this during matching,
3955 we'll have matched the interval once, so
3956 jump back only `upper_bound - 1' times. */
3957 STORE_JUMP2 (jump_n, b, laststart
3958 + 2 * OFFSET_ADDRESS_SIZE + 1,
3959 upper_bound - 1);
3960 b += 1 + 2 * OFFSET_ADDRESS_SIZE;
3961
3962 /* The location we want to set is the second
3963 parameter of the `jump_n'; that is `b-2' as
3964 an absolute address. `laststart' will be
3965 the `set_number_at' we're about to insert;
3966 `laststart+3' the number to set, the source
3967 for the relative address. But we are
3968 inserting into the middle of the pattern --
3969 so everything is getting moved up by 5.
3970 Conclusion: (b - 2) - (laststart + 3) + 5,
3971 i.e., b - laststart.
3972
3973 We insert this at the beginning of the loop
3974 so that if we fail during matching, we'll
3975 reinitialize the bounds. */
3976 PREFIX(insert_op2) (set_number_at, laststart,
3977 b - laststart,
3978 upper_bound - 1, b);
3979 b += 1 + 2 * OFFSET_ADDRESS_SIZE;
3980 }
3981 }
3982 pending_exact = 0;
3983 break;
3984
3985 invalid_interval:
3986 if (!(syntax & RE_INVALID_INTERVAL_ORD))
3987 FREE_STACK_RETURN (p == pend ? REG_EBRACE : REG_BADBR);
3988 unfetch_interval:
3989 /* Match the characters as literals. */
3990 p = beg_interval;
3991 c = '{';
3992 if (syntax & RE_NO_BK_BRACES)
3993 goto normal_char;
3994 else
3995 goto normal_backslash;
3996 }
3997
3998 #ifdef emacs
3999 /* There is no way to specify the before_dot and after_dot
4000 operators. rms says this is ok. --karl */
4001 case '=':
4002 BUF_PUSH (at_dot);
4003 break;
4004
4005 case 's':
4006 laststart = b;
4007 PATFETCH (c);
4008 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
4009 break;
4010
4011 case 'S':
4012 laststart = b;
4013 PATFETCH (c);
4014 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
4015 break;
4016 #endif /* emacs */
4017
4018
4019 case 'w':
4020 if (syntax & RE_NO_GNU_OPS)
4021 goto normal_char;
4022 laststart = b;
4023 BUF_PUSH (wordchar);
4024 break;
4025
4026
4027 case 'W':
4028 if (syntax & RE_NO_GNU_OPS)
4029 goto normal_char;
4030 laststart = b;
4031 BUF_PUSH (notwordchar);
4032 break;
4033
4034
4035 case '<':
4036 if (syntax & RE_NO_GNU_OPS)
4037 goto normal_char;
4038 BUF_PUSH (wordbeg);
4039 break;
4040
4041 case '>':
4042 if (syntax & RE_NO_GNU_OPS)
4043 goto normal_char;
4044 BUF_PUSH (wordend);
4045 break;
4046
4047 case 'b':
4048 if (syntax & RE_NO_GNU_OPS)
4049 goto normal_char;
4050 BUF_PUSH (wordbound);
4051 break;
4052
4053 case 'B':
4054 if (syntax & RE_NO_GNU_OPS)
4055 goto normal_char;
4056 BUF_PUSH (notwordbound);
4057 break;
4058
4059 case '`':
4060 if (syntax & RE_NO_GNU_OPS)
4061 goto normal_char;
4062 BUF_PUSH (begbuf);
4063 break;
4064
4065 case '\'':
4066 if (syntax & RE_NO_GNU_OPS)
4067 goto normal_char;
4068 BUF_PUSH (endbuf);
4069 break;
4070
4071 case '1': case '2': case '3': case '4': case '5':
4072 case '6': case '7': case '8': case '9':
4073 if (syntax & RE_NO_BK_REFS)
4074 goto normal_char;
4075
4076 c1 = c - '0';
4077
4078 if (c1 > regnum)
4079 FREE_STACK_RETURN (REG_ESUBREG);
4080
4081 /* Can't back reference to a subexpression if inside of it. */
4082 if (group_in_compile_stack (compile_stack, (regnum_t) c1))
4083 goto normal_char;
4084
4085 laststart = b;
4086 BUF_PUSH_2 (duplicate, c1);
4087 break;
4088
4089
4090 case '+':
4091 case '?':
4092 if (syntax & RE_BK_PLUS_QM)
4093 goto handle_plus;
4094 else
4095 goto normal_backslash;
4096
4097 default:
4098 normal_backslash:
4099 /* You might think it would be useful for \ to mean
4100 not to translate; but if we don't translate it
4101 it will never match anything. */
4102 c = TRANSLATE (c);
4103 goto normal_char;
4104 }
4105 break;
4106
4107
4108 default:
4109 /* Expects the character in `c'. */
4110 normal_char:
4111 /* If no exactn currently being built. */
4112 if (!pending_exact
4113 #ifdef WCHAR
4114 /* If last exactn handle binary(or character) and
4115 new exactn handle character(or binary). */
4116 || is_exactn_bin != is_binary[p - 1 - pattern]
4117 #endif /* WCHAR */
4118
4119 /* If last exactn not at current position. */
4120 || pending_exact + *pending_exact + 1 != b
4121
4122 /* We have only one byte following the exactn for the count. */
4123 || *pending_exact == (1 << BYTEWIDTH) - 1
4124
4125 /* If followed by a repetition operator. */
4126 || *p == '*' || *p == '^'
4127 || ((syntax & RE_BK_PLUS_QM)
4128 ? *p == '\\' && (p[1] == '+' || p[1] == '?')
4129 : (*p == '+' || *p == '?'))
4130 || ((syntax & RE_INTERVALS)
4131 && ((syntax & RE_NO_BK_BRACES)
4132 ? *p == '{'
4133 : (p[0] == '\\' && p[1] == '{'))))
4134 {
4135 /* Start building a new exactn. */
4136
4137 laststart = b;
4138
4139 #ifdef WCHAR
4140 /* Is this exactn binary data or character? */
4141 is_exactn_bin = is_binary[p - 1 - pattern];
4142 if (is_exactn_bin)
4143 BUF_PUSH_2 (exactn_bin, 0);
4144 else
4145 BUF_PUSH_2 (exactn, 0);
4146 #else
4147 BUF_PUSH_2 (exactn, 0);
4148 #endif /* WCHAR */
4149 pending_exact = b - 1;
4150 }
4151
4152 BUF_PUSH (c);
4153 (*pending_exact)++;
4154 break;
4155 } /* switch (c) */
4156 } /* while p != pend */
4157
4158
4159 /* Through the pattern now. */
4160
4161 if (fixup_alt_jump)
4162 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
4163
4164 if (!COMPILE_STACK_EMPTY)
4165 FREE_STACK_RETURN (REG_EPAREN);
4166
4167 /* If we don't want backtracking, force success
4168 the first time we reach the end of the compiled pattern. */
4169 if (syntax & RE_NO_POSIX_BACKTRACKING)
4170 BUF_PUSH (succeed);
4171
4172 #ifdef WCHAR
4173 free (pattern);
4174 free (mbs_offset);
4175 free (is_binary);
4176 #endif
4177 free (compile_stack.stack);
4178
4179 /* We have succeeded; set the length of the buffer. */
4180 #ifdef WCHAR
4181 bufp->used = (uintptr_t) b - (uintptr_t) COMPILED_BUFFER_VAR;
4182 #else
4183 bufp->used = b - bufp->buffer;
4184 #endif
4185
4186 #ifdef DEBUG
4187 if (debug)
4188 {
4189 DEBUG_PRINT1 ("\nCompiled pattern: \n");
4190 PREFIX(print_compiled_pattern) (bufp);
4191 }
4192 #endif /* DEBUG */
4193
4194 #ifndef MATCH_MAY_ALLOCATE
4195 /* Initialize the failure stack to the largest possible stack. This
4196 isn't necessary unless we're trying to avoid calling alloca in
4197 the search and match routines. */
4198 {
4199 int num_regs = bufp->re_nsub + 1;
4200
4201 /* Since DOUBLE_FAIL_STACK refuses to double only if the current size
4202 is strictly greater than re_max_failures, the largest possible stack
4203 is 2 * re_max_failures failure points. */
4204 if (fail_stack.size < (2 * re_max_failures * MAX_FAILURE_ITEMS))
4205 {
4206 fail_stack.size = (2 * re_max_failures * MAX_FAILURE_ITEMS);
4207
4208 # ifdef emacs
4209 if (! fail_stack.stack)
4210 fail_stack.stack
4211 = (PREFIX(fail_stack_elt_t) *) xmalloc (fail_stack.size
4212 * sizeof (PREFIX(fail_stack_elt_t)));
4213 else
4214 fail_stack.stack
4215 = (PREFIX(fail_stack_elt_t) *) xrealloc (fail_stack.stack,
4216 (fail_stack.size
4217 * sizeof (PREFIX(fail_stack_elt_t))));
4218 # else /* not emacs */
4219 if (! fail_stack.stack)
4220 fail_stack.stack
4221 = (PREFIX(fail_stack_elt_t) *) malloc (fail_stack.size
4222 * sizeof (PREFIX(fail_stack_elt_t)));
4223 else
4224 fail_stack.stack
4225 = (PREFIX(fail_stack_elt_t) *) realloc (fail_stack.stack,
4226 (fail_stack.size
4227 * sizeof (PREFIX(fail_stack_elt_t))));
4228 # endif /* not emacs */
4229 }
4230
4231 PREFIX(regex_grow_registers) (num_regs);
4232 }
4233 #endif /* not MATCH_MAY_ALLOCATE */
4234
4235 return REG_NOERROR;
4236 } /* regex_compile */
4237
4238 /* Subroutines for `regex_compile'. */
4239
4240 /* Store OP at LOC followed by two-byte integer parameter ARG. */
4241 /* ifdef WCHAR, integer parameter is 1 wchar_t. */
4242
4243 static void
4244 PREFIX(store_op1) (re_opcode_t op, UCHAR_T *loc, int arg)
4245 {
4246 *loc = (UCHAR_T) op;
4247 STORE_NUMBER (loc + 1, arg);
4248 }
4249
4250
4251 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
4252 /* ifdef WCHAR, integer parameter is 1 wchar_t. */
4253
4254 static void
4255 PREFIX(store_op2) (re_opcode_t op, UCHAR_T *loc, int arg1, int arg2)
4256 {
4257 *loc = (UCHAR_T) op;
4258 STORE_NUMBER (loc + 1, arg1);
4259 STORE_NUMBER (loc + 1 + OFFSET_ADDRESS_SIZE, arg2);
4260 }
4261
4262
4263 /* Copy the bytes from LOC to END to open up three bytes of space at LOC
4264 for OP followed by two-byte integer parameter ARG. */
4265 /* ifdef WCHAR, integer parameter is 1 wchar_t. */
4266
4267 static void
4268 PREFIX(insert_op1) (re_opcode_t op, UCHAR_T *loc, int arg, UCHAR_T *end)
4269 {
4270 register UCHAR_T *pfrom = end;
4271 register UCHAR_T *pto = end + 1 + OFFSET_ADDRESS_SIZE;
4272
4273 while (pfrom != loc)
4274 *--pto = *--pfrom;
4275
4276 PREFIX(store_op1) (op, loc, arg);
4277 }
4278
4279
4280 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
4281 /* ifdef WCHAR, integer parameter is 1 wchar_t. */
4282
4283 static void
4284 PREFIX(insert_op2) (re_opcode_t op, UCHAR_T *loc, int arg1,
4285 int arg2, UCHAR_T *end)
4286 {
4287 register UCHAR_T *pfrom = end;
4288 register UCHAR_T *pto = end + 1 + 2 * OFFSET_ADDRESS_SIZE;
4289
4290 while (pfrom != loc)
4291 *--pto = *--pfrom;
4292
4293 PREFIX(store_op2) (op, loc, arg1, arg2);
4294 }
4295
4296
4297 /* P points to just after a ^ in PATTERN. Return true if that ^ comes
4298 after an alternative or a begin-subexpression. We assume there is at
4299 least one character before the ^. */
4300
4301 static boolean
4302 PREFIX(at_begline_loc_p) (const CHAR_T *pattern, const CHAR_T *p,
4303 reg_syntax_t syntax)
4304 {
4305 const CHAR_T *prev = p - 2;
4306 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
4307
4308 return
4309 /* After a subexpression? */
4310 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
4311 /* After an alternative? */
4312 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash));
4313 }
4314
4315
4316 /* The dual of at_begline_loc_p. This one is for $. We assume there is
4317 at least one character after the $, i.e., `P < PEND'. */
4318
4319 static boolean
4320 PREFIX(at_endline_loc_p) (const CHAR_T *p, const CHAR_T *pend,
4321 reg_syntax_t syntax)
4322 {
4323 const CHAR_T *next = p;
4324 boolean next_backslash = *next == '\\';
4325 const CHAR_T *next_next = p + 1 < pend ? p + 1 : 0;
4326
4327 return
4328 /* Before a subexpression? */
4329 (syntax & RE_NO_BK_PARENS ? *next == ')'
4330 : next_backslash && next_next && *next_next == ')')
4331 /* Before an alternative? */
4332 || (syntax & RE_NO_BK_VBAR ? *next == '|'
4333 : next_backslash && next_next && *next_next == '|');
4334 }
4335
4336 #else /* not INSIDE_RECURSION */
4337
4338 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and
4339 false if it's not. */
4340
4341 static boolean
4342 group_in_compile_stack (compile_stack_type compile_stack, regnum_t regnum)
4343 {
4344 int this_element;
4345
4346 for (this_element = compile_stack.avail - 1;
4347 this_element >= 0;
4348 this_element--)
4349 if (compile_stack.stack[this_element].regnum == regnum)
4350 return true;
4351
4352 return false;
4353 }
4354 #endif /* not INSIDE_RECURSION */
4355
4356 #ifdef INSIDE_RECURSION
4357
4358 #ifdef WCHAR
4359 /* This insert space, which size is "num", into the pattern at "loc".
4360 "end" must point the end of the allocated buffer. */
4361 static void
4362 insert_space (int num, CHAR_T *loc, CHAR_T *end)
4363 {
4364 register CHAR_T *pto = end;
4365 register CHAR_T *pfrom = end - num;
4366
4367 while (pfrom >= loc)
4368 *pto-- = *pfrom--;
4369 }
4370 #endif /* WCHAR */
4371
4372 #ifdef WCHAR
4373 static reg_errcode_t
4374 wcs_compile_range (CHAR_T range_start_char, const CHAR_T **p_ptr,
4375 const CHAR_T *pend, RE_TRANSLATE_TYPE translate,
4376 reg_syntax_t syntax, CHAR_T *b, CHAR_T *char_set)
4377 {
4378 const CHAR_T *p = *p_ptr;
4379 CHAR_T range_start, range_end;
4380 reg_errcode_t ret;
4381 # ifdef _LIBC
4382 uint32_t nrules;
4383 uint32_t start_val, end_val;
4384 # endif
4385 if (p == pend)
4386 return REG_ERANGE;
4387
4388 # ifdef _LIBC
4389 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
4390 if (nrules != 0)
4391 {
4392 const char *collseq = (const char *) _NL_CURRENT(LC_COLLATE,
4393 _NL_COLLATE_COLLSEQWC);
4394 const unsigned char *extra = (const unsigned char *)
4395 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
4396
4397 if (range_start_char < -1)
4398 {
4399 /* range_start is a collating symbol. */
4400 int32_t *wextra;
4401 /* Retreive the index and get collation sequence value. */
4402 wextra = (int32_t*)(extra + char_set[-range_start_char]);
4403 start_val = wextra[1 + *wextra];
4404 }
4405 else
4406 start_val = collseq_table_lookup(collseq, TRANSLATE(range_start_char));
4407
4408 end_val = collseq_table_lookup (collseq, TRANSLATE (p[0]));
4409
4410 /* Report an error if the range is empty and the syntax prohibits
4411 this. */
4412 ret = ((syntax & RE_NO_EMPTY_RANGES)
4413 && (start_val > end_val))? REG_ERANGE : REG_NOERROR;
4414
4415 /* Insert space to the end of the char_ranges. */
4416 insert_space(2, b - char_set[5] - 2, b - 1);
4417 *(b - char_set[5] - 2) = (wchar_t)start_val;
4418 *(b - char_set[5] - 1) = (wchar_t)end_val;
4419 char_set[4]++; /* ranges_index */
4420 }
4421 else
4422 # endif
4423 {
4424 range_start = (range_start_char >= 0)? TRANSLATE (range_start_char):
4425 range_start_char;
4426 range_end = TRANSLATE (p[0]);
4427 /* Report an error if the range is empty and the syntax prohibits
4428 this. */
4429 ret = ((syntax & RE_NO_EMPTY_RANGES)
4430 && (range_start > range_end))? REG_ERANGE : REG_NOERROR;
4431
4432 /* Insert space to the end of the char_ranges. */
4433 insert_space(2, b - char_set[5] - 2, b - 1);
4434 *(b - char_set[5] - 2) = range_start;
4435 *(b - char_set[5] - 1) = range_end;
4436 char_set[4]++; /* ranges_index */
4437 }
4438 /* Have to increment the pointer into the pattern string, so the
4439 caller isn't still at the ending character. */
4440 (*p_ptr)++;
4441
4442 return ret;
4443 }
4444 #else /* BYTE */
4445 /* Read the ending character of a range (in a bracket expression) from the
4446 uncompiled pattern *P_PTR (which ends at PEND). We assume the
4447 starting character is in `P[-2]'. (`P[-1]' is the character `-'.)
4448 Then we set the translation of all bits between the starting and
4449 ending characters (inclusive) in the compiled pattern B.
4450
4451 Return an error code.
4452
4453 We use these short variable names so we can use the same macros as
4454 `regex_compile' itself. */
4455
4456 static reg_errcode_t
4457 byte_compile_range (unsigned int range_start_char, const char **p_ptr,
4458 const char *pend, RE_TRANSLATE_TYPE translate,
4459 reg_syntax_t syntax, unsigned char *b)
4460 {
4461 unsigned this_char;
4462 const char *p = *p_ptr;
4463 reg_errcode_t ret;
4464 # if _LIBC
4465 const unsigned char *collseq;
4466 unsigned int start_colseq;
4467 unsigned int end_colseq;
4468 # else
4469 unsigned end_char;
4470 # endif
4471
4472 if (p == pend)
4473 return REG_ERANGE;
4474
4475 /* Have to increment the pointer into the pattern string, so the
4476 caller isn't still at the ending character. */
4477 (*p_ptr)++;
4478
4479 /* Report an error if the range is empty and the syntax prohibits this. */
4480 ret = syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
4481
4482 # if _LIBC
4483 collseq = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
4484 _NL_COLLATE_COLLSEQMB);
4485
4486 start_colseq = collseq[(unsigned char) TRANSLATE (range_start_char)];
4487 end_colseq = collseq[(unsigned char) TRANSLATE (p[0])];
4488 for (this_char = 0; this_char <= (unsigned char) -1; ++this_char)
4489 {
4490 unsigned int this_colseq = collseq[(unsigned char) TRANSLATE (this_char)];
4491
4492 if (start_colseq <= this_colseq && this_colseq <= end_colseq)
4493 {
4494 SET_LIST_BIT (TRANSLATE (this_char));
4495 ret = REG_NOERROR;
4496 }
4497 }
4498 # else
4499 /* Here we see why `this_char' has to be larger than an `unsigned
4500 char' -- we would otherwise go into an infinite loop, since all
4501 characters <= 0xff. */
4502 range_start_char = TRANSLATE (range_start_char);
4503 /* TRANSLATE(p[0]) is casted to char (not unsigned char) in TRANSLATE,
4504 and some compilers cast it to int implicitly, so following for_loop
4505 may fall to (almost) infinite loop.
4506 e.g. If translate[p[0]] = 0xff, end_char may equals to 0xffffffff.
4507 To avoid this, we cast p[0] to unsigned int and truncate it. */
4508 end_char = ((unsigned)TRANSLATE(p[0]) & ((1 << BYTEWIDTH) - 1));
4509
4510 for (this_char = range_start_char; this_char <= end_char; ++this_char)
4511 {
4512 SET_LIST_BIT (TRANSLATE (this_char));
4513 ret = REG_NOERROR;
4514 }
4515 # endif
4516
4517 return ret;
4518 }
4519 #endif /* WCHAR */
4520
4521 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
4523 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
4524 characters can start a string that matches the pattern. This fastmap
4525 is used by re_search to skip quickly over impossible starting points.
4526
4527 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
4528 area as BUFP->fastmap.
4529
4530 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
4531 the pattern buffer.
4532
4533 Returns 0 if we succeed, -2 if an internal error. */
4534
4535 #ifdef WCHAR
4536 /* local function for re_compile_fastmap.
4537 truncate wchar_t character to char. */
4538 static unsigned char truncate_wchar (CHAR_T c);
4539
4540 static unsigned char
4541 truncate_wchar (CHAR_T c)
4542 {
4543 unsigned char buf[MB_CUR_MAX];
4544 mbstate_t state;
4545 int retval;
4546 memset (&state, '\0', sizeof (state));
4547 # ifdef _LIBC
4548 retval = __wcrtomb (buf, c, &state);
4549 # else
4550 retval = wcrtomb (buf, c, &state);
4551 # endif
4552 return retval > 0 ? buf[0] : (unsigned char) c;
4553 }
4554 #endif /* WCHAR */
4555
4556 static int
4557 PREFIX(re_compile_fastmap) (struct re_pattern_buffer *bufp)
4558 {
4559 int j, k;
4560 #ifdef MATCH_MAY_ALLOCATE
4561 PREFIX(fail_stack_type) fail_stack;
4562 #endif
4563 #ifndef REGEX_MALLOC
4564 char *destination;
4565 #endif
4566
4567 register char *fastmap = bufp->fastmap;
4568
4569 #ifdef WCHAR
4570 /* We need to cast pattern to (wchar_t*), because we casted this compiled
4571 pattern to (char*) in regex_compile. */
4572 UCHAR_T *pattern = (UCHAR_T*)bufp->buffer;
4573 register UCHAR_T *pend = (UCHAR_T*) (bufp->buffer + bufp->used);
4574 #else /* BYTE */
4575 UCHAR_T *pattern = bufp->buffer;
4576 register UCHAR_T *pend = pattern + bufp->used;
4577 #endif /* WCHAR */
4578 UCHAR_T *p = pattern;
4579
4580 #ifdef REL_ALLOC
4581 /* This holds the pointer to the failure stack, when
4582 it is allocated relocatably. */
4583 fail_stack_elt_t *failure_stack_ptr;
4584 #endif
4585
4586 /* Assume that each path through the pattern can be null until
4587 proven otherwise. We set this false at the bottom of switch
4588 statement, to which we get only if a particular path doesn't
4589 match the empty string. */
4590 boolean path_can_be_null = true;
4591
4592 /* We aren't doing a `succeed_n' to begin with. */
4593 boolean succeed_n_p = false;
4594
4595 assert (fastmap != NULL && p != NULL);
4596
4597 INIT_FAIL_STACK ();
4598 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */
4599 bufp->fastmap_accurate = 1; /* It will be when we're done. */
4600 bufp->can_be_null = 0;
4601
4602 while (1)
4603 {
4604 if (p == pend || *p == (UCHAR_T) succeed)
4605 {
4606 /* We have reached the (effective) end of pattern. */
4607 if (!FAIL_STACK_EMPTY ())
4608 {
4609 bufp->can_be_null |= path_can_be_null;
4610
4611 /* Reset for next path. */
4612 path_can_be_null = true;
4613
4614 p = fail_stack.stack[--fail_stack.avail].pointer;
4615
4616 continue;
4617 }
4618 else
4619 break;
4620 }
4621
4622 /* We should never be about to go beyond the end of the pattern. */
4623 assert (p < pend);
4624
4625 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
4626 {
4627
4628 /* I guess the idea here is to simply not bother with a fastmap
4629 if a backreference is used, since it's too hard to figure out
4630 the fastmap for the corresponding group. Setting
4631 `can_be_null' stops `re_search_2' from using the fastmap, so
4632 that is all we do. */
4633 case duplicate:
4634 bufp->can_be_null = 1;
4635 goto done;
4636
4637
4638 /* Following are the cases which match a character. These end
4639 with `break'. */
4640
4641 #ifdef WCHAR
4642 case exactn:
4643 fastmap[truncate_wchar(p[1])] = 1;
4644 break;
4645 #else /* BYTE */
4646 case exactn:
4647 fastmap[p[1]] = 1;
4648 break;
4649 #endif /* WCHAR */
4650 #ifdef MBS_SUPPORT
4651 case exactn_bin:
4652 fastmap[p[1]] = 1;
4653 break;
4654 #endif
4655
4656 #ifdef WCHAR
4657 /* It is hard to distinguish fastmap from (multi byte) characters
4658 which depends on current locale. */
4659 case charset:
4660 case charset_not:
4661 case wordchar:
4662 case notwordchar:
4663 bufp->can_be_null = 1;
4664 goto done;
4665 #else /* BYTE */
4666 case charset:
4667 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
4668 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
4669 fastmap[j] = 1;
4670 break;
4671
4672
4673 case charset_not:
4674 /* Chars beyond end of map must be allowed. */
4675 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
4676 fastmap[j] = 1;
4677
4678 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
4679 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
4680 fastmap[j] = 1;
4681 break;
4682
4683
4684 case wordchar:
4685 for (j = 0; j < (1 << BYTEWIDTH); j++)
4686 if (SYNTAX (j) == Sword)
4687 fastmap[j] = 1;
4688 break;
4689
4690
4691 case notwordchar:
4692 for (j = 0; j < (1 << BYTEWIDTH); j++)
4693 if (SYNTAX (j) != Sword)
4694 fastmap[j] = 1;
4695 break;
4696 #endif /* WCHAR */
4697
4698 case anychar:
4699 {
4700 int fastmap_newline = fastmap['\n'];
4701
4702 /* `.' matches anything ... */
4703 for (j = 0; j < (1 << BYTEWIDTH); j++)
4704 fastmap[j] = 1;
4705
4706 /* ... except perhaps newline. */
4707 if (!(bufp->syntax & RE_DOT_NEWLINE))
4708 fastmap['\n'] = fastmap_newline;
4709
4710 /* Return if we have already set `can_be_null'; if we have,
4711 then the fastmap is irrelevant. Something's wrong here. */
4712 else if (bufp->can_be_null)
4713 goto done;
4714
4715 /* Otherwise, have to check alternative paths. */
4716 break;
4717 }
4718
4719 #ifdef emacs
4720 case syntaxspec:
4721 k = *p++;
4722 for (j = 0; j < (1 << BYTEWIDTH); j++)
4723 if (SYNTAX (j) == (enum syntaxcode) k)
4724 fastmap[j] = 1;
4725 break;
4726
4727
4728 case notsyntaxspec:
4729 k = *p++;
4730 for (j = 0; j < (1 << BYTEWIDTH); j++)
4731 if (SYNTAX (j) != (enum syntaxcode) k)
4732 fastmap[j] = 1;
4733 break;
4734
4735
4736 /* All cases after this match the empty string. These end with
4737 `continue'. */
4738
4739
4740 case before_dot:
4741 case at_dot:
4742 case after_dot:
4743 continue;
4744 #endif /* emacs */
4745
4746
4747 case no_op:
4748 case begline:
4749 case endline:
4750 case begbuf:
4751 case endbuf:
4752 case wordbound:
4753 case notwordbound:
4754 case wordbeg:
4755 case wordend:
4756 case push_dummy_failure:
4757 continue;
4758
4759
4760 case jump_n:
4761 case pop_failure_jump:
4762 case maybe_pop_jump:
4763 case jump:
4764 case jump_past_alt:
4765 case dummy_failure_jump:
4766 EXTRACT_NUMBER_AND_INCR (j, p);
4767 p += j;
4768 if (j > 0)
4769 continue;
4770
4771 /* Jump backward implies we just went through the body of a
4772 loop and matched nothing. Opcode jumped to should be
4773 `on_failure_jump' or `succeed_n'. Just treat it like an
4774 ordinary jump. For a * loop, it has pushed its failure
4775 point already; if so, discard that as redundant. */
4776 if ((re_opcode_t) *p != on_failure_jump
4777 && (re_opcode_t) *p != succeed_n)
4778 continue;
4779
4780 p++;
4781 EXTRACT_NUMBER_AND_INCR (j, p);
4782 p += j;
4783
4784 /* If what's on the stack is where we are now, pop it. */
4785 if (!FAIL_STACK_EMPTY ()
4786 && fail_stack.stack[fail_stack.avail - 1].pointer == p)
4787 fail_stack.avail--;
4788
4789 continue;
4790
4791
4792 case on_failure_jump:
4793 case on_failure_keep_string_jump:
4794 handle_on_failure_jump:
4795 EXTRACT_NUMBER_AND_INCR (j, p);
4796
4797 /* For some patterns, e.g., `(a?)?', `p+j' here points to the
4798 end of the pattern. We don't want to push such a point,
4799 since when we restore it above, entering the switch will
4800 increment `p' past the end of the pattern. We don't need
4801 to push such a point since we obviously won't find any more
4802 fastmap entries beyond `pend'. Such a pattern can match
4803 the null string, though. */
4804 if (p + j < pend)
4805 {
4806 if (!PUSH_PATTERN_OP (p + j, fail_stack))
4807 {
4808 RESET_FAIL_STACK ();
4809 return -2;
4810 }
4811 }
4812 else
4813 bufp->can_be_null = 1;
4814
4815 if (succeed_n_p)
4816 {
4817 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */
4818 succeed_n_p = false;
4819 }
4820
4821 continue;
4822
4823
4824 case succeed_n:
4825 /* Get to the number of times to succeed. */
4826 p += OFFSET_ADDRESS_SIZE;
4827
4828 /* Increment p past the n for when k != 0. */
4829 EXTRACT_NUMBER_AND_INCR (k, p);
4830 if (k == 0)
4831 {
4832 p -= 2 * OFFSET_ADDRESS_SIZE;
4833 succeed_n_p = true; /* Spaghetti code alert. */
4834 goto handle_on_failure_jump;
4835 }
4836 continue;
4837
4838
4839 case set_number_at:
4840 p += 2 * OFFSET_ADDRESS_SIZE;
4841 continue;
4842
4843
4844 case start_memory:
4845 case stop_memory:
4846 p += 2;
4847 continue;
4848
4849
4850 default:
4851 abort (); /* We have listed all the cases. */
4852 } /* switch *p++ */
4853
4854 /* Getting here means we have found the possible starting
4855 characters for one path of the pattern -- and that the empty
4856 string does not match. We need not follow this path further.
4857 Instead, look at the next alternative (remembered on the
4858 stack), or quit if no more. The test at the top of the loop
4859 does these things. */
4860 path_can_be_null = false;
4861 p = pend;
4862 } /* while p */
4863
4864 /* Set `can_be_null' for the last path (also the first path, if the
4865 pattern is empty). */
4866 bufp->can_be_null |= path_can_be_null;
4867
4868 done:
4869 RESET_FAIL_STACK ();
4870 return 0;
4871 }
4872
4873 #else /* not INSIDE_RECURSION */
4874
/* Public entry point: build the fastmap for BUFP.  When multibyte
   support is compiled in and the current locale is not single-byte,
   the wide-character implementation is used; otherwise the byte
   implementation.  The chosen implementation's result is returned.  */
int
re_compile_fastmap (struct re_pattern_buffer *bufp)
{
# ifdef MBS_SUPPORT
  if (MB_CUR_MAX != 1)
    return wcs_re_compile_fastmap(bufp);
# endif

  return byte_re_compile_fastmap(bufp);
} /* re_compile_fastmap */
4885 #ifdef _LIBC
4886 weak_alias (__re_compile_fastmap, re_compile_fastmap)
4887 #endif
4888
4889
4891 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
4892 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
4893 this memory for recording register information. STARTS and ENDS
4894 must be allocated using the malloc library routine, and must each
4895 be at least NUM_REGS * sizeof (regoff_t) bytes long.
4896
4897 If NUM_REGS == 0, then subsequent matches should allocate their own
4898 register data.
4899
4900 Unless this function is called, the first search or match using
4901 PATTERN_BUFFER will allocate its own register data, without
4902 freeing the old data. */
4903
4904 void
4905 re_set_registers (struct re_pattern_buffer *bufp,
4906 struct re_registers *regs, unsigned num_regs,
4907 regoff_t *starts, regoff_t *ends)
4908 {
4909 if (num_regs)
4910 {
4911 bufp->regs_allocated = REGS_REALLOCATE;
4912 regs->num_regs = num_regs;
4913 regs->start = starts;
4914 regs->end = ends;
4915 }
4916 else
4917 {
4918 bufp->regs_allocated = REGS_UNALLOCATED;
4919 regs->num_regs = 0;
4920 regs->start = regs->end = (regoff_t *) 0;
4921 }
4922 }
4923 #ifdef _LIBC
4924 weak_alias (__re_set_registers, re_set_registers)
4925 #endif
4926
4927 /* Searching routines. */
4929
/* Single-string convenience form of re_search_2.  STRING (of length
   SIZE) is passed as the second string with an empty first string,
   and the stop position is the end of STRING, so the caller cannot
   say where to stop matching.  */

int
re_search (struct re_pattern_buffer *bufp, const char *string, int size,
	   int startpos, int range, struct re_registers *regs)
{
  const char *no_first_string = NULL;
  const int stop = size;

  return re_search_2 (bufp, no_first_string, 0, string, size, startpos,
		      range, regs, stop);
}
4940 #ifdef _LIBC
4941 weak_alias (__re_search, re_search)
4942 #endif
4943
4944
/* Search for the compiled pattern in BUFP->buffer within the virtual
   concatenation of STRING1 and STRING2, whose lengths are SIZE1 and
   SIZE2.  A match is attempted first at index STARTPOS, then at
   STARTPOS + 1, and so on.

   RANGE is how far to scan while trying to match.  RANGE = 0 means
   try only at STARTPOS; in general, the last start tried is
   STARTPOS + RANGE.

   In REGS, return the indices of the virtual concatenation of STRING1
   and STRING2 that matched the entire BUFP->buffer and its contained
   subexpressions.

   Do not consider matching one past the index STOP in the virtual
   concatenation of STRING1 and STRING2.

   Returns the position in the strings at which the match was found,
   -1 if there is no match, or -2 on error (such as failure stack
   overflow).

   This entry point only selects the implementation: the
   wide-character one when multibyte support is compiled in and the
   current locale is not single-byte, otherwise the byte one.  */

int
re_search_2 (struct re_pattern_buffer *bufp, const char *string1, int size1,
	     const char *string2, int size2, int startpos, int range,
	     struct re_registers *regs, int stop)
{
# ifdef MBS_SUPPORT
  if (MB_CUR_MAX != 1)
    return wcs_re_search_2 (bufp, string1, size1, string2, size2, startpos,
			    range, regs, stop);
# endif

  return byte_re_search_2 (bufp, string1, size1, string2, size2, startpos,
			   range, regs, stop);
} /* re_search_2 */
4980 #ifdef _LIBC
4981 weak_alias (__re_search_2, re_search_2)
4982 #endif
4983
4984 #endif /* not INSIDE_RECURSION */
4985
4986 #ifdef INSIDE_RECURSION
4987
/* FREE_VAR (var): release the storage VAR points to and reset VAR to
   NULL.  When the matcher may allocate, the storage is released with
   REGEX_FREE and only if VAR is non-null; otherwise it is released
   with free.  VAR is set to NULL in either case.

   The expansion is wrapped in `do { ... } while (0)' so that
   `FREE_VAR (x);' is exactly one statement.  Without the wrapper the
   macro expands to two statements, which cannot be followed by `else'
   and leaves the `var = NULL' part outside of any unbraced
   `if'/`else' arm the macro is used in.  */
#ifdef MATCH_MAY_ALLOCATE
# define FREE_VAR(var)							\
  do {									\
    if (var) REGEX_FREE (var);						\
    var = NULL;								\
  } while (0)
#else
# define FREE_VAR(var)							\
  do {									\
    free (var);								\
    var = NULL;								\
  } while (0)
#endif

#ifdef WCHAR
/* Sizes above this many bytes are obtained with TALLOC and released
   with free; smaller ones use REGEX_TALLOC and FREE_VAR.  */
# define MAX_ALLOCA_SIZE	2000

/* Release the wide-character copies of the two search strings and
   their offset tables.  For each string the release method is chosen
   by comparing its size (size1 / size2) with MAX_ALLOCA_SIZE, which
   mirrors how the buffers were obtained.  */
# define FREE_WCS_BUFFERS() \
  do {									\
    if (size1 > MAX_ALLOCA_SIZE)					\
      {									\
	free (wcs_string1);						\
	free (mbs_offset1);						\
      }									\
    else								\
      {									\
	FREE_VAR (wcs_string1);						\
	FREE_VAR (mbs_offset1);						\
      }									\
    if (size2 > MAX_ALLOCA_SIZE)					\
      {									\
	free (wcs_string2);						\
	free (mbs_offset2);						\
      }									\
    else								\
      {									\
	FREE_VAR (wcs_string2);						\
	FREE_VAR (mbs_offset2);						\
      }									\
  } while (0)

#endif
5022
5023
5024 static int
5025 PREFIX(re_search_2) (struct re_pattern_buffer *bufp, const char *string1,
5026 int size1, const char *string2, int size2,
5027 int startpos, int range,
5028 struct re_registers *regs, int stop)
5029 {
5030 int val;
5031 register char *fastmap = bufp->fastmap;
5032 register RE_TRANSLATE_TYPE translate = bufp->translate;
5033 int total_size = size1 + size2;
5034 int endpos = startpos + range;
5035 #ifdef WCHAR
5036 /* We need wchar_t* buffers correspond to cstring1, cstring2. */
5037 wchar_t *wcs_string1 = NULL, *wcs_string2 = NULL;
5038 /* We need the size of wchar_t buffers correspond to csize1, csize2. */
5039 int wcs_size1 = 0, wcs_size2 = 0;
5040 /* offset buffer for optimizatoin. See convert_mbs_to_wc. */
5041 int *mbs_offset1 = NULL, *mbs_offset2 = NULL;
5042 /* They hold whether each wchar_t is binary data or not. */
5043 char *is_binary = NULL;
5044 #endif /* WCHAR */
5045
5046 /* Check for out-of-range STARTPOS. */
5047 if (startpos < 0 || startpos > total_size)
5048 return -1;
5049
5050 /* Fix up RANGE if it might eventually take us outside
5051 the virtual concatenation of STRING1 and STRING2.
5052 Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */
5053 if (endpos < 0)
5054 range = 0 - startpos;
5055 else if (endpos > total_size)
5056 range = total_size - startpos;
5057
5058 /* If the search isn't to be a backwards one, don't waste time in a
5059 search for a pattern that must be anchored. */
5060 if (bufp->used > 0 && range > 0
5061 && ((re_opcode_t) bufp->buffer[0] == begbuf
5062 /* `begline' is like `begbuf' if it cannot match at newlines. */
5063 || ((re_opcode_t) bufp->buffer[0] == begline
5064 && !bufp->newline_anchor)))
5065 {
5066 if (startpos > 0)
5067 return -1;
5068 else
5069 range = 1;
5070 }
5071
5072 #ifdef emacs
5073 /* In a forward search for something that starts with \=.
5074 don't keep searching past point. */
5075 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
5076 {
5077 range = PT - startpos;
5078 if (range <= 0)
5079 return -1;
5080 }
5081 #endif /* emacs */
5082
5083 /* Update the fastmap now if not correct already. */
5084 if (fastmap && !bufp->fastmap_accurate)
5085 if (re_compile_fastmap (bufp) == -2)
5086 return -2;
5087
5088 #ifdef WCHAR
5089 /* Allocate wchar_t array for wcs_string1 and wcs_string2 and
5090 fill them with converted string. */
5091 if (size1 != 0)
5092 {
5093 if (size1 > MAX_ALLOCA_SIZE)
5094 {
5095 wcs_string1 = TALLOC (size1 + 1, CHAR_T);
5096 mbs_offset1 = TALLOC (size1 + 1, int);
5097 is_binary = TALLOC (size1 + 1, char);
5098 }
5099 else
5100 {
5101 wcs_string1 = REGEX_TALLOC (size1 + 1, CHAR_T);
5102 mbs_offset1 = REGEX_TALLOC (size1 + 1, int);
5103 is_binary = REGEX_TALLOC (size1 + 1, char);
5104 }
5105 if (!wcs_string1 || !mbs_offset1 || !is_binary)
5106 {
5107 if (size1 > MAX_ALLOCA_SIZE)
5108 {
5109 free (wcs_string1);
5110 free (mbs_offset1);
5111 free (is_binary);
5112 }
5113 else
5114 {
5115 FREE_VAR (wcs_string1);
5116 FREE_VAR (mbs_offset1);
5117 FREE_VAR (is_binary);
5118 }
5119 return -2;
5120 }
5121 wcs_size1 = convert_mbs_to_wcs(wcs_string1, string1, size1,
5122 mbs_offset1, is_binary);
5123 wcs_string1[wcs_size1] = L'\0'; /* for a sentinel */
5124 if (size1 > MAX_ALLOCA_SIZE)
5125 free (is_binary);
5126 else
5127 FREE_VAR (is_binary);
5128 }
5129 if (size2 != 0)
5130 {
5131 if (size2 > MAX_ALLOCA_SIZE)
5132 {
5133 wcs_string2 = TALLOC (size2 + 1, CHAR_T);
5134 mbs_offset2 = TALLOC (size2 + 1, int);
5135 is_binary = TALLOC (size2 + 1, char);
5136 }
5137 else
5138 {
5139 wcs_string2 = REGEX_TALLOC (size2 + 1, CHAR_T);
5140 mbs_offset2 = REGEX_TALLOC (size2 + 1, int);
5141 is_binary = REGEX_TALLOC (size2 + 1, char);
5142 }
5143 if (!wcs_string2 || !mbs_offset2 || !is_binary)
5144 {
5145 FREE_WCS_BUFFERS ();
5146 if (size2 > MAX_ALLOCA_SIZE)
5147 free (is_binary);
5148 else
5149 FREE_VAR (is_binary);
5150 return -2;
5151 }
5152 wcs_size2 = convert_mbs_to_wcs(wcs_string2, string2, size2,
5153 mbs_offset2, is_binary);
5154 wcs_string2[wcs_size2] = L'\0'; /* for a sentinel */
5155 if (size2 > MAX_ALLOCA_SIZE)
5156 free (is_binary);
5157 else
5158 FREE_VAR (is_binary);
5159 }
5160 #endif /* WCHAR */
5161
5162
5163 /* Loop through the string, looking for a place to start matching. */
5164 for (;;)
5165 {
5166 /* If a fastmap is supplied, skip quickly over characters that
5167 cannot be the start of a match. If the pattern can match the
5168 null string, however, we don't need to skip characters; we want
5169 the first null string. */
5170 if (fastmap && startpos < total_size && !bufp->can_be_null)
5171 {
5172 if (range > 0) /* Searching forwards. */
5173 {
5174 register const char *d;
5175 register int lim = 0;
5176 int irange = range;
5177
5178 if (startpos < size1 && startpos + range >= size1)
5179 lim = range - (size1 - startpos);
5180
5181 d = (startpos >= size1 ? string2 - size1 : string1) + startpos;
5182
5183 /* Written out as an if-else to avoid testing `translate'
5184 inside the loop. */
5185 if (translate)
5186 while (range > lim
5187 && !fastmap[(unsigned char)
5188 translate[(unsigned char) *d++]])
5189 range--;
5190 else
5191 while (range > lim && !fastmap[(unsigned char) *d++])
5192 range--;
5193
5194 startpos += irange - range;
5195 }
5196 else /* Searching backwards. */
5197 {
5198 register CHAR_T c = (size1 == 0 || startpos >= size1
5199 ? string2[startpos - size1]
5200 : string1[startpos]);
5201
5202 if (!fastmap[(unsigned char) TRANSLATE (c)])
5203 goto advance;
5204 }
5205 }
5206
5207 /* If can't match the null string, and that's all we have left, fail. */
5208 if (range >= 0 && startpos == total_size && fastmap
5209 && !bufp->can_be_null)
5210 {
5211 #ifdef WCHAR
5212 FREE_WCS_BUFFERS ();
5213 #endif
5214 return -1;
5215 }
5216
5217 #ifdef WCHAR
5218 val = wcs_re_match_2_internal (bufp, string1, size1, string2,
5219 size2, startpos, regs, stop,
5220 wcs_string1, wcs_size1,
5221 wcs_string2, wcs_size2,
5222 mbs_offset1, mbs_offset2);
5223 #else /* BYTE */
5224 val = byte_re_match_2_internal (bufp, string1, size1, string2,
5225 size2, startpos, regs, stop);
5226 #endif /* BYTE */
5227
5228 #ifndef REGEX_MALLOC
5229 # ifdef C_ALLOCA
5230 alloca (0);
5231 # endif
5232 #endif
5233
5234 if (val >= 0)
5235 {
5236 #ifdef WCHAR
5237 FREE_WCS_BUFFERS ();
5238 #endif
5239 return startpos;
5240 }
5241
5242 if (val == -2)
5243 {
5244 #ifdef WCHAR
5245 FREE_WCS_BUFFERS ();
5246 #endif
5247 return -2;
5248 }
5249
5250 advance:
5251 if (!range)
5252 break;
5253 else if (range > 0)
5254 {
5255 range--;
5256 startpos++;
5257 }
5258 else
5259 {
5260 range++;
5261 startpos--;
5262 }
5263 }
5264 #ifdef WCHAR
5265 FREE_WCS_BUFFERS ();
5266 #endif
5267 return -1;
5268 }
5269
5270 #ifdef WCHAR
5271 /* This converts PTR, a pointer into one of the search wchar_t strings
5272 `string1' and `string2', into a multibyte string offset measured from
5273 the start of the virtual concatenation of the two multibyte strings.
5274 We use mbs_offset to optimize. See convert_mbs_to_wcs.
   A pointer into string1 yields its mbs_offset1 entry; a pointer into
   string2 yields its mbs_offset2 entry plus csize1.  When the relevant
   offset table is NULL, 0 is used in place of the table entry.
   The expansion refers to string1, string2, mbs_offset1, mbs_offset2 and
   csize1 by name, so those must be in scope where the macro is used.  */
5275 # define POINTER_TO_OFFSET(ptr) \
5276 (FIRST_STRING_P (ptr) \
5277 ? ((regoff_t)(mbs_offset1 != NULL? mbs_offset1[(ptr)-string1] : 0)) \
5278 : ((regoff_t)((mbs_offset2 != NULL? mbs_offset2[(ptr)-string2] : 0) \
5279 + csize1)))
5280 #else /* BYTE */
5281 /* This converts PTR, a pointer into one of the search strings `string1'
5282 and `string2', into an offset from the beginning of the virtual
   concatenation of the two strings: a pointer into string1 yields its
   distance from string1, and a pointer into string2 yields its distance
   from string2 plus size1.  The expansion refers to string1, string2
   and size1 by name.  */
5283 # define POINTER_TO_OFFSET(ptr) \
5284 (FIRST_STRING_P (ptr) \
5285 ? ((regoff_t) ((ptr) - string1)) \
5286 : ((regoff_t) ((ptr) - string2 + size1)))
5287 #endif /* WCHAR */
5288
5289 /* Macros for dealing with the split strings in re_match_2.  They refer
   to the matcher's variables (d, dend, string1, string2, size1, size2,
   end2, end_match_1, end_match_2) by name, so they can only be used
   where those names are in scope.  */
5290
/* Nonzero while the current scan limit `dend' is the match limit of
   string1 (end_match_1).  */
5291 #define MATCHING_IN_FIRST_STRING (dend == end_match_1)
5292
5293 /* Call before fetching a character with *d. This switches over to
5294 string2 if necessary.  When `d' has reached `dend' and `dend' is
   already end_match_2, there is no more data and control jumps to the
   `fail' label of the enclosing function.  Otherwise `d' is moved to
   the start of string2 and `dend' becomes end_match_2; the test is a
   loop, so it is repeated after the switch.  */
5295 #define PREFETCH() \
5296 while (d == dend) \
5297 { \
5298 /* End of string2 => fail. */ \
5299 if (dend == end_match_2) \
5300 goto fail; \
5301 /* End of string1 => advance to string2. */ \
5302 d = string2; \
5303 dend = end_match_2; \
5304 }
5305
5306 /* Test if at very beginning or at very end of the virtual concatenation
5307 of `string1' and `string2'. If only one string, it's `string2'.
   AT_STRINGS_BEG compares D with string1 when size1 is nonzero and with
   string2 otherwise; it is also true whenever size2 is zero.
   AT_STRINGS_END compares D with end2.  */
5308 #define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
5309 #define AT_STRINGS_END(d) ((d) == end2)
5310
5311
5312 /* Test if D points to a character which is word-constituent. We have
5313 two special cases to check for: if at the end of string1 (D == end1),
5314 look at the first character in string2; and if just before the
5315 beginning of string2 (D == string2 - 1), look at the last character
   in string1.  In every other case the character at D itself is tested.
   D is evaluated more than once, so it must not have side effects.  */
5316 #ifdef WCHAR
5317 /* Use internationalized API instead of SYNTAX: the character is
   word-constituent if iswalnum accepts it or if it is L'_'.  */
5318 # define WORDCHAR_P(d) \
5319 (iswalnum ((wint_t)((d) == end1 ? *string2 \
5320 : (d) == string2 - 1 ? *(end1 - 1) : *(d))) != 0 \
5321 || ((d) == end1 ? *string2 \
5322 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) == L'_')
5323 #else /* BYTE */
/* The character is word-constituent if its SYNTAX entry is Sword.  */
5324 # define WORDCHAR_P(d) \
5325 (SYNTAX ((d) == end1 ? *string2 \
5326 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
5327 == Sword)
5328 #endif /* WCHAR */
5329
5330 /* Disabled due to a compiler bug -- see comment at case wordbound */
5331 #if 0
5332 /* Test if the character before D and the one at D differ with respect
5333 to being word-constituent.  The two ends of the virtual concatenation
   (AT_STRINGS_BEG, AT_STRINGS_END) also count as word boundaries.  */
5334 #define AT_WORD_BOUNDARY(d) \
5335 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
5336 || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
5337 #endif
5338
5339 /* Free everything we malloc.
   FREE_VARIABLES () is defined in four variants, selected by
   MATCH_MAY_ALLOCATE and WCHAR:
   - with MATCH_MAY_ALLOCATE, the failure stack is passed to
     REGEX_FREE_STACK and each register vector (regstart, regend,
     old_regstart, old_regend, best_regstart, best_regend, reg_info,
     reg_dummy, reg_info_dummy) is passed to FREE_VAR;
   - with WCHAR, string1, string2, mbs_offset1 and mbs_offset2 are also
     passed to FREE_VAR, but only when cant_free_wcs_buf is zero;
   - with neither, the macro expands to a no-op expression.
   The expansion refers to all of these variables by name, so the macro
   must be used where they are in scope.  */
5340 #ifdef MATCH_MAY_ALLOCATE
5341 # ifdef WCHAR
5342 # define FREE_VARIABLES() \
5343 do { \
5344 REGEX_FREE_STACK (fail_stack.stack); \
5345 FREE_VAR (regstart); \
5346 FREE_VAR (regend); \
5347 FREE_VAR (old_regstart); \
5348 FREE_VAR (old_regend); \
5349 FREE_VAR (best_regstart); \
5350 FREE_VAR (best_regend); \
5351 FREE_VAR (reg_info); \
5352 FREE_VAR (reg_dummy); \
5353 FREE_VAR (reg_info_dummy); \
5354 if (!cant_free_wcs_buf) \
5355 { \
5356 FREE_VAR (string1); \
5357 FREE_VAR (string2); \
5358 FREE_VAR (mbs_offset1); \
5359 FREE_VAR (mbs_offset2); \
5360 } \
5361 } while (0)
5362 # else /* BYTE */
5363 # define FREE_VARIABLES() \
5364 do { \
5365 REGEX_FREE_STACK (fail_stack.stack); \
5366 FREE_VAR (regstart); \
5367 FREE_VAR (regend); \
5368 FREE_VAR (old_regstart); \
5369 FREE_VAR (old_regend); \
5370 FREE_VAR (best_regstart); \
5371 FREE_VAR (best_regend); \
5372 FREE_VAR (reg_info); \
5373 FREE_VAR (reg_dummy); \
5374 FREE_VAR (reg_info_dummy); \
5375 } while (0)
5376 # endif /* WCHAR */
5377 #else
5378 # ifdef WCHAR
5379 # define FREE_VARIABLES() \
5380 do { \
5381 if (!cant_free_wcs_buf) \
5382 { \
5383 FREE_VAR (string1); \
5384 FREE_VAR (string2); \
5385 FREE_VAR (mbs_offset1); \
5386 FREE_VAR (mbs_offset2); \
5387 } \
5388 } while (0)
5389 # else /* BYTE */
5390 # define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
5391 # endif /* WCHAR */
5392 #endif /* not MATCH_MAY_ALLOCATE */
5393
/* Sentinel values meaning "no register is active"; the matcher uses
   them as the initial values of highest_active_reg and
   lowest_active_reg.  */
5394 /* These values must meet several constraints. They must not be valid
5395 register values; since we have a limit of 255 registers (because
5396 we use only one byte in the pattern for the register number), we can
5397 use numbers larger than 255. They must differ by 1, because of
5398 NUM_FAILURE_ITEMS above. And the value for the lowest register must
5399 be larger than the value for the highest register, so we do not try
5400 to actually save any registers when none are active. */
5401 #define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH)
5402 #define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1)
5403
5404 #else /* not INSIDE_RECURSION */
5406 /* Matching routines. */
5407
#ifndef emacs /* Emacs never uses this.  */

/* Single-string front end to the matcher: equivalent to re_match_2 with
   an absent first string (NULL, length 0), STRING/SIZE as the second
   string, and SIZE as the stop position.

   BUFP is the compiled pattern, POS the offset in STRING at which the
   match is attempted, and REGS (which may be null) receives register
   data as for re_match_2.  The value of the internal matcher is
   returned unchanged.  */

int
re_match (struct re_pattern_buffer *bufp, const char *string,
          int size, int pos, struct re_registers *regs)
{
  int match_status;

# ifdef MBS_SUPPORT
  /* Multibyte locales go through the wide-character matcher; it is
     asked to build its own wchar_t buffers (all buffer arguments are
     null/zero).  */
  match_status = (MB_CUR_MAX != 1
                  ? wcs_re_match_2_internal (bufp, NULL, 0, string, size,
                                             pos, regs, size,
                                             NULL, 0, NULL, 0, NULL, NULL)
                  : byte_re_match_2_internal (bufp, NULL, 0, string, size,
                                              pos, regs, size));
# else
  match_status = byte_re_match_2_internal (bufp, NULL, 0, string, size,
                                           pos, regs, size);
# endif

# if !defined REGEX_MALLOC && defined C_ALLOCA
  /* Force an alloca cleanup now that the internal matcher has
     returned.  */
  alloca (0);
# endif

  return match_status;
}
# ifdef _LIBC
weak_alias (__re_match, re_match)
# endif
#endif /* not emacs */
5436
5437 #endif /* not INSIDE_RECURSION */
5438
5439 #ifdef INSIDE_RECURSION
/* Forward declarations of static helpers used by the matcher in this
   INSIDE_RECURSION section.  Each name is built with PREFIX, so a
   separate declaration results for each value PREFIX takes --
   presumably the byte_ and wcs_ variants; confirm against the
   definition of PREFIX.  The first three take the pattern position (by
   reference for the group_ and common_op_ variants), the end of the
   pattern text to examine and the register info array; bcmp_translate
   takes two character strings, a length and a translate table.  */
5440 static boolean PREFIX(group_match_null_string_p) (UCHAR_T **p,
5441 UCHAR_T *end,
5442 PREFIX(register_info_type) *reg_info);
5443 static boolean PREFIX(alt_match_null_string_p) (UCHAR_T *p,
5444 UCHAR_T *end,
5445 PREFIX(register_info_type) *reg_info);
5446 static boolean PREFIX(common_op_match_null_string_p) (UCHAR_T **p,
5447 UCHAR_T *end,
5448 PREFIX(register_info_type) *reg_info);
5449 static int PREFIX(bcmp_translate) (const CHAR_T *s1, const CHAR_T *s2,
5450 int len, char *translate);
5451 #else /* not INSIDE_RECURSION */
5452
/* re_match_2 matches the compiled pattern in BUFP against the
   (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
   and SIZE2, respectively).  Matching starts at POS and stops at
   STOP.

   If REGS is non-null and the `no_sub' field of BUFP is zero, the
   offsets of the substring each group matched are stored in REGS.  See
   the documentation for exactly how many groups are filled.

   The return value is -1 if there is no match and -2 on an internal
   error (such as the failure stack overflowing).  Otherwise it is the
   length of the matched substring.  */

int
re_match_2 (struct re_pattern_buffer *bufp, const char *string1, int size1,
            const char *string2, int size2, int pos,
            struct re_registers *regs, int stop)
{
  int match_status;

# ifdef MBS_SUPPORT
  /* Multibyte locales go through the wide-character matcher; it is
     asked to build its own wchar_t buffers (all buffer arguments are
     null/zero).  */
  match_status = (MB_CUR_MAX != 1
                  ? wcs_re_match_2_internal (bufp, string1, size1,
                                             string2, size2,
                                             pos, regs, stop,
                                             NULL, 0, NULL, 0, NULL, NULL)
                  : byte_re_match_2_internal (bufp, string1, size1,
                                              string2, size2,
                                              pos, regs, stop));
# else
  match_status = byte_re_match_2_internal (bufp, string1, size1,
                                           string2, size2,
                                           pos, regs, stop);
# endif

#if !defined REGEX_MALLOC && defined C_ALLOCA
  /* Force an alloca cleanup now that the internal matcher has
     returned.  */
  alloca (0);
#endif

  return match_status;
}
#ifdef _LIBC
weak_alias (__re_match_2, re_match_2)
#endif
5492
5493 #endif /* not INSIDE_RECURSION */
5494
5495 #ifdef INSIDE_RECURSION
5496
5497 #ifdef WCHAR
static int count_mbs_length (int *, int);

/* Check the substring (from 0 to LENGTH) of the multibyte string to
   which OFFSET_BUFFER corresponds, and count how many wchar_t
   characters that substring occupies.  OFFSET_BUFFER is used to
   optimize the lookup; see convert_mbs_to_wcs.

   The result is the index I for which OFFSET_BUFFER[I] == LENGTH.
   Returns 0 when OFFSET_BUFFER is NULL, and -1 when LENGTH is negative
   or when no entry of the table equals LENGTH.

   The lookup is a binary search over OFFSET_BUFFER[0..LENGTH], so it
   relies on those entries being in ascending order, and OFFSET_BUFFER
   must have at least LENGTH + 1 readable entries.  */

static int
count_mbs_length (int *offset_buffer, int length)
{
  int lower, upper;

  /* Check whether the size is valid.  */
  if (length < 0)
    return -1;

  if (offset_buffer == NULL)
    return 0;

  /* If there are no multibyte characters, offset_buffer[i] == i.
     Optimize for this case.  */
  if (offset_buffer[length] == length)
    return length;

  /* Start with UPPER at LENGTH (because for all i,
     offset_buffer[i] >= i).  Neither end point can be the answer any
     more, so only indices strictly between LOWER and UPPER are
     probed.  */
  lower = 0;
  upper = length;

  while (upper - lower > 1)
    {
      /* Written as an offset from LOWER so that the sum cannot overflow
         `int' when LENGTH is large.  */
      int middle = lower + (upper - lower) / 2;

      if (offset_buffer[middle] > length)
        upper = middle;
      else if (offset_buffer[middle] < length)
        lower = middle;
      else
        return middle;
    }

  return -1;
}
5541 #endif /* WCHAR */
5542
5543 /* This is a separate function so that we can force an alloca cleanup
5544 afterwards. */
5545 #ifdef WCHAR
5546 static int
5547 wcs_re_match_2_internal (struct re_pattern_buffer *bufp,
5548 const char *cstring1, int csize1,
5549 const char *cstring2, int csize2,
5550 int pos,
5551 struct re_registers *regs,
5552 int stop,
5553 /* string1 == string2 == NULL means string1/2, size1/2 and
5554 mbs_offset1/2 need setting up in this function. */
5555 /* We need wchar_t* buffers correspond to cstring1, cstring2. */
5556 wchar_t *string1, int size1,
5557 wchar_t *string2, int size2,
5558 /* Offset buffers for optimization. See convert_mbs_to_wcs. */
5559 int *mbs_offset1, int *mbs_offset2)
5560 #else /* BYTE */
5561 static int
5562 byte_re_match_2_internal (struct re_pattern_buffer *bufp,
5563 const char *string1, int size1,
5564 const char *string2, int size2,
5565 int pos,
5566 struct re_registers *regs, int stop)
5567 #endif /* BYTE */
5568 {
5569 /* General temporaries. */
5570 int mcnt;
5571 UCHAR_T *p1;
5572 #ifdef WCHAR
5573 /* They hold whether each wchar_t is binary data or not. */
5574 char *is_binary = NULL;
5575 /* If true, we can't free string1/2, mbs_offset1/2. */
5576 int cant_free_wcs_buf = 1;
5577 #endif /* WCHAR */
5578
5579 /* Just past the end of the corresponding string. */
5580 const CHAR_T *end1, *end2;
5581
5582 /* Pointers into string1 and string2, just past the last characters in
5583 each to consider matching. */
5584 const CHAR_T *end_match_1, *end_match_2;
5585
5586 /* Where we are in the data, and the end of the current string. */
5587 const CHAR_T *d, *dend;
5588
5589 /* Where we are in the pattern, and the end of the pattern. */
5590 #ifdef WCHAR
5591 UCHAR_T *pattern, *p;
5592 register UCHAR_T *pend;
5593 #else /* BYTE */
5594 UCHAR_T *p = bufp->buffer;
5595 register UCHAR_T *pend = p + bufp->used;
5596 #endif /* WCHAR */
5597
5598 /* Mark the opcode just after a start_memory, so we can test for an
5599 empty subpattern when we get to the stop_memory. */
5600 UCHAR_T *just_past_start_mem = 0;
5601
5602 /* We use this to map every character in the string. */
5603 RE_TRANSLATE_TYPE translate = bufp->translate;
5604
5605 /* Failure point stack. Each place that can handle a failure further
5606 down the line pushes a failure point on this stack. It consists of
5607 restart, regend, and reg_info for all registers corresponding to
5608 the subexpressions we're currently inside, plus the number of such
5609 registers, and, finally, two char *'s. The first char * is where
5610 to resume scanning the pattern; the second one is where to resume
5611 scanning the strings. If the latter is zero, the failure point is
5612 a ``dummy''; if a failure happens and the failure point is a dummy,
5613 it gets discarded and the next one is tried. */
5614 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
5615 PREFIX(fail_stack_type) fail_stack;
5616 #endif
5617 #ifdef DEBUG
5618 static unsigned failure_id;
5619 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
5620 #endif
5621
5622 #ifdef REL_ALLOC
5623 /* This holds the pointer to the failure stack, when
5624 it is allocated relocatably. */
5625 fail_stack_elt_t *failure_stack_ptr;
5626 #endif
5627
5628 /* We fill all the registers internally, independent of what we
5629 return, for use in backreferences. The number here includes
5630 an element for register zero. */
5631 size_t num_regs = bufp->re_nsub + 1;
5632
5633 /* The currently active registers. */
5634 active_reg_t lowest_active_reg = NO_LOWEST_ACTIVE_REG;
5635 active_reg_t highest_active_reg = NO_HIGHEST_ACTIVE_REG;
5636
5637 /* Information on the contents of registers. These are pointers into
5638 the input strings; they record just what was matched (on this
5639 attempt) by a subexpression part of the pattern, that is, the
5640 regnum-th regstart pointer points to where in the pattern we began
5641 matching and the regnum-th regend points to right after where we
5642 stopped matching the regnum-th subexpression. (The zeroth register
5643 keeps track of what the whole pattern matches.) */
5644 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
5645 const CHAR_T **regstart, **regend;
5646 #endif
5647
5648 /* If a group that's operated upon by a repetition operator fails to
5649 match anything, then the register for its start will need to be
5650 restored because it will have been set to wherever in the string we
5651 are when we last see its open-group operator. Similarly for a
5652 register's end. */
5653 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
5654 const CHAR_T **old_regstart, **old_regend;
5655 #endif
5656
5657 /* The is_active field of reg_info helps us keep track of which (possibly
5658 nested) subexpressions we are currently in. The matched_something
5659 field of reg_info[reg_num] helps us tell whether or not we have
5660 matched any of the pattern so far this time through the reg_num-th
5661 subexpression. These two fields get reset each time through any
5662 loop their register is in. */
5663 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
5664 PREFIX(register_info_type) *reg_info;
5665 #endif
5666
5667 /* The following record the register info as found in the above
5668 variables when we find a match better than any we've seen before.
5669 This happens as we backtrack through the failure points, which in
5670 turn happens only if we have not yet matched the entire string. */
5671 unsigned best_regs_set = false;
5672 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
5673 const CHAR_T **best_regstart, **best_regend;
5674 #endif
5675
5676 /* Logically, this is `best_regend[0]'. But we don't want to have to
5677 allocate space for that if we're not allocating space for anything
5678 else (see below). Also, we never need info about register 0 for
5679 any of the other register vectors, and it seems rather a kludge to
5680 treat `best_regend' differently than the rest. So we keep track of
5681 the end of the best match so far in a separate variable. We
5682 initialize this to NULL so that when we backtrack the first time
5683 and need to test it, it's not garbage. */
5684 const CHAR_T *match_end = NULL;
5685
5686 /* This helps SET_REGS_MATCHED avoid doing redundant work. */
5687 int set_regs_matched_done = 0;
5688
5689 /* Used when we pop values we don't care about. */
5690 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
5691 const CHAR_T **reg_dummy;
5692 PREFIX(register_info_type) *reg_info_dummy;
5693 #endif
5694
5695 #ifdef DEBUG
5696 /* Counts the total number of registers pushed. */
5697 unsigned num_regs_pushed = 0;
5698 #endif
5699
5700 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
5701
5702 INIT_FAIL_STACK ();
5703
5704 #ifdef MATCH_MAY_ALLOCATE
5705 /* Do not bother to initialize all the register variables if there are
5706 no groups in the pattern, as it takes a fair amount of time. If
5707 there are groups, we include space for register 0 (the whole
5708 pattern), even though we never use it, since it simplifies the
5709 array indexing. We should fix this. */
5710 if (bufp->re_nsub)
5711 {
5712 regstart = REGEX_TALLOC (num_regs, const CHAR_T *);
5713 regend = REGEX_TALLOC (num_regs, const CHAR_T *);
5714 old_regstart = REGEX_TALLOC (num_regs, const CHAR_T *);
5715 old_regend = REGEX_TALLOC (num_regs, const CHAR_T *);
5716 best_regstart = REGEX_TALLOC (num_regs, const CHAR_T *);
5717 best_regend = REGEX_TALLOC (num_regs, const CHAR_T *);
5718 reg_info = REGEX_TALLOC (num_regs, PREFIX(register_info_type));
5719 reg_dummy = REGEX_TALLOC (num_regs, const CHAR_T *);
5720 reg_info_dummy = REGEX_TALLOC (num_regs, PREFIX(register_info_type));
5721
5722 if (!(regstart && regend && old_regstart && old_regend && reg_info
5723 && best_regstart && best_regend && reg_dummy && reg_info_dummy))
5724 {
5725 FREE_VARIABLES ();
5726 return -2;
5727 }
5728 }
5729 else
5730 {
5731 /* We must initialize all our variables to NULL, so that
5732 `FREE_VARIABLES' doesn't try to free them. */
5733 regstart = regend = old_regstart = old_regend = best_regstart
5734 = best_regend = reg_dummy = NULL;
5735 reg_info = reg_info_dummy = (PREFIX(register_info_type) *) NULL;
5736 }
5737 #endif /* MATCH_MAY_ALLOCATE */
5738
5739 /* The starting position is bogus. */
5740 #ifdef WCHAR
5741 if (pos < 0 || pos > csize1 + csize2)
5742 #else /* BYTE */
5743 if (pos < 0 || pos > size1 + size2)
5744 #endif
5745 {
5746 FREE_VARIABLES ();
5747 return -1;
5748 }
5749
5750 #ifdef WCHAR
5751 /* Allocate wchar_t array for string1 and string2 and
5752 fill them with converted string. */
5753 if (string1 == NULL && string2 == NULL)
5754 {
5755 /* We need to set up the buffers here. */
5756
5757 /* We must free wcs buffers in this function. */
5758 cant_free_wcs_buf = 0;
5759
5760 if (csize1 != 0)
5761 {
5762 string1 = REGEX_TALLOC (csize1 + 1, CHAR_T);
5763 mbs_offset1 = REGEX_TALLOC (csize1 + 1, int);
5764 is_binary = REGEX_TALLOC (csize1 + 1, char);
5765 if (!string1 || !mbs_offset1 || !is_binary)
5766 {
5767 FREE_VAR (string1);
5768 FREE_VAR (mbs_offset1);
5769 FREE_VAR (is_binary);
5770 return -2;
5771 }
5772 }
5773 if (csize2 != 0)
5774 {
5775 string2 = REGEX_TALLOC (csize2 + 1, CHAR_T);
5776 mbs_offset2 = REGEX_TALLOC (csize2 + 1, int);
5777 is_binary = REGEX_TALLOC (csize2 + 1, char);
5778 if (!string2 || !mbs_offset2 || !is_binary)
5779 {
5780 FREE_VAR (string1);
5781 FREE_VAR (mbs_offset1);
5782 FREE_VAR (string2);
5783 FREE_VAR (mbs_offset2);
5784 FREE_VAR (is_binary);
5785 return -2;
5786 }
5787 size2 = convert_mbs_to_wcs(string2, cstring2, csize2,
5788 mbs_offset2, is_binary);
5789 string2[size2] = L'\0'; /* for a sentinel */
5790 FREE_VAR (is_binary);
5791 }
5792 }
5793
5794 /* We need to cast pattern to (wchar_t*), because we casted this compiled
5795 pattern to (char*) in regex_compile. */
5796 p = pattern = (CHAR_T*)bufp->buffer;
5797 pend = (CHAR_T*)(bufp->buffer + bufp->used);
5798
5799 #endif /* WCHAR */
5800
5801 /* Initialize subexpression text positions to -1 to mark ones that no
5802 start_memory/stop_memory has been seen for. Also initialize the
5803 register information struct. */
5804 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++)
5805 {
5806 regstart[mcnt] = regend[mcnt]
5807 = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE;
5808
5809 REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE;
5810 IS_ACTIVE (reg_info[mcnt]) = 0;
5811 MATCHED_SOMETHING (reg_info[mcnt]) = 0;
5812 EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0;
5813 }
5814
5815 /* We move `string1' into `string2' if the latter's empty -- but not if
5816 `string1' is null. */
5817 if (size2 == 0 && string1 != NULL)
5818 {
5819 string2 = string1;
5820 size2 = size1;
5821 string1 = 0;
5822 size1 = 0;
5823 #ifdef WCHAR
5824 mbs_offset2 = mbs_offset1;
5825 csize2 = csize1;
5826 mbs_offset1 = NULL;
5827 csize1 = 0;
5828 #endif
5829 }
5830 end1 = string1 + size1;
5831 end2 = string2 + size2;
5832
5833 /* Compute where to stop matching, within the two strings. */
5834 #ifdef WCHAR
5835 if (stop <= csize1)
5836 {
5837 mcnt = count_mbs_length(mbs_offset1, stop);
5838 end_match_1 = string1 + mcnt;
5839 end_match_2 = string2;
5840 }
5841 else
5842 {
5843 if (stop > csize1 + csize2)
5844 stop = csize1 + csize2;
5845 end_match_1 = end1;
5846 mcnt = count_mbs_length(mbs_offset2, stop-csize1);
5847 end_match_2 = string2 + mcnt;
5848 }
5849 if (mcnt < 0)
5850 { /* count_mbs_length return error. */
5851 FREE_VARIABLES ();
5852 return -1;
5853 }
5854 #else
5855 if (stop <= size1)
5856 {
5857 end_match_1 = string1 + stop;
5858 end_match_2 = string2;
5859 }
5860 else
5861 {
5862 end_match_1 = end1;
5863 end_match_2 = string2 + stop - size1;
5864 }
5865 #endif /* WCHAR */
5866
5867 /* `p' scans through the pattern as `d' scans through the data.
5868 `dend' is the end of the input string that `d' points within. `d'
5869 is advanced into the following input string whenever necessary, but
5870 this happens before fetching; therefore, at the beginning of the
5871 loop, `d' can be pointing at the end of a string, but it cannot
5872 equal `string2'. */
5873 #ifdef WCHAR
5874 if (size1 > 0 && pos <= csize1)
5875 {
5876 mcnt = count_mbs_length(mbs_offset1, pos);
5877 d = string1 + mcnt;
5878 dend = end_match_1;
5879 }
5880 else
5881 {
5882 mcnt = count_mbs_length(mbs_offset2, pos-csize1);
5883 d = string2 + mcnt;
5884 dend = end_match_2;
5885 }
5886
5887 if (mcnt < 0)
5888 { /* count_mbs_length return error. */
5889 FREE_VARIABLES ();
5890 return -1;
5891 }
5892 #else
5893 if (size1 > 0 && pos <= size1)
5894 {
5895 d = string1 + pos;
5896 dend = end_match_1;
5897 }
5898 else
5899 {
5900 d = string2 + pos - size1;
5901 dend = end_match_2;
5902 }
5903 #endif /* WCHAR */
5904
5905 DEBUG_PRINT1 ("The compiled pattern is:\n");
5906 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
5907 DEBUG_PRINT1 ("The string to match is: `");
5908 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
5909 DEBUG_PRINT1 ("'\n");
5910
5911 /* This loops over pattern commands. It exits by returning from the
5912 function if the match is complete, or it drops through if the match
5913 fails at this starting point in the input data. */
5914 for (;;)
5915 {
5916 #ifdef _LIBC
5917 DEBUG_PRINT2 ("\n%p: ", p);
5918 #else
5919 DEBUG_PRINT2 ("\n0x%x: ", p);
5920 #endif
5921
5922 if (p == pend)
5923 { /* End of pattern means we might have succeeded. */
5924 DEBUG_PRINT1 ("end of pattern ... ");
5925
5926 /* If we haven't matched the entire string, and we want the
5927 longest match, try backtracking. */
5928 if (d != end_match_2)
5929 {
5930 /* 1 if this match ends in the same string (string1 or string2)
5931 as the best previous match. */
5932 boolean same_str_p;
5933
5934 /* 1 if this match is the best seen so far. */
5935 boolean best_match_p;
5936
5937 same_str_p = (FIRST_STRING_P (match_end)
5938 == MATCHING_IN_FIRST_STRING);
5939
5940 /* AIX compiler got confused when this was combined
5941 with the previous declaration. */
5942 if (same_str_p)
5943 best_match_p = d > match_end;
5944 else
5945 best_match_p = !MATCHING_IN_FIRST_STRING;
5946
5947 DEBUG_PRINT1 ("backtracking.\n");
5948
5949 if (!FAIL_STACK_EMPTY ())
5950 { /* More failure points to try. */
5951
5952 /* If exceeds best match so far, save it. */
5953 if (!best_regs_set || best_match_p)
5954 {
5955 best_regs_set = true;
5956 match_end = d;
5957
5958 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
5959
5960 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++)
5961 {
5962 best_regstart[mcnt] = regstart[mcnt];
5963 best_regend[mcnt] = regend[mcnt];
5964 }
5965 }
5966 goto fail;
5967 }
5968
5969 /* If no failure points, don't restore garbage. And if
5970 last match is real best match, don't restore second
5971 best one. */
5972 else if (best_regs_set && !best_match_p)
5973 {
5974 restore_best_regs:
5975 /* Restore best match. It may happen that `dend ==
5976 end_match_1' while the restored d is in string2.
5977 For example, the pattern `x.*y.*z' against the
5978 strings `x-' and `y-z-', if the two strings are
5979 not consecutive in memory. */
5980 DEBUG_PRINT1 ("Restoring best registers.\n");
5981
5982 d = match_end;
5983 dend = ((d >= string1 && d <= end1)
5984 ? end_match_1 : end_match_2);
5985
5986 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++)
5987 {
5988 regstart[mcnt] = best_regstart[mcnt];
5989 regend[mcnt] = best_regend[mcnt];
5990 }
5991 }
5992 } /* d != end_match_2 */
5993
5994 succeed_label:
5995 DEBUG_PRINT1 ("Accepting match.\n");
5996 /* If caller wants register contents data back, do it. */
5997 if (regs && !bufp->no_sub)
5998 {
5999 /* Have the register data arrays been allocated? */
6000 if (bufp->regs_allocated == REGS_UNALLOCATED)
6001 { /* No. So allocate them with malloc. We need one
6002 extra element beyond `num_regs' for the `-1' marker
6003 GNU code uses. */
6004 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
6005 regs->start = TALLOC (regs->num_regs, regoff_t);
6006 regs->end = TALLOC (regs->num_regs, regoff_t);
6007 if (regs->start == NULL || regs->end == NULL)
6008 {
6009 FREE_VARIABLES ();
6010 return -2;
6011 }
6012 bufp->regs_allocated = REGS_REALLOCATE;
6013 }
6014 else if (bufp->regs_allocated == REGS_REALLOCATE)
6015 { /* Yes. If we need more elements than were already
6016 allocated, reallocate them. If we need fewer, just
6017 leave it alone. */
6018 if (regs->num_regs < num_regs + 1)
6019 {
6020 regs->num_regs = num_regs + 1;
6021 RETALLOC (regs->start, regs->num_regs, regoff_t);
6022 RETALLOC (regs->end, regs->num_regs, regoff_t);
6023 if (regs->start == NULL || regs->end == NULL)
6024 {
6025 FREE_VARIABLES ();
6026 return -2;
6027 }
6028 }
6029 }
6030 else
6031 {
6032 /* These braces fend off a "empty body in an else-statement"
6033 warning under GCC when assert expands to nothing. */
6034 assert (bufp->regs_allocated == REGS_FIXED);
6035 }
6036
6037 /* Convert the pointer data in `regstart' and `regend' to
6038 indices. Register zero has to be set differently,
6039 since we haven't kept track of any info for it. */
6040 if (regs->num_regs > 0)
6041 {
6042 regs->start[0] = pos;
6043 #ifdef WCHAR
6044 if (MATCHING_IN_FIRST_STRING)
6045 regs->end[0] = mbs_offset1 != NULL ?
6046 mbs_offset1[d-string1] : 0;
6047 else
6048 regs->end[0] = csize1 + (mbs_offset2 != NULL ?
6049 mbs_offset2[d-string2] : 0);
6050 #else
6051 regs->end[0] = (MATCHING_IN_FIRST_STRING
6052 ? ((regoff_t) (d - string1))
6053 : ((regoff_t) (d - string2 + size1)));
6054 #endif /* WCHAR */
6055 }
6056
6057 /* Go through the first `min (num_regs, regs->num_regs)'
6058 registers, since that is all we initialized. */
6059 for (mcnt = 1; (unsigned) mcnt < MIN (num_regs, regs->num_regs);
6060 mcnt++)
6061 {
6062 if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt]))
6063 regs->start[mcnt] = regs->end[mcnt] = -1;
6064 else
6065 {
6066 regs->start[mcnt]
6067 = (regoff_t) POINTER_TO_OFFSET (regstart[mcnt]);
6068 regs->end[mcnt]
6069 = (regoff_t) POINTER_TO_OFFSET (regend[mcnt]);
6070 }
6071 }
6072
6073 /* If the regs structure we return has more elements than
6074 were in the pattern, set the extra elements to -1. If
6075 we (re)allocated the registers, this is the case,
6076 because we always allocate enough to have at least one
6077 -1 at the end. */
6078 for (mcnt = num_regs; (unsigned) mcnt < regs->num_regs; mcnt++)
6079 regs->start[mcnt] = regs->end[mcnt] = -1;
6080 } /* regs && !bufp->no_sub */
6081
6082 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
6083 nfailure_points_pushed, nfailure_points_popped,
6084 nfailure_points_pushed - nfailure_points_popped);
6085 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
6086
6087 #ifdef WCHAR
6088 if (MATCHING_IN_FIRST_STRING)
6089 mcnt = mbs_offset1 != NULL ? mbs_offset1[d-string1] : 0;
6090 else
6091 mcnt = (mbs_offset2 != NULL ? mbs_offset2[d-string2] : 0) +
6092 csize1;
6093 mcnt -= pos;
6094 #else
6095 mcnt = d - pos - (MATCHING_IN_FIRST_STRING
6096 ? string1
6097 : string2 - size1);
6098 #endif /* WCHAR */
6099
6100 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
6101
6102 FREE_VARIABLES ();
6103 return mcnt;
6104 }
6105
6106 /* Otherwise match next pattern command. */
6107 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
6108 {
6109 /* Ignore these. Used to ignore the n of succeed_n's which
6110 currently have n == 0. */
6111 case no_op:
6112 DEBUG_PRINT1 ("EXECUTING no_op.\n");
6113 break;
6114
6115 case succeed:
6116 DEBUG_PRINT1 ("EXECUTING succeed.\n");
6117 goto succeed_label;
6118
6119 /* Match the next n pattern characters exactly. The following
6120 byte in the pattern defines n, and the n bytes after that
6121 are the characters to match. */
6122 case exactn:
6123 #ifdef MBS_SUPPORT
6124 case exactn_bin:
6125 #endif
6126 mcnt = *p++;
6127 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
6128
6129 /* This is written out as an if-else so we don't waste time
6130 testing `translate' inside the loop. */
6131 if (translate)
6132 {
6133 do
6134 {
6135 PREFETCH ();
6136 #ifdef WCHAR
6137 if (*d <= 0xff)
6138 {
6139 if ((UCHAR_T) translate[(unsigned char) *d++]
6140 != (UCHAR_T) *p++)
6141 goto fail;
6142 }
6143 else
6144 {
6145 if (*d++ != (CHAR_T) *p++)
6146 goto fail;
6147 }
6148 #else
6149 if ((UCHAR_T) translate[(unsigned char) *d++]
6150 != (UCHAR_T) *p++)
6151 goto fail;
6152 #endif /* WCHAR */
6153 }
6154 while (--mcnt);
6155 }
6156 else
6157 {
6158 do
6159 {
6160 PREFETCH ();
6161 if (*d++ != (CHAR_T) *p++) goto fail;
6162 }
6163 while (--mcnt);
6164 }
6165 SET_REGS_MATCHED ();
6166 break;
6167
6168
6169 /* Match any character except possibly a newline or a null. */
6170 case anychar:
6171 DEBUG_PRINT1 ("EXECUTING anychar.\n");
6172
6173 PREFETCH ();
6174
6175 if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n')
6176 || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000'))
6177 goto fail;
6178
6179 SET_REGS_MATCHED ();
6180 DEBUG_PRINT2 (" Matched `%ld'.\n", (long int) *d);
6181 d++;
6182 break;
6183
6184
6185 case charset:
6186 case charset_not:
6187 {
6188 register UCHAR_T c;
6189 #ifdef WCHAR
6190 unsigned int i, char_class_length, coll_symbol_length,
6191 equiv_class_length, ranges_length, chars_length, length;
6192 CHAR_T *workp, *workp2, *charset_top;
6193 #define WORK_BUFFER_SIZE 128
6194 CHAR_T str_buf[WORK_BUFFER_SIZE];
6195 # ifdef _LIBC
6196 uint32_t nrules;
6197 # endif /* _LIBC */
6198 #endif /* WCHAR */
6199 boolean negate = (re_opcode_t) *(p - 1) == charset_not;
6200
6201 DEBUG_PRINT2 ("EXECUTING charset%s.\n", negate ? "_not" : "");
6202 PREFETCH ();
6203 c = TRANSLATE (*d); /* The character to match. */
6204 #ifdef WCHAR
6205 # ifdef _LIBC
6206 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
6207 # endif /* _LIBC */
6208 charset_top = p - 1;
6209 char_class_length = *p++;
6210 coll_symbol_length = *p++;
6211 equiv_class_length = *p++;
6212 ranges_length = *p++;
6213 chars_length = *p++;
6214 /* p points charset[6], so the address of the next instruction
6215 (charset[l+m+n+2o+k+p']) equals p[l+m+n+2*o+p'],
6216 where l=length of char_classes, m=length of collating_symbol,
6217 n=equivalence_class, o=length of char_range,
6218 p'=length of character. */
6219 workp = p;
6220 /* Update p to indicate the next instruction. */
6221 p += char_class_length + coll_symbol_length+ equiv_class_length +
6222 2*ranges_length + chars_length;
6223
6224 /* match with char_class? */
6225 for (i = 0; i < char_class_length ; i += CHAR_CLASS_SIZE)
6226 {
6227 wctype_t wctype;
6228 uintptr_t alignedp = ((uintptr_t)workp
6229 + __alignof__(wctype_t) - 1)
6230 & ~(uintptr_t)(__alignof__(wctype_t) - 1);
6231 wctype = *((wctype_t*)alignedp);
6232 workp += CHAR_CLASS_SIZE;
6233 # ifdef _LIBC
6234 if (__iswctype((wint_t)c, wctype))
6235 goto char_set_matched;
6236 # else
6237 if (iswctype((wint_t)c, wctype))
6238 goto char_set_matched;
6239 # endif
6240 }
6241
6242 /* match with collating_symbol? */
6243 # ifdef _LIBC
6244 if (nrules != 0)
6245 {
6246 const unsigned char *extra = (const unsigned char *)
6247 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
6248
6249 for (workp2 = workp + coll_symbol_length ; workp < workp2 ;
6250 workp++)
6251 {
6252 int32_t *wextra;
6253 wextra = (int32_t*)(extra + *workp++);
6254 for (i = 0; i < *wextra; ++i)
6255 if (TRANSLATE(d[i]) != wextra[1 + i])
6256 break;
6257
6258 if (i == *wextra)
6259 {
6260 /* Update d; because d will be incremented again after
6261 char_set_matched:, we advance it by one less here. */
6262 d += i - 1;
6263 goto char_set_matched;
6264 }
6265 }
6266 }
6267 else /* (nrules == 0) */
6268 # endif
6269 /* If we can't look up collation data, we use wcscoll
6270 instead. */
6271 {
6272 for (workp2 = workp + coll_symbol_length ; workp < workp2 ;)
6273 {
6274 const CHAR_T *backup_d = d, *backup_dend = dend;
6275 # ifdef _LIBC
6276 length = __wcslen (workp);
6277 # else
6278 length = wcslen (workp);
6279 # endif
6280
6281 /* If wcscoll(the collating symbol, whole string) > 0,
6282 no substring of the string can ever match the
6283 collating symbol. */
6284 # ifdef _LIBC
6285 if (__wcscoll (workp, d) > 0)
6286 # else
6287 if (wcscoll (workp, d) > 0)
6288 # endif
6289 {
6290 workp += length + 1;
6291 continue;
6292 }
6293
6294 /* First, we compare the collating symbol with
6295 the first character of the string.
6296 If it doesn't match, we add the next character to
6297 the compare buffer in turn. */
6298 for (i = 0 ; i < WORK_BUFFER_SIZE-1 ; i++, d++)
6299 {
6300 int match;
6301 if (d == dend)
6302 {
6303 if (dend == end_match_2)
6304 break;
6305 d = string2;
6306 dend = end_match_2;
6307 }
6308
6309 /* add next character to the compare buffer. */
6310 str_buf[i] = TRANSLATE(*d);
6311 str_buf[i+1] = '\0';
6312
6313 # ifdef _LIBC
6314 match = __wcscoll (workp, str_buf);
6315 # else
6316 match = wcscoll (workp, str_buf);
6317 # endif
6318 if (match == 0)
6319 goto char_set_matched;
6320
6321 if (match < 0)
6322 /* (str_buf > workp) implies (str_buf + X > workp),
6323 because for all X (str_buf + X > str_buf).
6324 So we don't need to continue this loop. */
6325 break;
6326
6327 /* Otherwise (str_buf < workp),
6328 (str_buf+next_character) may equal (workp).
6329 So we continue this loop. */
6330 }
6331 /* not matched */
6332 d = backup_d;
6333 dend = backup_dend;
6334 workp += length + 1;
6335 }
6336 }
6337 /* match with equivalence_class? */
6338 # ifdef _LIBC
6339 if (nrules != 0)
6340 {
6341 const CHAR_T *backup_d = d, *backup_dend = dend;
6342 /* Try to match the equivalence class against
6343 those known to the collate implementation. */
6344 const int32_t *table;
6345 const int32_t *weights;
6346 const int32_t *extra;
6347 const int32_t *indirect;
6348 int32_t idx, idx2;
6349 wint_t *cp;
6350 size_t len;
6351
6352 /* This #include defines a local function! */
6353 # include <locale/weightwc.h>
6354
6355 table = (const int32_t *)
6356 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEWC);
6357 weights = (const wint_t *)
6358 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTWC);
6359 extra = (const wint_t *)
6360 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAWC);
6361 indirect = (const int32_t *)
6362 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTWC);
6363
6364 /* Write 1 collating element to str_buf, and
6365 get its index. */
6366 idx2 = 0;
6367
6368 for (i = 0 ; idx2 == 0 && i < WORK_BUFFER_SIZE - 1; i++)
6369 {
6370 cp = (wint_t*)str_buf;
6371 if (d == dend)
6372 {
6373 if (dend == end_match_2)
6374 break;
6375 d = string2;
6376 dend = end_match_2;
6377 }
6378 str_buf[i] = TRANSLATE(*(d+i));
6379 str_buf[i+1] = '\0'; /* sentinel */
6380 idx2 = findidx ((const wint_t**)&cp);
6381 }
6382
6383 /* Update d; because d will be incremented again after
6384 char_set_matched:, we advance it by one less here. */
6385 d = backup_d + ((wchar_t*)cp - (wchar_t*)str_buf - 1);
6386 if (d >= dend)
6387 {
6388 if (dend == end_match_2)
6389 d = dend;
6390 else
6391 {
6392 d = string2;
6393 dend = end_match_2;
6394 }
6395 }
6396
6397 len = weights[idx2];
6398
6399 for (workp2 = workp + equiv_class_length ; workp < workp2 ;
6400 workp++)
6401 {
6402 idx = (int32_t)*workp;
6403 /* We already checked idx != 0 in regex_compile. */
6404
6405 if (idx2 != 0 && len == weights[idx])
6406 {
6407 int cnt = 0;
6408 while (cnt < len && (weights[idx + 1 + cnt]
6409 == weights[idx2 + 1 + cnt]))
6410 ++cnt;
6411
6412 if (cnt == len)
6413 goto char_set_matched;
6414 }
6415 }
6416 /* not matched */
6417 d = backup_d;
6418 dend = backup_dend;
6419 }
6420 else /* (nrules == 0) */
6421 # endif
6422 /* If we can't look up collation data, we use wcscoll
6423 instead. */
6424 {
6425 for (workp2 = workp + equiv_class_length ; workp < workp2 ;)
6426 {
6427 const CHAR_T *backup_d = d, *backup_dend = dend;
6428 # ifdef _LIBC
6429 length = __wcslen (workp);
6430 # else
6431 length = wcslen (workp);
6432 # endif
6433
6434 /* If wcscoll(the equivalence class, whole string) > 0,
6435 no substring of the string can ever match the
6436 equivalence class. */
6437 # ifdef _LIBC
6438 if (__wcscoll (workp, d) > 0)
6439 # else
6440 if (wcscoll (workp, d) > 0)
6441 # endif
6442 {
6443 workp += length + 1;
6444 break;
6445 }
6446
6447 /* First, we compare the equivalence class with
6448 the first character of the string.
6449 If it doesn't match, we add the next character to
6450 the compare buffer in turn. */
6451 for (i = 0 ; i < WORK_BUFFER_SIZE - 1 ; i++, d++)
6452 {
6453 int match;
6454 if (d == dend)
6455 {
6456 if (dend == end_match_2)
6457 break;
6458 d = string2;
6459 dend = end_match_2;
6460 }
6461
6462 /* add next character to the compare buffer. */
6463 str_buf[i] = TRANSLATE(*d);
6464 str_buf[i+1] = '\0';
6465
6466 # ifdef _LIBC
6467 match = __wcscoll (workp, str_buf);
6468 # else
6469 match = wcscoll (workp, str_buf);
6470 # endif
6471
6472 if (match == 0)
6473 goto char_set_matched;
6474
6475 if (match < 0)
6476 /* (str_buf > workp) implies (str_buf + X > workp),
6477 because for all X (str_buf + X > str_buf).
6478 So we don't need to continue this loop. */
6479 break;
6480
6481 /* Otherwise (str_buf < workp),
6482 (str_buf+next_character) may equal (workp).
6483 So we continue this loop. */
6484 }
6485 /* not matched */
6486 d = backup_d;
6487 dend = backup_dend;
6488 workp += length + 1;
6489 }
6490 }
6491
6492 /* match with char_range? */
6493 # ifdef _LIBC
6494 if (nrules != 0)
6495 {
6496 uint32_t collseqval;
6497 const char *collseq = (const char *)
6498 _NL_CURRENT(LC_COLLATE, _NL_COLLATE_COLLSEQWC);
6499
6500 collseqval = collseq_table_lookup (collseq, c);
6501
6502 for (; workp < p - chars_length ;)
6503 {
6504 uint32_t start_val, end_val;
6505
6506 /* We already compute the collation sequence value
6507 of the characters (or collating symbols). */
6508 start_val = (uint32_t) *workp++; /* range_start */
6509 end_val = (uint32_t) *workp++; /* range_end */
6510
6511 if (start_val <= collseqval && collseqval <= end_val)
6512 goto char_set_matched;
6513 }
6514 }
6515 else
6516 # endif
6517 {
6518 /* We set range_start_char at str_buf[0], range_end_char
6519 at str_buf[4], and compared char at str_buf[2]. */
6520 str_buf[1] = 0;
6521 str_buf[2] = c;
6522 str_buf[3] = 0;
6523 str_buf[5] = 0;
6524 for (; workp < p - chars_length ;)
6525 {
6526 wchar_t *range_start_char, *range_end_char;
6527
6528 /* match if (range_start_char <= c <= range_end_char). */
6529
6530 /* If range_start(or end) < 0, we assume -range_start(end)
6531 is the offset of the collating symbol which is specified
6532 as the character of the range start(end). */
6533
6534 /* range_start */
6535 if (*workp < 0)
6536 range_start_char = charset_top - (*workp++);
6537 else
6538 {
6539 str_buf[0] = *workp++;
6540 range_start_char = str_buf;
6541 }
6542
6543 /* range_end */
6544 if (*workp < 0)
6545 range_end_char = charset_top - (*workp++);
6546 else
6547 {
6548 str_buf[4] = *workp++;
6549 range_end_char = str_buf + 4;
6550 }
6551
6552 # ifdef _LIBC
6553 if (__wcscoll (range_start_char, str_buf+2) <= 0
6554 && __wcscoll (str_buf+2, range_end_char) <= 0)
6555 # else
6556 if (wcscoll (range_start_char, str_buf+2) <= 0
6557 && wcscoll (str_buf+2, range_end_char) <= 0)
6558 # endif
6559 goto char_set_matched;
6560 }
6561 }
6562
6563 /* match with char? */
6564 for (; workp < p ; workp++)
6565 if (c == *workp)
6566 goto char_set_matched;
6567
6568 negate = !negate;
6569
6570 char_set_matched:
6571 if (negate) goto fail;
6572 #else
6573 /* Cast to `unsigned' instead of `unsigned char' in case the
6574 bit list is a full 32 bytes long. */
6575 if (c < (unsigned) (*p * BYTEWIDTH)
6576 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
6577 negate = !negate;
6578
6579 p += 1 + *p;
6580
6581 if (!negate) goto fail;
6582 #undef WORK_BUFFER_SIZE
6583 #endif /* WCHAR */
6584 SET_REGS_MATCHED ();
6585 d++;
6586 break;
6587 }
6588
6589
6590 /* The beginning of a group is represented by start_memory.
6591 The arguments are the register number in the next byte, and the
6592 number of groups inner to this one in the next. The text
6593 matched within the group is recorded (in the internal
6594 registers data structure) under the register number. */
6595 case start_memory:
6596 DEBUG_PRINT3 ("EXECUTING start_memory %ld (%ld):\n",
6597 (long int) *p, (long int) p[1]);
6598
6599 /* Find out if this group can match the empty string. */
6600 p1 = p; /* To send to group_match_null_string_p. */
6601
6602 if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE)
6603 REG_MATCH_NULL_STRING_P (reg_info[*p])
6604 = PREFIX(group_match_null_string_p) (&p1, pend, reg_info);
6605
6606 /* Save the position in the string where we were the last time
6607 we were at this open-group operator in case the group is
6608 operated upon by a repetition operator, e.g., with `(a*)*b'
6609 against `ab'; then we want to ignore where we are now in
6610 the string in case this attempt to match fails. */
6611 old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
6612 ? REG_UNSET (regstart[*p]) ? d : regstart[*p]
6613 : regstart[*p];
6614 DEBUG_PRINT2 (" old_regstart: %d\n",
6615 POINTER_TO_OFFSET (old_regstart[*p]));
6616
6617 regstart[*p] = d;
6618 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
6619
6620 IS_ACTIVE (reg_info[*p]) = 1;
6621 MATCHED_SOMETHING (reg_info[*p]) = 0;
6622
6623 /* Clear this whenever we change the register activity status. */
6624 set_regs_matched_done = 0;
6625
6626 /* This is the new highest active register. */
6627 highest_active_reg = *p;
6628
6629 /* If nothing was active before, this is the new lowest active
6630 register. */
6631 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
6632 lowest_active_reg = *p;
6633
6634 /* Move past the register number and inner group count. */
6635 p += 2;
6636 just_past_start_mem = p;
6637
6638 break;
6639
6640
6641 /* The stop_memory opcode represents the end of a group. Its
6642 arguments are the same as start_memory's: the register
6643 number, and the number of inner groups. */
6644 case stop_memory:
6645 DEBUG_PRINT3 ("EXECUTING stop_memory %ld (%ld):\n",
6646 (long int) *p, (long int) p[1]);
6647
6648 /* We need to save the string position the last time we were at
6649 this close-group operator in case the group is operated
6650 upon by a repetition operator, e.g., with `((a*)*(b*)*)*'
6651 against `aba'; then we want to ignore where we are now in
6652 the string in case this attempt to match fails. */
6653 old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
6654 ? REG_UNSET (regend[*p]) ? d : regend[*p]
6655 : regend[*p];
6656 DEBUG_PRINT2 (" old_regend: %d\n",
6657 POINTER_TO_OFFSET (old_regend[*p]));
6658
6659 regend[*p] = d;
6660 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
6661
6662 /* This register isn't active anymore. */
6663 IS_ACTIVE (reg_info[*p]) = 0;
6664
6665 /* Clear this whenever we change the register activity status. */
6666 set_regs_matched_done = 0;
6667
6668 /* If this was the only register active, nothing is active
6669 anymore. */
6670 if (lowest_active_reg == highest_active_reg)
6671 {
6672 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
6673 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
6674 }
6675 else
6676 { /* We must scan for the new highest active register, since
6677 it isn't necessarily one less than now: consider
6678 (a(b)c(d(e)f)g). When group 3 ends, after the f), the
6679 new highest active register is 1. */
6680 UCHAR_T r = *p - 1;
6681 while (r > 0 && !IS_ACTIVE (reg_info[r]))
6682 r--;
6683
6684 /* If we end up at register zero, that means that we saved
6685 the registers as the result of an `on_failure_jump', not
6686 a `start_memory', and we jumped to past the innermost
6687 `stop_memory'. For example, in ((.)*) we save
6688 registers 1 and 2 as a result of the *, but when we pop
6689 back to the second ), we are at the stop_memory 1.
6690 Thus, nothing is active. */
6691 if (r == 0)
6692 {
6693 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
6694 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
6695 }
6696 else
6697 highest_active_reg = r;
6698 }
6699
6700 /* If just failed to match something this time around with a
6701 group that's operated on by a repetition operator, try to
6702 force exit from the ``loop'', and restore the register
6703 information for this group that we had before trying this
6704 last match. */
6705 if ((!MATCHED_SOMETHING (reg_info[*p])
6706 || just_past_start_mem == p - 1)
6707 && (p + 2) < pend)
6708 {
6709 boolean is_a_jump_n = false;
6710
6711 p1 = p + 2;
6712 mcnt = 0;
6713 switch ((re_opcode_t) *p1++)
6714 {
6715 case jump_n:
6716 is_a_jump_n = true;
6717 /* Fall through. */
6718 case pop_failure_jump:
6719 case maybe_pop_jump:
6720 case jump:
6721 case dummy_failure_jump:
6722 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6723 if (is_a_jump_n)
6724 p1 += OFFSET_ADDRESS_SIZE;
6725 break;
6726
6727 default:
6728 /* do nothing */ ;
6729 }
6730 p1 += mcnt;
6731
6732 /* If the next operation is a jump backwards in the pattern
6733 to an on_failure_jump right before the start_memory
6734 corresponding to this stop_memory, exit from the loop
6735 by forcing a failure after pushing on the stack the
6736 on_failure_jump's jump in the pattern, and d. */
6737 if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump
6738 && (re_opcode_t) p1[1+OFFSET_ADDRESS_SIZE] == start_memory
6739 && p1[2+OFFSET_ADDRESS_SIZE] == *p)
6740 {
6741 /* If this group ever matched anything, then restore
6742 what its registers were before trying this last
6743 failed match, e.g., with `(a*)*b' against `ab' for
6744 regstart[1], and, e.g., with `((a*)*(b*)*)*'
6745 against `aba' for regend[3].
6746
6747 Also restore the registers for inner groups for,
6748 e.g., `((a*)(b*))*' against `aba' (register 3 would
6749 otherwise get trashed). */
6750
6751 if (EVER_MATCHED_SOMETHING (reg_info[*p]))
6752 {
6753 unsigned r;
6754
6755 EVER_MATCHED_SOMETHING (reg_info[*p]) = 0;
6756
6757 /* Restore this and inner groups' (if any) registers. */
6758 for (r = *p; r < (unsigned) *p + (unsigned) *(p + 1);
6759 r++)
6760 {
6761 regstart[r] = old_regstart[r];
6762
6763 /* xx why this test? */
6764 if (old_regend[r] >= regstart[r])
6765 regend[r] = old_regend[r];
6766 }
6767 }
6768 p1++;
6769 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6770 PUSH_FAILURE_POINT (p1 + mcnt, d, -2);
6771
6772 goto fail;
6773 }
6774 }
6775
6776 /* Move past the register number and the inner group count. */
6777 p += 2;
6778 break;
6779
6780
6781 /* \<digit> has been turned into a `duplicate' command which is
6782 followed by the numeric value of <digit> as the register number. */
6783 case duplicate:
6784 {
6785 register const CHAR_T *d2, *dend2;
6786 int regno = *p++; /* Get which register to match against. */
6787 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
6788
6789 /* Can't back reference a group which we've never matched. */
6790 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
6791 goto fail;
6792
6793 /* Where in input to try to start matching. */
6794 d2 = regstart[regno];
6795
6796 /* Where to stop matching; if both the place to start and
6797 the place to stop matching are in the same string, then
6798 set to the place to stop, otherwise, for now have to use
6799 the end of the first string. */
6800
6801 dend2 = ((FIRST_STRING_P (regstart[regno])
6802 == FIRST_STRING_P (regend[regno]))
6803 ? regend[regno] : end_match_1);
6804 for (;;)
6805 {
6806 /* If necessary, advance to next segment in register
6807 contents. */
6808 while (d2 == dend2)
6809 {
6810 if (dend2 == end_match_2) break;
6811 if (dend2 == regend[regno]) break;
6812
6813 /* End of string1 => advance to string2. */
6814 d2 = string2;
6815 dend2 = regend[regno];
6816 }
6817 /* At end of register contents => success */
6818 if (d2 == dend2) break;
6819
6820 /* If necessary, advance to next segment in data. */
6821 PREFETCH ();
6822
6823 /* How many characters left in this segment to match. */
6824 mcnt = dend - d;
6825
6826 /* Want how many consecutive characters we can match in
6827 one shot, so, if necessary, adjust the count. */
6828 if (mcnt > dend2 - d2)
6829 mcnt = dend2 - d2;
6830
6831 /* Compare that many; failure if mismatch, else move
6832 past them. */
6833 if (translate
6834 ? PREFIX(bcmp_translate) (d, d2, mcnt, translate)
6835 : memcmp (d, d2, mcnt*sizeof(UCHAR_T)))
6836 goto fail;
6837 d += mcnt, d2 += mcnt;
6838
6839 /* Do this because we've matched some characters. */
6840 SET_REGS_MATCHED ();
6841 }
6842 }
6843 break;
6844
6845
6846 /* begline matches the empty string at the beginning of the string
6847 (unless `not_bol' is set in `bufp'), and, if
6848 `newline_anchor' is set, after newlines. */
6849 case begline:
6850 DEBUG_PRINT1 ("EXECUTING begline.\n");
6851
6852 if (AT_STRINGS_BEG (d))
6853 {
6854 if (!bufp->not_bol) break;
6855 }
6856 else if (d[-1] == '\n' && bufp->newline_anchor)
6857 {
6858 break;
6859 }
6860 /* In all other cases, we fail. */
6861 goto fail;
6862
6863
6864 /* endline is the dual of begline. */
6865 case endline:
6866 DEBUG_PRINT1 ("EXECUTING endline.\n");
6867
6868 if (AT_STRINGS_END (d))
6869 {
6870 if (!bufp->not_eol) break;
6871 }
6872
6873 /* We have to ``prefetch'' the next character. */
6874 else if ((d == end1 ? *string2 : *d) == '\n'
6875 && bufp->newline_anchor)
6876 {
6877 break;
6878 }
6879 goto fail;
6880
6881
6882 /* Match at the very beginning of the data. */
6883 case begbuf:
6884 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
6885 if (AT_STRINGS_BEG (d))
6886 break;
6887 goto fail;
6888
6889
6890 /* Match at the very end of the data. */
6891 case endbuf:
6892 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
6893 if (AT_STRINGS_END (d))
6894 break;
6895 goto fail;
6896
6897
6898 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
6899 pushes NULL as the value for the string on the stack. Then
6900 `pop_failure_point' will keep the current value for the
6901 string, instead of restoring it. To see why, consider
6902 matching `foo\nbar' against `.*\n'. The .* matches the foo;
6903 then the . fails against the \n. But the next thing we want
6904 to do is match the \n against the \n; if we restored the
6905 string value, we would be back at the foo.
6906
6907 Because this is used only in specific cases, we don't need to
6908 check all the things that `on_failure_jump' does, to make
6909 sure the right things get saved on the stack. Hence we don't
6910 share its code. The only reason to push anything on the
6911 stack at all is that otherwise we would have to change
6912 `anychar's code to do something besides goto fail in this
6913 case; that seems worse than this. */
6914 case on_failure_keep_string_jump:
6915 DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump");
6916
6917 EXTRACT_NUMBER_AND_INCR (mcnt, p);
6918 #ifdef _LIBC
6919 DEBUG_PRINT3 (" %d (to %p):\n", mcnt, p + mcnt);
6920 #else
6921 DEBUG_PRINT3 (" %d (to 0x%x):\n", mcnt, p + mcnt);
6922 #endif
6923
6924 PUSH_FAILURE_POINT (p + mcnt, NULL, -2);
6925 break;
6926
6927
6928 /* Uses of on_failure_jump:
6929
6930 Each alternative starts with an on_failure_jump that points
6931 to the beginning of the next alternative. Each alternative
6932 except the last ends with a jump that in effect jumps past
6933 the rest of the alternatives. (They really jump to the
6934 ending jump of the following alternative, because tensioning
6935 these jumps is a hassle.)
6936
6937 Repeats start with an on_failure_jump that points past both
6938 the repetition text and either the following jump or
6939 pop_failure_jump back to this on_failure_jump. */
6940 case on_failure_jump:
6941 on_failure:
6942 DEBUG_PRINT1 ("EXECUTING on_failure_jump");
6943
6944 EXTRACT_NUMBER_AND_INCR (mcnt, p);
6945 #ifdef _LIBC
6946 DEBUG_PRINT3 (" %d (to %p)", mcnt, p + mcnt);
6947 #else
6948 DEBUG_PRINT3 (" %d (to 0x%x)", mcnt, p + mcnt);
6949 #endif
6950
6951 /* If this on_failure_jump comes right before a group (i.e.,
6952 the original * applied to a group), save the information
6953 for that group and all inner ones, so that if we fail back
6954 to this point, the group's information will be correct.
6955 For example, in \(a*\)*\1, we need the preceding group,
6956 and in \(zz\(a*\)b*\)\2, we need the inner group. */
6957
6958 /* We can't use `p' to check ahead because we push
6959 a failure point to `p + mcnt' after we do this. */
6960 p1 = p;
6961
6962 /* We need to skip no_op's before we look for the
6963 start_memory in case this on_failure_jump is happening as
6964 the result of a completed succeed_n, as in \(a\)\{1,3\}b\1
6965 against aba. */
6966 while (p1 < pend && (re_opcode_t) *p1 == no_op)
6967 p1++;
6968
6969 if (p1 < pend && (re_opcode_t) *p1 == start_memory)
6970 {
6971 /* We have a new highest active register now. This will
6972 get reset at the start_memory we are about to get to,
6973 but we will have saved all the registers relevant to
6974 this repetition op, as described above. */
6975 highest_active_reg = *(p1 + 1) + *(p1 + 2);
6976 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
6977 lowest_active_reg = *(p1 + 1);
6978 }
6979
6980 DEBUG_PRINT1 (":\n");
6981 PUSH_FAILURE_POINT (p + mcnt, d, -2);
6982 break;
6983
6984
6985 /* A smart repeat ends with `maybe_pop_jump'.
6986 We change it to either `pop_failure_jump' or `jump'. */
6987 case maybe_pop_jump:
6988 EXTRACT_NUMBER_AND_INCR (mcnt, p);
6989 DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt);
6990 {
6991 register UCHAR_T *p2 = p;
6992
6993 /* Compare the beginning of the repeat with what in the
6994 pattern follows its end. If we can establish that there
6995 is nothing that they would both match, i.e., nothing that
6996 would force us to backtrack (as there is in, e.g., `a*a'),
6997 then we can change to pop_failure_jump, because we'll
6998 never have to backtrack.
6999
7000 This is not true in the case of alternatives: in
7001 `(a|ab)*' we do need to backtrack to the `ab' alternative
7002 (e.g., if the string was `ab'). But instead of trying to
7003 detect that here, the alternative has put on a dummy
7004 failure point which is what we will end up popping. */
7005
7006 /* Skip over open/close-group commands.
7007 If what follows this loop is a ...+ construct,
7008 look at what begins its body, since we will have to
7009 match at least one of that. */
7010 while (1)
7011 {
7012 if (p2 + 2 < pend
7013 && ((re_opcode_t) *p2 == stop_memory
7014 || (re_opcode_t) *p2 == start_memory))
7015 p2 += 3;
7016 else if (p2 + 2 + 2 * OFFSET_ADDRESS_SIZE < pend
7017 && (re_opcode_t) *p2 == dummy_failure_jump)
7018 p2 += 2 + 2 * OFFSET_ADDRESS_SIZE;
7019 else
7020 break;
7021 }
7022
7023 p1 = p + mcnt;
7024 /* p1[0] ... p1[2] are the `on_failure_jump' corresponding
7025 to the `maybe_pop_jump' of this case. Examine what
7026 follows. */
7027
7028 /* If we're at the end of the pattern, we can change. */
7029 if (p2 == pend)
7030 {
7031 /* Consider what happens when matching ":\(.*\)"
7032 against ":/". I don't really understand this code
7033 yet. */
7034 p[-(1+OFFSET_ADDRESS_SIZE)] = (UCHAR_T)
7035 pop_failure_jump;
7036 DEBUG_PRINT1
7037 (" End of pattern: change to `pop_failure_jump'.\n");
7038 }
7039
7040 else if ((re_opcode_t) *p2 == exactn
7041 #ifdef MBS_SUPPORT
7042 || (re_opcode_t) *p2 == exactn_bin
7043 #endif
7044 || (bufp->newline_anchor && (re_opcode_t) *p2 == endline))
7045 {
7046 register UCHAR_T c
7047 = *p2 == (UCHAR_T) endline ? '\n' : p2[2];
7048
7049 if (((re_opcode_t) p1[1+OFFSET_ADDRESS_SIZE] == exactn
7050 #ifdef MBS_SUPPORT
7051 || (re_opcode_t) p1[1+OFFSET_ADDRESS_SIZE] == exactn_bin
7052 #endif
7053 ) && p1[3+OFFSET_ADDRESS_SIZE] != c)
7054 {
7055 p[-(1+OFFSET_ADDRESS_SIZE)] = (UCHAR_T)
7056 pop_failure_jump;
7057 #ifdef WCHAR
7058 DEBUG_PRINT3 (" %C != %C => pop_failure_jump.\n",
7059 (wint_t) c,
7060 (wint_t) p1[3+OFFSET_ADDRESS_SIZE]);
7061 #else
7062 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
7063 (char) c,
7064 (char) p1[3+OFFSET_ADDRESS_SIZE]);
7065 #endif
7066 }
7067
7068 #ifndef WCHAR
7069 else if ((re_opcode_t) p1[3] == charset
7070 || (re_opcode_t) p1[3] == charset_not)
7071 {
7072 int negate = (re_opcode_t) p1[3] == charset_not;
7073
7074 if (c < (unsigned) (p1[4] * BYTEWIDTH)
7075 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
7076 negate = !negate;
7077
7078 /* `negate' is equal to 1 if c would match, which means
7079 that we can't change to pop_failure_jump. */
7080 if (!negate)
7081 {
7082 p[-3] = (unsigned char) pop_failure_jump;
7083 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
7084 }
7085 }
7086 #endif /* not WCHAR */
7087 }
7088 #ifndef WCHAR
7089 else if ((re_opcode_t) *p2 == charset)
7090 {
7091 /* We win if the first character of the loop is not part
7092 of the charset. */
7093 if ((re_opcode_t) p1[3] == exactn
7094 && ! ((int) p2[1] * BYTEWIDTH > (int) p1[5]
7095 && (p2[2 + p1[5] / BYTEWIDTH]
7096 & (1 << (p1[5] % BYTEWIDTH)))))
7097 {
7098 p[-3] = (unsigned char) pop_failure_jump;
7099 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
7100 }
7101
7102 else if ((re_opcode_t) p1[3] == charset_not)
7103 {
7104 int idx;
7105 /* We win if the charset_not inside the loop
7106 lists every character listed in the charset after. */
7107 for (idx = 0; idx < (int) p2[1]; idx++)
7108 if (! (p2[2 + idx] == 0
7109 || (idx < (int) p1[4]
7110 && ((p2[2 + idx] & ~ p1[5 + idx]) == 0))))
7111 break;
7112
7113 if (idx == p2[1])
7114 {
7115 p[-3] = (unsigned char) pop_failure_jump;
7116 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
7117 }
7118 }
7119 else if ((re_opcode_t) p1[3] == charset)
7120 {
7121 int idx;
7122 /* We win if the charset inside the loop
7123 has no overlap with the one after the loop. */
7124 for (idx = 0;
7125 idx < (int) p2[1] && idx < (int) p1[4];
7126 idx++)
7127 if ((p2[2 + idx] & p1[5 + idx]) != 0)
7128 break;
7129
7130 if (idx == p2[1] || idx == p1[4])
7131 {
7132 p[-3] = (unsigned char) pop_failure_jump;
7133 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
7134 }
7135 }
7136 }
7137 #endif /* not WCHAR */
7138 }
7139 p -= OFFSET_ADDRESS_SIZE; /* Point at relative address again. */
7140 if ((re_opcode_t) p[-1] != pop_failure_jump)
7141 {
7142 p[-1] = (UCHAR_T) jump;
7143 DEBUG_PRINT1 (" Match => jump.\n");
7144 goto unconditional_jump;
7145 }
7146 /* Fall through. */
7147
7148
7149 /* The end of a simple repeat has a pop_failure_jump back to
7150 its matching on_failure_jump, where the latter will push a
7151 failure point. The pop_failure_jump takes off failure
7152 points put on by this pop_failure_jump's matching
7153 on_failure_jump; we got through the pattern to here from the
7154 matching on_failure_jump, so didn't fail. */
7155 case pop_failure_jump:
7156 {
7157 /* We need to pass separate storage for the lowest and
7158 highest registers, even though we don't care about the
7159 actual values. Otherwise, we will restore only one
7160 register from the stack, since lowest will == highest in
7161 `pop_failure_point'. */
7162 active_reg_t dummy_low_reg, dummy_high_reg;
7163 UCHAR_T *pdummy ATTRIBUTE_UNUSED = NULL;
7164 const CHAR_T *sdummy ATTRIBUTE_UNUSED = NULL;
7165
7166 DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n");
7167 POP_FAILURE_POINT (sdummy, pdummy,
7168 dummy_low_reg, dummy_high_reg,
7169 reg_dummy, reg_dummy, reg_info_dummy);
7170 }
7171 /* Fall through. */
7172
7173 unconditional_jump:
7174 #ifdef _LIBC
7175 DEBUG_PRINT2 ("\n%p: ", p);
7176 #else
7177 DEBUG_PRINT2 ("\n0x%x: ", p);
7178 #endif
7179 /* Note fall through. */
7180
7181 /* Unconditionally jump (without popping any failure points). */
7182 case jump:
7183 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
7184 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
7185 p += mcnt; /* Do the jump. */
7186 #ifdef _LIBC
7187 DEBUG_PRINT2 ("(to %p).\n", p);
7188 #else
7189 DEBUG_PRINT2 ("(to 0x%x).\n", p);
7190 #endif
7191 break;
7192
7193
7194 /* We need this opcode so we can detect where alternatives end
7195 in `group_match_null_string_p' et al. */
7196 case jump_past_alt:
7197 DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n");
7198 goto unconditional_jump;
7199
7200
7201 /* Normally, the on_failure_jump pushes a failure point, which
7202 then gets popped at pop_failure_jump. We will end up at
7203 pop_failure_jump, also, and with a pattern of, say, `a+', we
7204 are skipping over the on_failure_jump, so we have to push
7205 something meaningless for pop_failure_jump to pop. */
7206 case dummy_failure_jump:
7207 DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n");
7208 /* It doesn't matter what we push for the string here. What
7209 the code at `fail' tests is the value for the pattern. */
7210 PUSH_FAILURE_POINT (NULL, NULL, -2);
7211 goto unconditional_jump;
7212
7213
7214 /* At the end of an alternative, we need to push a dummy failure
7215 point in case we are followed by a `pop_failure_jump', because
7216 we don't want the failure point for the alternative to be
7217 popped. For example, matching `(a|ab)*' against `aab'
7218 requires that we match the `ab' alternative. */
7219 case push_dummy_failure:
7220 DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n");
7221 /* See comments just above at `dummy_failure_jump' about the
7222 two zeroes. */
7223 PUSH_FAILURE_POINT (NULL, NULL, -2);
7224 break;
7225
7226 /* Have to succeed matching what follows at least n times.
7227 After that, handle like `on_failure_jump'. */
7228 case succeed_n:
7229 EXTRACT_NUMBER (mcnt, p + OFFSET_ADDRESS_SIZE);
7230 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
7231
7232 assert (mcnt >= 0);
7233 /* Originally, this is how many times we HAVE to succeed. */
7234 if (mcnt > 0)
7235 {
7236 mcnt--;
7237 p += OFFSET_ADDRESS_SIZE;
7238 STORE_NUMBER_AND_INCR (p, mcnt);
7239 #ifdef _LIBC
7240 DEBUG_PRINT3 (" Setting %p to %d.\n", p - OFFSET_ADDRESS_SIZE
7241 , mcnt);
7242 #else
7243 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p - OFFSET_ADDRESS_SIZE
7244 , mcnt);
7245 #endif
7246 }
7247 else if (mcnt == 0)
7248 {
7249 #ifdef _LIBC
7250 DEBUG_PRINT2 (" Setting two bytes from %p to no_op.\n",
7251 p + OFFSET_ADDRESS_SIZE);
7252 #else
7253 DEBUG_PRINT2 (" Setting two bytes from 0x%x to no_op.\n",
7254 p + OFFSET_ADDRESS_SIZE);
7255 #endif /* _LIBC */
7256
7257 #ifdef WCHAR
7258 p[1] = (UCHAR_T) no_op;
7259 #else
7260 p[2] = (UCHAR_T) no_op;
7261 p[3] = (UCHAR_T) no_op;
7262 #endif /* WCHAR */
7263 goto on_failure;
7264 }
7265 break;
7266
7267 case jump_n:
7268 EXTRACT_NUMBER (mcnt, p + OFFSET_ADDRESS_SIZE);
7269 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
7270
7271 /* Originally, this is how many times we CAN jump. */
7272 if (mcnt)
7273 {
7274 mcnt--;
7275 STORE_NUMBER (p + OFFSET_ADDRESS_SIZE, mcnt);
7276
7277 #ifdef _LIBC
7278 DEBUG_PRINT3 (" Setting %p to %d.\n", p + OFFSET_ADDRESS_SIZE,
7279 mcnt);
7280 #else
7281 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p + OFFSET_ADDRESS_SIZE,
7282 mcnt);
7283 #endif /* _LIBC */
7284 goto unconditional_jump;
7285 }
7286 /* If we don't have to jump any more, skip over the rest of the command. */
7287 else
7288 p += 2 * OFFSET_ADDRESS_SIZE;
7289 break;
7290
7291 case set_number_at:
7292 {
7293 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
7294
7295 EXTRACT_NUMBER_AND_INCR (mcnt, p);
7296 p1 = p + mcnt;
7297 EXTRACT_NUMBER_AND_INCR (mcnt, p);
7298 #ifdef _LIBC
7299 DEBUG_PRINT3 (" Setting %p to %d.\n", p1, mcnt);
7300 #else
7301 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p1, mcnt);
7302 #endif
7303 STORE_NUMBER (p1, mcnt);
7304 break;
7305 }
7306
7307 #if 0
7308 /* The DEC Alpha C compiler 3.x generates incorrect code for the
7309 test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of
7310 AT_WORD_BOUNDARY, so this code is disabled. Expanding the
7311 macro and introducing temporary variables works around the bug. */
7312
7313 case wordbound:
7314 DEBUG_PRINT1 ("EXECUTING wordbound.\n");
7315 if (AT_WORD_BOUNDARY (d))
7316 break;
7317 goto fail;
7318
7319 case notwordbound:
7320 DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
7321 if (AT_WORD_BOUNDARY (d))
7322 goto fail;
7323 break;
7324 #else
7325 case wordbound:
7326 {
7327 boolean prevchar, thischar;
7328
7329 DEBUG_PRINT1 ("EXECUTING wordbound.\n");
7330 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
7331 break;
7332
7333 prevchar = WORDCHAR_P (d - 1);
7334 thischar = WORDCHAR_P (d);
7335 if (prevchar != thischar)
7336 break;
7337 goto fail;
7338 }
7339
7340 case notwordbound:
7341 {
7342 boolean prevchar, thischar;
7343
7344 DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
7345 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
7346 goto fail;
7347
7348 prevchar = WORDCHAR_P (d - 1);
7349 thischar = WORDCHAR_P (d);
7350 if (prevchar != thischar)
7351 goto fail;
7352 break;
7353 }
7354 #endif
7355
7356 case wordbeg:
7357 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
7358 if (!AT_STRINGS_END (d) && WORDCHAR_P (d)
7359 && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1)))
7360 break;
7361 goto fail;
7362
7363 case wordend:
7364 DEBUG_PRINT1 ("EXECUTING wordend.\n");
7365 if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1)
7366 && (AT_STRINGS_END (d) || !WORDCHAR_P (d)))
7367 break;
7368 goto fail;
7369
7370 #ifdef emacs
7371 case before_dot:
7372 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
7373 if (PTR_CHAR_POS ((unsigned char *) d) >= point)
7374 goto fail;
7375 break;
7376
7377 case at_dot:
7378 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
7379 if (PTR_CHAR_POS ((unsigned char *) d) != point)
7380 goto fail;
7381 break;
7382
7383 case after_dot:
7384 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
7385 if (PTR_CHAR_POS ((unsigned char *) d) <= point)
7386 goto fail;
7387 break;
7388
7389 case syntaxspec:
7390 DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt);
7391 mcnt = *p++;
7392 goto matchsyntax;
7393
7394 case wordchar:
7395 DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n");
7396 mcnt = (int) Sword;
7397 matchsyntax:
7398 PREFETCH ();
7399 /* Can't use *d++ here; SYNTAX may be an unsafe macro. */
7400 d++;
7401 if (SYNTAX (d[-1]) != (enum syntaxcode) mcnt)
7402 goto fail;
7403 SET_REGS_MATCHED ();
7404 break;
7405
7406 case notsyntaxspec:
7407 DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt);
7408 mcnt = *p++;
7409 goto matchnotsyntax;
7410
7411 case notwordchar:
7412 DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n");
7413 mcnt = (int) Sword;
7414 matchnotsyntax:
7415 PREFETCH ();
7416 /* Can't use *d++ here; SYNTAX may be an unsafe macro. */
7417 d++;
7418 if (SYNTAX (d[-1]) == (enum syntaxcode) mcnt)
7419 goto fail;
7420 SET_REGS_MATCHED ();
7421 break;
7422
7423 #else /* not emacs */
7424 case wordchar:
7425 DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n");
7426 PREFETCH ();
7427 if (!WORDCHAR_P (d))
7428 goto fail;
7429 SET_REGS_MATCHED ();
7430 d++;
7431 break;
7432
7433 case notwordchar:
7434 DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n");
7435 PREFETCH ();
7436 if (WORDCHAR_P (d))
7437 goto fail;
7438 SET_REGS_MATCHED ();
7439 d++;
7440 break;
7441 #endif /* not emacs */
7442
7443 default:
7444 abort ();
7445 }
7446 continue; /* Successfully executed one pattern command; keep going. */
7447
7448
7449 /* We goto here if a matching operation fails. */
7450 fail:
7451 if (!FAIL_STACK_EMPTY ())
7452 { /* A restart point is known. Restore to that state. */
7453 DEBUG_PRINT1 ("\nFAIL:\n");
7454 POP_FAILURE_POINT (d, p,
7455 lowest_active_reg, highest_active_reg,
7456 regstart, regend, reg_info);
7457
7458 /* If this failure point is a dummy, try the next one. */
7459 if (!p)
7460 goto fail;
7461
7462 /* If we failed to the end of the pattern, don't examine *p. */
7463 assert (p <= pend);
7464 if (p < pend)
7465 {
7466 boolean is_a_jump_n = false;
7467
7468 /* If failed to a backwards jump that's part of a repetition
7469 loop, need to pop this failure point and use the next one. */
7470 switch ((re_opcode_t) *p)
7471 {
7472 case jump_n:
7473 is_a_jump_n = true;
7474 /* Fall through. */
7475 case maybe_pop_jump:
7476 case pop_failure_jump:
7477 case jump:
7478 p1 = p + 1;
7479 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7480 p1 += mcnt;
7481
7482 if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n)
7483 || (!is_a_jump_n
7484 && (re_opcode_t) *p1 == on_failure_jump))
7485 goto fail;
7486 break;
7487 default:
7488 /* do nothing */ ;
7489 }
7490 }
7491
7492 if (d >= string1 && d <= end1)
7493 dend = end_match_1;
7494 }
7495 else
7496 break; /* Matching at this starting point really fails. */
7497 } /* for (;;) */
7498
7499 if (best_regs_set)
7500 goto restore_best_regs;
7501
7502 FREE_VARIABLES ();
7503
7504 return -1; /* Failure to match. */
7505 } /* re_match_2 */
7506
7507 /* Subroutine definitions for re_match_2. */
7509
7510
7511 /* We are passed P pointing to a register number after a start_memory.
7512
7513 Return true if the pattern up to the corresponding stop_memory can
7514 match the empty string, and false otherwise.
7515
7516 If we find the matching stop_memory, sets P to point to one past its number.
7517 Otherwise, sets P to an undefined byte less than or equal to END.
7518
7519 We don't handle duplicates properly (yet). */
7520
7521 static boolean
7522 PREFIX(group_match_null_string_p) (UCHAR_T **p, UCHAR_T *end,
7523 PREFIX(register_info_type) *reg_info)
7524 {
7525 int mcnt;
7526 /* Point to after the args to the start_memory. */
7527 UCHAR_T *p1 = *p + 2;
7528
7529 while (p1 < end)
7530 {
7531 /* Skip over opcodes that can match nothing, and return true or
7532 false, as appropriate, when we get to one that can't, or to the
7533 matching stop_memory. */
7534
7535 switch ((re_opcode_t) *p1)
7536 {
7537 /* Could be either a loop or a series of alternatives. */
7538 case on_failure_jump:
7539 p1++;
7540 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7541
7542 /* If the next operation is not a jump backwards in the
7543 pattern. */
7544
7545 if (mcnt >= 0)
7546 {
7547 /* Go through the on_failure_jumps of the alternatives,
7548 seeing if any of the alternatives cannot match nothing.
7549 The last alternative starts with only a jump,
7550 whereas the rest start with on_failure_jump and end
7551 with a jump, e.g., here is the pattern for `a|b|c':
7552
7553 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6
7554 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3
7555 /exactn/1/c
7556
7557 So, we have to first go through the first (n-1)
7558 alternatives and then deal with the last one separately. */
7559
7560
7561 /* Deal with the first (n-1) alternatives, which start
7562 with an on_failure_jump (see above) that jumps to right
7563 past a jump_past_alt. */
7564
7565 while ((re_opcode_t) p1[mcnt-(1+OFFSET_ADDRESS_SIZE)] ==
7566 jump_past_alt)
7567 {
7568 /* `mcnt' holds how many bytes long the alternative
7569 is, including the ending `jump_past_alt' and
7570 its number. */
7571
7572 if (!PREFIX(alt_match_null_string_p) (p1, p1 + mcnt -
7573 (1 + OFFSET_ADDRESS_SIZE),
7574 reg_info))
7575 return false;
7576
7577 /* Move to right after this alternative, including the
7578 jump_past_alt. */
7579 p1 += mcnt;
7580
7581 /* Break if it's the beginning of an n-th alternative
7582 that doesn't begin with an on_failure_jump. */
7583 if ((re_opcode_t) *p1 != on_failure_jump)
7584 break;
7585
7586 /* Still have to check that it's not an n-th
7587 alternative that starts with an on_failure_jump. */
7588 p1++;
7589 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7590 if ((re_opcode_t) p1[mcnt-(1+OFFSET_ADDRESS_SIZE)] !=
7591 jump_past_alt)
7592 {
7593 /* Get to the beginning of the n-th alternative. */
7594 p1 -= 1 + OFFSET_ADDRESS_SIZE;
7595 break;
7596 }
7597 }
7598
7599 /* Deal with the last alternative: go back and get number
7600 of the `jump_past_alt' just before it. `mcnt' contains
7601 the length of the alternative. */
7602 EXTRACT_NUMBER (mcnt, p1 - OFFSET_ADDRESS_SIZE);
7603
7604 if (!PREFIX(alt_match_null_string_p) (p1, p1 + mcnt, reg_info))
7605 return false;
7606
7607 p1 += mcnt; /* Get past the n-th alternative. */
7608 } /* if mcnt > 0 */
7609 break;
7610
7611
7612 case stop_memory:
7613 assert (p1[1] == **p);
7614 *p = p1 + 2;
7615 return true;
7616
7617
7618 default:
7619 if (!PREFIX(common_op_match_null_string_p) (&p1, end, reg_info))
7620 return false;
7621 }
7622 } /* while p1 < end */
7623
7624 return false;
7625 } /* group_match_null_string_p */
7626
7627
7628 /* Similar to group_match_null_string_p, but doesn't deal with alternatives:
7629 It expects P to be the first byte of a single alternative and END one
7630 byte past the last. The alternative can contain groups. */
7631
7632 static boolean
7633 PREFIX(alt_match_null_string_p) (UCHAR_T *p, UCHAR_T *end,
7634 PREFIX(register_info_type) *reg_info)
7635 {
7636 int mcnt;
7637 UCHAR_T *p1 = p;
7638
7639 while (p1 < end)
7640 {
7641 /* Skip over opcodes that can match nothing, and break when we get
7642 to one that can't. */
7643
7644 switch ((re_opcode_t) *p1)
7645 {
7646 /* It's a loop. */
7647 case on_failure_jump:
7648 p1++;
7649 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7650 p1 += mcnt;
7651 break;
7652
7653 default:
7654 if (!PREFIX(common_op_match_null_string_p) (&p1, end, reg_info))
7655 return false;
7656 }
7657 } /* while p1 < end */
7658
7659 return true;
7660 } /* alt_match_null_string_p */
7661
7662
7663 /* Deals with the ops common to group_match_null_string_p and
7664 alt_match_null_string_p.
7665
7666 Sets P to one after the op and its arguments, if any. */
7667
7668 static boolean
7669 PREFIX(common_op_match_null_string_p) (UCHAR_T **p, UCHAR_T *end,
7670 PREFIX(register_info_type) *reg_info)
7671 {
7672 int mcnt;
7673 boolean ret;
7674 int reg_no;
7675 UCHAR_T *p1 = *p;
7676
7677 switch ((re_opcode_t) *p1++)
7678 {
7679 case no_op:
7680 case begline:
7681 case endline:
7682 case begbuf:
7683 case endbuf:
7684 case wordbeg:
7685 case wordend:
7686 case wordbound:
7687 case notwordbound:
7688 #ifdef emacs
7689 case before_dot:
7690 case at_dot:
7691 case after_dot:
7692 #endif
7693 break;
7694
7695 case start_memory:
7696 reg_no = *p1;
7697 assert (reg_no > 0 && reg_no <= MAX_REGNUM);
7698 ret = PREFIX(group_match_null_string_p) (&p1, end, reg_info);
7699
7700 /* Have to set this here in case we're checking a group which
7701 contains a group and a back reference to it. */
7702
7703 if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE)
7704 REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret;
7705
7706 if (!ret)
7707 return false;
7708 break;
7709
7710 /* If this is an optimized succeed_n for zero times, make the jump. */
7711 case jump:
7712 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7713 if (mcnt >= 0)
7714 p1 += mcnt;
7715 else
7716 return false;
7717 break;
7718
7719 case succeed_n:
7720 /* Get to the number of times to succeed. */
7721 p1 += OFFSET_ADDRESS_SIZE;
7722 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7723
7724 if (mcnt == 0)
7725 {
7726 p1 -= 2 * OFFSET_ADDRESS_SIZE;
7727 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7728 p1 += mcnt;
7729 }
7730 else
7731 return false;
7732 break;
7733
7734 case duplicate:
7735 if (!REG_MATCH_NULL_STRING_P (reg_info[*p1]))
7736 return false;
7737 break;
7738
7739 case set_number_at:
7740 p1 += 2 * OFFSET_ADDRESS_SIZE;
7741 return false;
7742
7743 default:
7744 /* All other opcodes mean we cannot match the empty string. */
7745 return false;
7746 }
7747
7748 *p = p1;
7749 return true;
7750 } /* common_op_match_null_string_p */
7751
7752
7753 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
7754 bytes; nonzero otherwise. */
7755
7756 static int
7757 PREFIX(bcmp_translate) (const CHAR_T *s1, const CHAR_T *s2, register int len,
7758 RE_TRANSLATE_TYPE translate)
7759 {
7760 register const UCHAR_T *p1 = (const UCHAR_T *) s1;
7761 register const UCHAR_T *p2 = (const UCHAR_T *) s2;
7762 while (len)
7763 {
7764 #ifdef WCHAR
7765 if (((*p1<=0xff)?translate[*p1++]:*p1++)
7766 != ((*p2<=0xff)?translate[*p2++]:*p2++))
7767 return 1;
7768 #else /* BYTE */
7769 if (translate[*p1++] != translate[*p2++]) return 1;
7770 #endif /* WCHAR */
7771 len--;
7772 }
7773 return 0;
7774 }
7775
7776
7778 #else /* not INSIDE_RECURSION */
7779
7780 /* Entry points for GNU code. */
7781
7782 /* re_compile_pattern is the GNU regular expression compiler: it
7783 compiles PATTERN (of length SIZE) and puts the result in BUFP.
7784 Returns 0 if the pattern was valid, otherwise an error string.
7785
7786 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
7787 are set in BUFP on entry.
7788
7789 We call regex_compile to do the actual compilation. */
7790
7791 const char *
7792 re_compile_pattern (const char *pattern, size_t length,
7793 struct re_pattern_buffer *bufp)
7794 {
7795 reg_errcode_t ret;
7796
7797 /* GNU code is written to assume at least RE_NREGS registers will be set
7798 (and at least one extra will be -1). */
7799 bufp->regs_allocated = REGS_UNALLOCATED;
7800
7801 /* And GNU code determines whether or not to get register information
7802 by passing null for the REGS argument to re_match, etc., not by
7803 setting no_sub. */
7804 bufp->no_sub = 0;
7805
7806 /* Match anchors at newline. */
7807 bufp->newline_anchor = 1;
7808
7809 # ifdef MBS_SUPPORT
7810 if (MB_CUR_MAX != 1)
7811 ret = wcs_regex_compile (pattern, length, re_syntax_options, bufp);
7812 else
7813 # endif
7814 ret = byte_regex_compile (pattern, length, re_syntax_options, bufp);
7815
7816 if (!ret)
7817 return NULL;
7818 return gettext (re_error_msgid[(int) ret]);
7819 }
7820 #ifdef _LIBC
7821 weak_alias (__re_compile_pattern, re_compile_pattern)
7822 #endif
7823
7824 /* Entry points compatible with 4.2 BSD regex library. We don't define
7826 them unless specifically requested. */
7827
7828 #if defined _REGEX_RE_COMP || defined _LIBC
7829
7830 /* BSD has one and only one pattern buffer. */
7831 static struct re_pattern_buffer re_comp_buf;
7832
7833 char *
7834 #ifdef _LIBC
7835 /* Make these definitions weak in libc, so POSIX programs can redefine
7836 these names if they don't use our functions, and still use
7837 regcomp/regexec below without link errors. */
7838 weak_function
7839 #endif
7840 re_comp (const char *s)
7841 {
7842 reg_errcode_t ret;
7843
7844 if (!s)
7845 {
7846 if (!re_comp_buf.buffer)
7847 return (char *) gettext ("No previous regular expression");
7848 return 0;
7849 }
7850
7851 if (!re_comp_buf.buffer)
7852 {
7853 re_comp_buf.buffer = (unsigned char *) malloc (200);
7854 if (re_comp_buf.buffer == NULL)
7855 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
7856 re_comp_buf.allocated = 200;
7857
7858 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
7859 if (re_comp_buf.fastmap == NULL)
7860 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
7861 }
7862
7863 /* Since `re_exec' always passes NULL for the `regs' argument, we
7864 don't need to initialize the pattern buffer fields which affect it. */
7865
7866 /* Match anchors at newlines. */
7867 re_comp_buf.newline_anchor = 1;
7868
7869 # ifdef MBS_SUPPORT
7870 if (MB_CUR_MAX != 1)
7871 ret = wcs_regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
7872 else
7873 # endif
7874 ret = byte_regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
7875
7876 if (!ret)
7877 return NULL;
7878
7879 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
7880 return (char *) gettext (re_error_msgid[(int) ret]);
7881 }
7882
7883
7884 int
7885 #ifdef _LIBC
7886 weak_function
7887 #endif
7888 re_exec (const char *s)
7889 {
7890 const int len = strlen (s);
7891 return
7892 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
7893 }
7894
7895 #endif /* _REGEX_RE_COMP */
7896
7897 /* POSIX.2 functions. Don't define these for Emacs. */
7899
7900 #ifndef emacs
7901
7902 /* regcomp takes a regular expression as a string and compiles it.
7903
7904 PREG is a regex_t *. We do not expect any fields to be initialized,
7905 since POSIX says we shouldn't. Thus, we set
7906
7907 `buffer' to the compiled pattern;
7908 `used' to the length of the compiled pattern;
7909 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
7910 REG_EXTENDED bit in CFLAGS is set; otherwise, to
7911 RE_SYNTAX_POSIX_BASIC;
7912 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
7913 `fastmap' to an allocated space for the fastmap;
7914 `fastmap_accurate' to zero;
7915 `re_nsub' to the number of subexpressions in PATTERN.
7916
7917 PATTERN is the address of the pattern string.
7918
7919 CFLAGS is a series of bits which affect compilation.
7920
7921 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
7922 use POSIX basic syntax.
7923
7924 If REG_NEWLINE is set, then . and [^...] don't match newline.
7925 Also, regexec will try a match beginning after every newline.
7926
7927 If REG_ICASE is set, then we considers upper- and lowercase
7928 versions of letters to be equivalent when matching.
7929
7930 If REG_NOSUB is set, then when PREG is passed to regexec, that
7931 routine will report only success or failure, and nothing about the
7932 registers.
7933
7934 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
7935 the return codes and their meanings.) */
7936
7937 int
7938 regcomp (regex_t *preg, const char *pattern, int cflags)
7939 {
7940 reg_errcode_t ret;
7941 reg_syntax_t syntax
7942 = (cflags & REG_EXTENDED) ?
7943 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
7944
7945 /* regex_compile will allocate the space for the compiled pattern. */
7946 preg->buffer = 0;
7947 preg->allocated = 0;
7948 preg->used = 0;
7949
7950 /* Try to allocate space for the fastmap. */
7951 preg->fastmap = (char *) malloc (1 << BYTEWIDTH);
7952
7953 if (cflags & REG_ICASE)
7954 {
7955 int i;
7956
7957 preg->translate
7958 = (RE_TRANSLATE_TYPE) malloc (CHAR_SET_SIZE
7959 * sizeof (*(RE_TRANSLATE_TYPE)0));
7960 if (preg->translate == NULL)
7961 return (int) REG_ESPACE;
7962
7963 /* Map uppercase characters to corresponding lowercase ones. */
7964 for (i = 0; i < CHAR_SET_SIZE; i++)
7965 preg->translate[i] = ISUPPER (i) ? TOLOWER (i) : i;
7966 }
7967 else
7968 preg->translate = NULL;
7969
7970 /* If REG_NEWLINE is set, newlines are treated differently. */
7971 if (cflags & REG_NEWLINE)
7972 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
7973 syntax &= ~RE_DOT_NEWLINE;
7974 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
7975 /* It also changes the matching behavior. */
7976 preg->newline_anchor = 1;
7977 }
7978 else
7979 preg->newline_anchor = 0;
7980
7981 preg->no_sub = !!(cflags & REG_NOSUB);
7982
7983 /* POSIX says a null character in the pattern terminates it, so we
7984 can use strlen here in compiling the pattern. */
7985 # ifdef MBS_SUPPORT
7986 if (MB_CUR_MAX != 1)
7987 ret = wcs_regex_compile (pattern, strlen (pattern), syntax, preg);
7988 else
7989 # endif
7990 ret = byte_regex_compile (pattern, strlen (pattern), syntax, preg);
7991
7992 /* POSIX doesn't distinguish between an unmatched open-group and an
7993 unmatched close-group: both are REG_EPAREN. */
7994 if (ret == REG_ERPAREN) ret = REG_EPAREN;
7995
7996 if (ret == REG_NOERROR && preg->fastmap)
7997 {
7998 /* Compute the fastmap now, since regexec cannot modify the pattern
7999 buffer. */
8000 if (re_compile_fastmap (preg) == -2)
8001 {
8002 /* Some error occurred while computing the fastmap, just forget
8003 about it. */
8004 free (preg->fastmap);
8005 preg->fastmap = NULL;
8006 }
8007 }
8008
8009 return (int) ret;
8010 }
8011 #ifdef _LIBC
8012 weak_alias (__regcomp, regcomp)
8013 #endif
8014
8015
8016 /* regexec searches for a given pattern, specified by PREG, in the
8017 string STRING.
8018
8019 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
8020 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
8021 least NMATCH elements, and we set them to the offsets of the
8022 corresponding matched substrings.
8023
8024 EFLAGS specifies `execution flags' which affect matching: if
8025 REG_NOTBOL is set, then ^ does not match at the beginning of the
8026 string; if REG_NOTEOL is set, then $ does not match at the end.
8027
8028 We return 0 if we find a match and REG_NOMATCH if not. */
8029
8030 int
8031 regexec (const regex_t *preg, const char *string, size_t nmatch,
8032 regmatch_t pmatch[], int eflags)
8033 {
8034 int ret;
8035 struct re_registers regs;
8036 regex_t private_preg;
8037 int len = strlen (string);
8038 boolean want_reg_info = !preg->no_sub && nmatch > 0;
8039
8040 private_preg = *preg;
8041
8042 private_preg.not_bol = !!(eflags & REG_NOTBOL);
8043 private_preg.not_eol = !!(eflags & REG_NOTEOL);
8044
8045 /* The user has told us exactly how many registers to return
8046 information about, via `nmatch'. We have to pass that on to the
8047 matching routines. */
8048 private_preg.regs_allocated = REGS_FIXED;
8049
8050 if (want_reg_info)
8051 {
8052 regs.num_regs = nmatch;
8053 regs.start = TALLOC (nmatch * 2, regoff_t);
8054 if (regs.start == NULL)
8055 return (int) REG_NOMATCH;
8056 regs.end = regs.start + nmatch;
8057 }
8058
8059 /* Perform the searching operation. */
8060 ret = re_search (&private_preg, string, len,
8061 /* start: */ 0, /* range: */ len,
8062 want_reg_info ? ®s : (struct re_registers *) 0);
8063
8064 /* Copy the register information to the POSIX structure. */
8065 if (want_reg_info)
8066 {
8067 if (ret >= 0)
8068 {
8069 unsigned r;
8070
8071 for (r = 0; r < nmatch; r++)
8072 {
8073 pmatch[r].rm_so = regs.start[r];
8074 pmatch[r].rm_eo = regs.end[r];
8075 }
8076 }
8077
8078 /* If we needed the temporary register info, free the space now. */
8079 free (regs.start);
8080 }
8081
8082 /* We want zero return to mean success, unlike `re_search'. */
8083 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
8084 }
8085 #ifdef _LIBC
8086 weak_alias (__regexec, regexec)
8087 #endif
8088
8089
8090 /* Returns a message corresponding to an error code, ERRCODE, returned
8091 from either regcomp or regexec. We don't use PREG here. */
8092
8093 size_t
8094 regerror (int errcode, const regex_t *preg ATTRIBUTE_UNUSED,
8095 char *errbuf, size_t errbuf_size)
8096 {
8097 const char *msg;
8098 size_t msg_size;
8099
8100 if (errcode < 0
8101 || errcode >= (int) (sizeof (re_error_msgid)
8102 / sizeof (re_error_msgid[0])))
8103 /* Only error codes returned by the rest of the code should be passed
8104 to this routine. If we are given anything else, or if other regex
8105 code generates an invalid error code, then the program has a bug.
8106 Dump core so we can fix it. */
8107 abort ();
8108
8109 msg = gettext (re_error_msgid[errcode]);
8110
8111 msg_size = strlen (msg) + 1; /* Includes the null. */
8112
8113 if (errbuf_size != 0)
8114 {
8115 if (msg_size > errbuf_size)
8116 {
8117 #if defined HAVE_MEMPCPY || defined _LIBC
8118 *((char *) mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
8119 #else
8120 (void) memcpy (errbuf, msg, errbuf_size - 1);
8121 errbuf[errbuf_size - 1] = 0;
8122 #endif
8123 }
8124 else
8125 (void) memcpy (errbuf, msg, msg_size);
8126 }
8127
8128 return msg_size;
8129 }
8130 #ifdef _LIBC
8131 weak_alias (__regerror, regerror)
8132 #endif
8133
8134
8135 /* Free dynamically allocated space used by PREG. */
8136
8137 void
8138 regfree (regex_t *preg)
8139 {
8140 free (preg->buffer);
8141 preg->buffer = NULL;
8142
8143 preg->allocated = 0;
8144 preg->used = 0;
8145
8146 free (preg->fastmap);
8147 preg->fastmap = NULL;
8148 preg->fastmap_accurate = 0;
8149
8150 free (preg->translate);
8151 preg->translate = NULL;
8152 }
8153 #ifdef _LIBC
8154 weak_alias (__regfree, regfree)
8155 #endif
8156
8157 #endif /* not emacs */
8158
8159 #endif /* not INSIDE_RECURSION */
8160
8161
/* Remove the helper macros used by the code above.
   NOTE(review): presumably this lets a further inclusion of this file
   (see INSIDE_RECURSION) define them afresh for the other character
   width -- confirm against the top of the file.  */

/* Number encoding in the compiled pattern.  */
#undef STORE_NUMBER
#undef STORE_NUMBER_AND_INCR
#undef EXTRACT_NUMBER
#undef EXTRACT_NUMBER_AND_INCR

/* Debugging output.  */
#undef DEBUG_PRINT_COMPILED_PATTERN
#undef DEBUG_PRINT_DOUBLE_STRING

/* Failure stack.  */
#undef INIT_FAIL_STACK
#undef RESET_FAIL_STACK
#undef DOUBLE_FAIL_STACK
#undef PUSH_PATTERN_OP
#undef PUSH_FAILURE_POINTER
#undef PUSH_FAILURE_INT
#undef PUSH_FAILURE_ELT
#undef POP_FAILURE_POINTER
#undef POP_FAILURE_INT
#undef POP_FAILURE_ELT
#undef DEBUG_PUSH
#undef DEBUG_POP
#undef PUSH_FAILURE_POINT
#undef POP_FAILURE_POINT

/* Register bookkeeping.  */
#undef REG_UNSET_VALUE
#undef REG_UNSET

/* Pattern fetching.  */
#undef PATFETCH
#undef PATFETCH_RAW
#undef PATUNFETCH
#undef TRANSLATE

/* Compiled-pattern buffer handling.  */
#undef INIT_BUF_SIZE
#undef GET_BUFFER_SPACE
#undef BUF_PUSH
#undef BUF_PUSH_2
#undef BUF_PUSH_3
#undef STORE_JUMP
#undef STORE_JUMP2
#undef INSERT_JUMP
#undef INSERT_JUMP2
#undef EXTEND_BUFFER
#undef GET_UNSIGNED_NUMBER
#undef FREE_STACK_RETURN

/* Matcher helpers.  */
# undef POINTER_TO_OFFSET
/* NOTE(review): MATCHING_IN_FRST_STRING looks like a misspelling of
   MATCHING_IN_FIRST_STRING.  Both spellings are undefined here so the
   intended macro does not survive either way; undefining a name that
   is not defined has no effect.  Confirm against the macro's
   definition and drop the misspelled line.  */
# undef MATCHING_IN_FRST_STRING
# undef MATCHING_IN_FIRST_STRING
# undef PREFETCH
# undef AT_STRINGS_BEG
# undef AT_STRINGS_END
# undef WORDCHAR_P
# undef FREE_VAR
# undef FREE_VARIABLES
# undef NO_HIGHEST_ACTIVE_REG
# undef NO_LOWEST_ACTIVE_REG

/* Character-width selection.  */
# undef CHAR_T
# undef UCHAR_T
# undef COMPILED_BUFFER_VAR
# undef OFFSET_ADDRESS_SIZE
# undef CHAR_CLASS_SIZE
# undef PREFIX
# undef ARG_PREFIX
# undef PUT_CHAR
# undef BYTE
# undef WCHAR

# define DEFINED_ONCE
8230