regex.c revision 1.1 1 /* Extended regular expression matching and search library,
2 version 0.12.
3 (Implements POSIX draft P1003.2/D11.2, except for some of the
4 internationalization features.)
5
6 Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
7 2002, 2005, 2010 Free Software Foundation, Inc.
8 This file is part of the GNU C Library.
9
10 The GNU C Library is free software; you can redistribute it and/or
11 modify it under the terms of the GNU Lesser General Public
12 License as published by the Free Software Foundation; either
13 version 2.1 of the License, or (at your option) any later version.
14
15 The GNU C Library is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 Lesser General Public License for more details.
19
20 You should have received a copy of the GNU Lesser General Public
21 License along with the GNU C Library; if not, write to the Free
22 Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
23 02110-1301 USA. */
24
25 /* This file has been modified for usage in libiberty. It includes "xregex.h"
26 instead of <regex.h>. The "xregex.h" header file renames all external
27 routines with an "x" prefix so they do not collide with the native regex
28 routines or with other components regex routines. */
29 /* AIX requires this to be the first thing in the file. */
30 #if defined _AIX && !defined __GNUC__ && !defined REGEX_MALLOC
31 #pragma alloca
32 #endif
33
34 #undef _GNU_SOURCE
35 #define _GNU_SOURCE
36
37 #ifndef INSIDE_RECURSION
38 # ifdef HAVE_CONFIG_H
39 # include <config.h>
40 # endif
41 #endif
42
43 #include <ansidecl.h>
44
45 #ifndef INSIDE_RECURSION
46
47 # if defined STDC_HEADERS && !defined emacs
48 # include <stddef.h>
49 # else
50 /* We need this for `regex.h', and perhaps for the Emacs include files. */
51 # include <sys/types.h>
52 # endif
53
54 # define WIDE_CHAR_SUPPORT (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC)
55
/* For platforms which support the ISO C Amendment 1 functionality, we
   support user-defined character classes.  */
58 # if defined _LIBC || WIDE_CHAR_SUPPORT
59 /* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */
60 # include <wchar.h>
61 # include <wctype.h>
62 # endif
63
64 # ifdef _LIBC
65 /* We have to keep the namespace clean. */
66 # define regfree(preg) __regfree (preg)
67 # define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef)
68 # define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags)
69 # define regerror(errcode, preg, errbuf, errbuf_size) \
70 __regerror(errcode, preg, errbuf, errbuf_size)
71 # define re_set_registers(bu, re, nu, st, en) \
72 __re_set_registers (bu, re, nu, st, en)
73 # define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \
74 __re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
75 # define re_match(bufp, string, size, pos, regs) \
76 __re_match (bufp, string, size, pos, regs)
77 # define re_search(bufp, string, size, startpos, range, regs) \
78 __re_search (bufp, string, size, startpos, range, regs)
79 # define re_compile_pattern(pattern, length, bufp) \
80 __re_compile_pattern (pattern, length, bufp)
81 # define re_set_syntax(syntax) __re_set_syntax (syntax)
82 # define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \
83 __re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
84 # define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)
85
86 # define btowc __btowc
87
88 /* We are also using some library internals. */
89 # include <locale/localeinfo.h>
90 # include <locale/elem-hash.h>
91 # include <langinfo.h>
92 # include <locale/coll-lookup.h>
93 # endif
94
95 /* This is for other GNU distributions with internationalized messages. */
96 # if (HAVE_LIBINTL_H && ENABLE_NLS) || defined _LIBC
97 # include <libintl.h>
98 # ifdef _LIBC
99 # undef gettext
100 # define gettext(msgid) __dcgettext ("libc", msgid, LC_MESSAGES)
101 # endif
102 # else
103 # define gettext(msgid) (msgid)
104 # endif
105
106 # ifndef gettext_noop
107 /* This define is so xgettext can find the internationalizable
108 strings. */
109 # define gettext_noop(String) String
110 # endif
111
112 /* The `emacs' switch turns on certain matching commands
113 that make sense only in Emacs. */
114 # ifdef emacs
115
116 # include "lisp.h"
117 # include "buffer.h"
118 # include "syntax.h"
119
120 # else /* not emacs */
121
122 /* If we are not linking with Emacs proper,
123 we can't use the relocating allocator
124 even if config.h says that we can. */
125 # undef REL_ALLOC
126
127 # if defined STDC_HEADERS || defined _LIBC
128 # include <stdlib.h>
129 # else
130 char *malloc ();
131 char *realloc ();
132 # endif
133
134 /* When used in Emacs's lib-src, we need to get bzero and bcopy somehow.
135 If nothing else has been done, use the method below. */
136 # ifdef INHIBIT_STRING_HEADER
137 # if !(defined HAVE_BZERO && defined HAVE_BCOPY)
138 # if !defined bzero && !defined bcopy
139 # undef INHIBIT_STRING_HEADER
140 # endif
141 # endif
142 # endif
143
144 /* This is the normal way of making sure we have a bcopy and a bzero.
145 This is used in most programs--a few other programs avoid this
146 by defining INHIBIT_STRING_HEADER. */
147 # ifndef INHIBIT_STRING_HEADER
148 # if defined HAVE_STRING_H || defined STDC_HEADERS || defined _LIBC
149 # include <string.h>
150 # ifndef bzero
151 # ifndef _LIBC
152 # define bzero(s, n) (memset (s, '\0', n), (s))
153 # else
154 # define bzero(s, n) __bzero (s, n)
155 # endif
156 # endif
157 # else
158 # include <strings.h>
159 # ifndef memcmp
160 # define memcmp(s1, s2, n) bcmp (s1, s2, n)
161 # endif
162 # ifndef memcpy
163 # define memcpy(d, s, n) (bcopy (s, d, n), (d))
164 # endif
165 # endif
166 # endif
167
168 /* Define the syntax stuff for \<, \>, etc. */
169
170 /* This must be nonzero for the wordchar and notwordchar pattern
171 commands in re_match_2. */
172 # ifndef Sword
173 # define Sword 1
174 # endif
175
176 # ifdef SWITCH_ENUM_BUG
177 # define SWITCH_ENUM_CAST(x) ((int)(x))
178 # else
179 # define SWITCH_ENUM_CAST(x) (x)
180 # endif
181
182 # endif /* not emacs */
183
184 # if defined _LIBC || HAVE_LIMITS_H
185 # include <limits.h>
186 # endif
187
188 # ifndef MB_LEN_MAX
189 # define MB_LEN_MAX 1
190 # endif
191
192 /* Get the interface, including the syntax bits. */
194 # include "xregex.h" /* change for libiberty */
195
196 /* isalpha etc. are used for the character classes. */
197 # include <ctype.h>
198
199 /* Jim Meyering writes:
200
201 "... Some ctype macros are valid only for character codes that
202 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
203 using /bin/cc or gcc but without giving an ansi option). So, all
204 ctype uses should be through macros like ISPRINT... If
205 STDC_HEADERS is defined, then autoconf has verified that the ctype
206 macros don't need to be guarded with references to isascii. ...
207 Defining isascii to 1 should let any compiler worth its salt
208 eliminate the && through constant folding."
209 Solaris defines some of these symbols so we must undefine them first. */
210
211 # undef ISASCII
212 # if defined STDC_HEADERS || (!defined isascii && !defined HAVE_ISASCII)
213 # define ISASCII(c) 1
214 # else
215 # define ISASCII(c) isascii(c)
216 # endif
217
218 # ifdef isblank
219 # define ISBLANK(c) (ISASCII (c) && isblank (c))
220 # else
221 # define ISBLANK(c) ((c) == ' ' || (c) == '\t')
222 # endif
223 # ifdef isgraph
224 # define ISGRAPH(c) (ISASCII (c) && isgraph (c))
225 # else
226 # define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
227 # endif
228
229 # undef ISPRINT
230 # define ISPRINT(c) (ISASCII (c) && isprint (c))
231 # define ISDIGIT(c) (ISASCII (c) && isdigit (c))
232 # define ISALNUM(c) (ISASCII (c) && isalnum (c))
233 # define ISALPHA(c) (ISASCII (c) && isalpha (c))
234 # define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
235 # define ISLOWER(c) (ISASCII (c) && islower (c))
236 # define ISPUNCT(c) (ISASCII (c) && ispunct (c))
237 # define ISSPACE(c) (ISASCII (c) && isspace (c))
238 # define ISUPPER(c) (ISASCII (c) && isupper (c))
239 # define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
240
241 # ifdef _tolower
242 # define TOLOWER(c) _tolower(c)
243 # else
244 # define TOLOWER(c) tolower(c)
245 # endif
246
247 # ifndef NULL
248 # define NULL (void *)0
249 # endif
250
251 /* We remove any previous definition of `SIGN_EXTEND_CHAR',
252 since ours (we hope) works properly with all combinations of
253 machines, compilers, `char' and `unsigned char' argument types.
254 (Per Bothner suggested the basic approach.) */
255 # undef SIGN_EXTEND_CHAR
256 # if __STDC__
257 # define SIGN_EXTEND_CHAR(c) ((signed char) (c))
258 # else /* not __STDC__ */
259 /* As in Harbison and Steele. */
260 # define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
261 # endif
262
263 # ifndef emacs
265 /* How many characters in the character set. */
266 # define CHAR_SET_SIZE 256
267
268 # ifdef SYNTAX_TABLE
269
270 extern char *re_syntax_table;
271
272 # else /* not SYNTAX_TABLE */
273
274 static char re_syntax_table[CHAR_SET_SIZE];
275
276 static void init_syntax_once (void);
277
278 static void
279 init_syntax_once (void)
280 {
281 register int c;
282 static int done = 0;
283
284 if (done)
285 return;
286 bzero (re_syntax_table, sizeof re_syntax_table);
287
288 for (c = 0; c < CHAR_SET_SIZE; ++c)
289 if (ISALNUM (c))
290 re_syntax_table[c] = Sword;
291
292 re_syntax_table['_'] = Sword;
293
294 done = 1;
295 }
296
297 # endif /* not SYNTAX_TABLE */
298
299 # define SYNTAX(c) re_syntax_table[(unsigned char) (c)]
300
301 # endif /* emacs */
302
303 /* Integer type for pointers. */
305 # if !defined _LIBC && !defined HAVE_UINTPTR_T
306 typedef unsigned long int uintptr_t;
307 # endif
308
309 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
310 use `alloca' instead of `malloc'. This is because using malloc in
311 re_search* or re_match* could cause memory leaks when C-g is used in
312 Emacs; also, malloc is slower and causes storage fragmentation. On
313 the other hand, malloc is more portable, and easier to debug.
314
315 Because we sometimes use alloca, some routines have to be macros,
316 not functions -- `alloca'-allocated space disappears at the end of the
317 function it is called in. */
318
319 # ifdef REGEX_MALLOC
320
321 # define REGEX_ALLOCATE malloc
322 # define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
323 # define REGEX_FREE free
324
325 # else /* not REGEX_MALLOC */
326
327 /* Emacs already defines alloca, sometimes. */
328 # ifndef alloca
329
330 /* Make alloca work the best possible way. */
331 # ifdef __GNUC__
332 # define alloca __builtin_alloca
333 # else /* not __GNUC__ */
334 # if HAVE_ALLOCA_H
335 # include <alloca.h>
336 # endif /* HAVE_ALLOCA_H */
337 # endif /* not __GNUC__ */
338
339 # endif /* not alloca */
340
341 # define REGEX_ALLOCATE alloca
342
343 /* Assumes a `char *destination' variable. */
344 # define REGEX_REALLOCATE(source, osize, nsize) \
345 (destination = (char *) alloca (nsize), \
346 memcpy (destination, source, osize))
347
348 /* No need to do anything to free, after alloca. */
349 # define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
350
351 # endif /* not REGEX_MALLOC */
352
353 /* Define how to allocate the failure stack. */
354
355 # if defined REL_ALLOC && defined REGEX_MALLOC
356
357 # define REGEX_ALLOCATE_STACK(size) \
358 r_alloc (&failure_stack_ptr, (size))
359 # define REGEX_REALLOCATE_STACK(source, osize, nsize) \
360 r_re_alloc (&failure_stack_ptr, (nsize))
361 # define REGEX_FREE_STACK(ptr) \
362 r_alloc_free (&failure_stack_ptr)
363
364 # else /* not using relocating allocator */
365
366 # ifdef REGEX_MALLOC
367
368 # define REGEX_ALLOCATE_STACK malloc
369 # define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
370 # define REGEX_FREE_STACK free
371
372 # else /* not REGEX_MALLOC */
373
374 # define REGEX_ALLOCATE_STACK alloca
375
376 # define REGEX_REALLOCATE_STACK(source, osize, nsize) \
377 REGEX_REALLOCATE (source, osize, nsize)
378 /* No need to explicitly free anything. */
379 # define REGEX_FREE_STACK(arg)
380
381 # endif /* not REGEX_MALLOC */
382 # endif /* not using relocating allocator */
383
384
/* True if `size1' is nonzero and PTR is pointing anywhere inside
   `string1' or just past its end.  This works if PTR is NULL, which is
   a good thing.  */
388 # define FIRST_STRING_P(ptr) \
389 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
390
391 /* (Re)Allocate N items of type T using malloc, or fail. */
392 # define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
393 # define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
394 # define RETALLOC_IF(addr, n, t) \
395 if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
396 # define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
397
398 # define BYTEWIDTH 8 /* In bits. */
399
400 # define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
401
402 # undef MAX
403 # undef MIN
404 # define MAX(a, b) ((a) > (b) ? (a) : (b))
405 # define MIN(a, b) ((a) < (b) ? (a) : (b))
406
407 typedef char boolean;
408 # define false 0
409 # define true 1
410
411 static reg_errcode_t byte_regex_compile (const char *pattern, size_t size,
412 reg_syntax_t syntax,
413 struct re_pattern_buffer *bufp);
414
415 static int byte_re_match_2_internal (struct re_pattern_buffer *bufp,
416 const char *string1, int size1,
417 const char *string2, int size2,
418 int pos,
419 struct re_registers *regs,
420 int stop);
421 static int byte_re_search_2 (struct re_pattern_buffer *bufp,
422 const char *string1, int size1,
423 const char *string2, int size2,
424 int startpos, int range,
425 struct re_registers *regs, int stop);
426 static int byte_re_compile_fastmap (struct re_pattern_buffer *bufp);
427
428 #ifdef MBS_SUPPORT
429 static reg_errcode_t wcs_regex_compile (const char *pattern, size_t size,
430 reg_syntax_t syntax,
431 struct re_pattern_buffer *bufp);
432
433
434 static int wcs_re_match_2_internal (struct re_pattern_buffer *bufp,
435 const char *cstring1, int csize1,
436 const char *cstring2, int csize2,
437 int pos,
438 struct re_registers *regs,
439 int stop,
440 wchar_t *string1, int size1,
441 wchar_t *string2, int size2,
442 int *mbs_offset1, int *mbs_offset2);
443 static int wcs_re_search_2 (struct re_pattern_buffer *bufp,
444 const char *string1, int size1,
445 const char *string2, int size2,
446 int startpos, int range,
447 struct re_registers *regs, int stop);
448 static int wcs_re_compile_fastmap (struct re_pattern_buffer *bufp);
449 #endif
450
451 /* These are the command codes that appear in compiled regular
453 expressions. Some opcodes are followed by argument bytes. A
454 command code can specify any interpretation whatsoever for its
455 arguments. Zero bytes may appear in the compiled regular expression. */
456
/* Opcodes of the compiled pattern.  The numeric values follow from the
   declaration order, so entries must not be reordered.  */
typedef enum
{
  no_op = 0,

  /* Succeed immediately; no further backtracking.  */
  succeed,

  /* One byte N follows, then N literal bytes to match.  */
  exactn,

# ifdef MBS_SUPPORT
  /* Like exactn, but the payload is binary data.  */
  exactn_bin,
# endif

  /* Match (more or less) any single character.  */
  anychar,

  /* Match one character that belongs to a set.  The next byte gives
     the number of bitmap bytes; the bitmap itself follows, with the
     bits of each byte ordered low-bit-first.  A character is a member
     if its bit is 1; a character too large to have a bit in the map
     is never a member.  */
  /* With MBS_SUPPORT the following elements are instead the lengths of
     the character classes, collating symbols, equivalence classes,
     character ranges and characters, followed in that order by the
     class, collating symbol, equivalence class, range and character
     elements themselves.  See the regex_compile function.  */
  charset,

  /* Takes the same parameters as charset, but matches any character
     that is NOT in the set.  */
  charset_not,

  /* Begin remembering matched text so it can be stored in a register.
     One byte with the register number follows (0 up to one less than
     the pattern buffer's re_nsub field), then one byte with the number
     of groups nested inside this one.  (The latter is stored with
     start_memory only because the on_failure_jump code of re_match_2
     needs it.)  */
  start_memory,

  /* Finish remembering matched text and store it in a memory register.
     One byte with the register number follows (0 up to one less than
     `re_nsub' in the pattern buffer), then one byte with the number of
     inner groups, exactly as for `start_memory'.  (The inner group
     count is repeated here because there is no easy way to locate the
     matching start_memory from a stop_memory.)  */
  stop_memory,

  /* Match a copy of previously remembered text.  One byte with the
     register number follows.  */
  duplicate,

  /* Fail unless positioned at the beginning of a line.  */
  begline,

  /* Fail unless positioned at the end of a line.  */
  endline,

  /* Succeed at the beginning of the buffer (under emacs) or at the
     beginning of the string being matched (otherwise).  */
  begbuf,

  /* Likewise, for the end of the buffer/string.  */
  endbuf,

  /* A two-byte relative address follows; jump to it.  */
  jump,

  /* Like jump, but also marks the end of an alternative.  */
  jump_past_alt,

  /* A two-byte relative address follows, naming the place to resume
     at if matching fails.  */
  /* With MBS_SUPPORT the address occupies 1 element.  */
  on_failure_jump,

  /* Like on_failure_jump, except that a placeholder is pushed in place
     of the current string position when executed.  */
  on_failure_keep_string_jump,

  /* Discard the most recent failure point, then jump to the two-byte
     relative address that follows.  */
  /* With MBS_SUPPORT the address occupies 1 element.  */
  pop_failure_jump,

  /* Becomes pop_failure_jump if we know no backtracking will be needed
     in order to match; otherwise becomes jump.  It is used to jump
     back to the start of a repeat.  When whatever follows this jump
     clearly cannot match what the repeat matches, backtracking out of
     repetitions already matched is pointless, so it is turned into a
     pop_failure_jump.  A two-byte address follows.  */
  /* With MBS_SUPPORT the address occupies 1 element.  */
  maybe_pop_jump,

  /* Jump to the two-byte address that follows and push a dummy failure
     point.  That failure point is discarded if an attempt is made to
     use it for a failure.  A `+' construct emits this before the first
     repeat.  It also serves as an intermediate kind of jump while an
     alternative is being compiled.  */
  /* With MBS_SUPPORT the address occupies 1 element.  */
  dummy_failure_jump,

  /* Push a dummy failure point and carry on.  Used at the end of
     alternatives.  */
  push_dummy_failure,

  /* A two-byte relative address and a two-byte number N follow.
     After matching N times, jump to the address upon failure.  */
  /* With MBS_SUPPORT the address occupies 1 element.  */
  succeed_n,

  /* A two-byte relative address and a two-byte number N follow.
     Jump to the address N times, then fail.  */
  /* With MBS_SUPPORT the address occupies 1 element.  */
  jump_n,

  /* Store the two-byte number that comes second at the two-byte
     relative address that comes first.  The address *includes* the
     two bytes of the number.  */
  /* With MBS_SUPPORT the address occupies 1 element.  */
  set_number_at,

  wordchar,     /* Match any word-constituent character.  */
  notwordchar,  /* Match any character that is not a word constituent.  */

  wordbeg,      /* Succeed at the beginning of a word.  */
  wordend,      /* Succeed at the end of a word.  */

  wordbound,    /* Succeed at a word boundary.  */
  notwordbound  /* Succeed anywhere except at a word boundary.  */

# ifdef emacs
  ,
  before_dot,   /* Succeed if before point.  */
  at_dot,       /* Succeed if at point.  */
  after_dot,    /* Succeed if after point.  */

  /* Match any character having the given syntax.  One byte holding a
     syntax code (e.g., Sword) follows.  */
  syntaxspec,

  /* Match any character whose syntax differs from the given one.  */
  notsyntaxspec
# endif /* emacs */
} re_opcode_t;
609 #endif /* not INSIDE_RECURSION */
610
611
613 #ifdef BYTE
614 # define CHAR_T char
615 # define UCHAR_T unsigned char
616 # define COMPILED_BUFFER_VAR bufp->buffer
617 # define OFFSET_ADDRESS_SIZE 2
618 # define PREFIX(name) byte_##name
619 # define ARG_PREFIX(name) name
620 # define PUT_CHAR(c) putchar (c)
621 #else
622 # ifdef WCHAR
623 # define CHAR_T wchar_t
624 # define UCHAR_T wchar_t
625 # define COMPILED_BUFFER_VAR wc_buffer
626 # define OFFSET_ADDRESS_SIZE 1 /* the size which STORE_NUMBER macro use */
627 # define CHAR_CLASS_SIZE ((__alignof__(wctype_t)+sizeof(wctype_t))/sizeof(CHAR_T)+1)
628 # define PREFIX(name) wcs_##name
629 # define ARG_PREFIX(name) c##name
630 /* Should we use wide stream?? */
631 # define PUT_CHAR(c) printf ("%C", c);
632 # define TRUE 1
633 # define FALSE 0
634 # else
635 # ifdef MBS_SUPPORT
636 # define WCHAR
637 # define INSIDE_RECURSION
638 # include "regex.c"
639 # undef INSIDE_RECURSION
640 # endif
641 # define BYTE
642 # define INSIDE_RECURSION
643 # include "regex.c"
644 # undef INSIDE_RECURSION
645 # endif
646 #endif
647
648 #ifdef INSIDE_RECURSION
649 /* Common operations on the compiled pattern. */
650
651 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */
652 /* ifdef MBS_SUPPORT, we store NUMBER in 1 element. */
653
654 # ifdef WCHAR
655 # define STORE_NUMBER(destination, number) \
656 do { \
657 *(destination) = (UCHAR_T)(number); \
658 } while (0)
659 # else /* BYTE */
660 # define STORE_NUMBER(destination, number) \
661 do { \
662 (destination)[0] = (number) & 0377; \
663 (destination)[1] = (number) >> 8; \
664 } while (0)
665 # endif /* WCHAR */
666
667 /* Same as STORE_NUMBER, except increment DESTINATION to
668 the byte after where the number is stored. Therefore, DESTINATION
669 must be an lvalue. */
670 /* ifdef MBS_SUPPORT, we store NUMBER in 1 element. */
671
672 # define STORE_NUMBER_AND_INCR(destination, number) \
673 do { \
674 STORE_NUMBER (destination, number); \
675 (destination) += OFFSET_ADDRESS_SIZE; \
676 } while (0)
677
678 /* Put into DESTINATION a number stored in two contiguous bytes starting
679 at SOURCE. */
680 /* ifdef MBS_SUPPORT, we store NUMBER in 1 element. */
681
# ifdef WCHAR
/* Wide-character build: the number occupies a single element.  */
#  define EXTRACT_NUMBER(destination, source) \
  do { \
    (destination) = *(source); \
  } while (0)
# else /* BYTE */
/* Byte build: SOURCE[0] is the low-order byte and SOURCE[1] is the
   sign-extended high-order byte.  The high byte is scaled by
   multiplication rather than `<< 8' because left-shifting a negative
   value is undefined behavior in ISO C; the result is the same.  */
#  define EXTRACT_NUMBER(destination, source) \
  do { \
    (destination) = *(source) & 0377; \
    (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) * (1 << 8); \
  } while (0)
# endif
694
695 # ifdef DEBUG
696 static void PREFIX(extract_number) (int *dest, UCHAR_T *source);
697 static void
698 PREFIX(extract_number) (int *dest, UCHAR_T *source)
699 {
700 # ifdef WCHAR
701 *dest = *source;
702 # else /* BYTE */
703 int temp = SIGN_EXTEND_CHAR (*(source + 1));
704 *dest = *source & 0377;
705 *dest += temp << 8;
706 # endif
707 }
708
709 # ifndef EXTRACT_MACROS /* To debug the macros. */
710 # undef EXTRACT_NUMBER
711 # define EXTRACT_NUMBER(dest, src) PREFIX(extract_number) (&dest, src)
712 # endif /* not EXTRACT_MACROS */
713
714 # endif /* DEBUG */
715
716 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
717 SOURCE must be an lvalue. */
718
719 # define EXTRACT_NUMBER_AND_INCR(destination, source) \
720 do { \
721 EXTRACT_NUMBER (destination, source); \
722 (source) += OFFSET_ADDRESS_SIZE; \
723 } while (0)
724
725 # ifdef DEBUG
726 static void PREFIX(extract_number_and_incr) (int *destination,
727 UCHAR_T **source);
728 static void
729 PREFIX(extract_number_and_incr) (int *destination, UCHAR_T **source)
730 {
731 PREFIX(extract_number) (destination, *source);
732 *source += OFFSET_ADDRESS_SIZE;
733 }
734
735 # ifndef EXTRACT_MACROS
736 # undef EXTRACT_NUMBER_AND_INCR
737 # define EXTRACT_NUMBER_AND_INCR(dest, src) \
738 PREFIX(extract_number_and_incr) (&dest, &src)
739 # endif /* not EXTRACT_MACROS */
740
741 # endif /* DEBUG */
742
743
744
746 /* If DEBUG is defined, Regex prints many voluminous messages about what
747 it is doing (if the variable `debug' is nonzero). If linked with the
748 main program in `iregex.c', you can enter patterns and strings
749 interactively. And if linked with the main program in `main.c' and
750 the other test files, you can run the already-written tests. */
751
752 # ifdef DEBUG
753
754 # ifndef DEFINED_ONCE
755
756 /* We use standard I/O for debugging. */
757 # include <stdio.h>
758
759 /* It is useful to test things that ``must'' be true when debugging. */
760 # include <assert.h>
761
762 static int debug;
763
764 # define DEBUG_STATEMENT(e) e
765 # define DEBUG_PRINT1(x) if (debug) printf (x)
766 # define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2)
767 # define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3)
768 # define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4)
769 # endif /* not DEFINED_ONCE */
770
771 # define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
772 if (debug) PREFIX(print_partial_compiled_pattern) (s, e)
773 # define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
774 if (debug) PREFIX(print_double_string) (w, s1, sz1, s2, sz2)
775
776
777 /* Print the fastmap in human-readable form. */
778
779 # ifndef DEFINED_ONCE
780 void
781 print_fastmap (char *fastmap)
782 {
783 unsigned was_a_range = 0;
784 unsigned i = 0;
785
786 while (i < (1 << BYTEWIDTH))
787 {
788 if (fastmap[i++])
789 {
790 was_a_range = 0;
791 putchar (i - 1);
792 while (i < (1 << BYTEWIDTH) && fastmap[i])
793 {
794 was_a_range = 1;
795 i++;
796 }
797 if (was_a_range)
798 {
799 printf ("-");
800 putchar (i - 1);
801 }
802 }
803 }
804 putchar ('\n');
805 }
806 # endif /* not DEFINED_ONCE */
807
808
809 /* Print a compiled pattern string in human-readable form, starting at
810 the START pointer into it and ending just before the pointer END. */
811
812 void
813 PREFIX(print_partial_compiled_pattern) (UCHAR_T *start, UCHAR_T *end)
814 {
815 int mcnt, mcnt2;
816 UCHAR_T *p1;
817 UCHAR_T *p = start;
818 UCHAR_T *pend = end;
819
820 if (start == NULL)
821 {
822 printf ("(null)\n");
823 return;
824 }
825
826 /* Loop over pattern commands. */
827 while (p < pend)
828 {
829 # ifdef _LIBC
830 printf ("%td:\t", p - start);
831 # else
832 printf ("%ld:\t", (long int) (p - start));
833 # endif
834
835 switch ((re_opcode_t) *p++)
836 {
837 case no_op:
838 printf ("/no_op");
839 break;
840
841 case exactn:
842 mcnt = *p++;
843 printf ("/exactn/%d", mcnt);
844 do
845 {
846 putchar ('/');
847 PUT_CHAR (*p++);
848 }
849 while (--mcnt);
850 break;
851
852 # ifdef MBS_SUPPORT
853 case exactn_bin:
854 mcnt = *p++;
855 printf ("/exactn_bin/%d", mcnt);
856 do
857 {
858 printf("/%lx", (long int) *p++);
859 }
860 while (--mcnt);
861 break;
862 # endif /* MBS_SUPPORT */
863
864 case start_memory:
865 mcnt = *p++;
866 printf ("/start_memory/%d/%ld", mcnt, (long int) *p++);
867 break;
868
869 case stop_memory:
870 mcnt = *p++;
871 printf ("/stop_memory/%d/%ld", mcnt, (long int) *p++);
872 break;
873
874 case duplicate:
875 printf ("/duplicate/%ld", (long int) *p++);
876 break;
877
878 case anychar:
879 printf ("/anychar");
880 break;
881
882 case charset:
883 case charset_not:
884 {
885 # ifdef WCHAR
886 int i, length;
887 wchar_t *workp = p;
888 printf ("/charset [%s",
889 (re_opcode_t) *(workp - 1) == charset_not ? "^" : "");
890 p += 5;
891 length = *workp++; /* the length of char_classes */
892 for (i=0 ; i<length ; i++)
893 printf("[:%lx:]", (long int) *p++);
894 length = *workp++; /* the length of collating_symbol */
895 for (i=0 ; i<length ;)
896 {
897 printf("[.");
898 while(*p != 0)
899 PUT_CHAR((i++,*p++));
900 i++,p++;
901 printf(".]");
902 }
903 length = *workp++; /* the length of equivalence_class */
904 for (i=0 ; i<length ;)
905 {
906 printf("[=");
907 while(*p != 0)
908 PUT_CHAR((i++,*p++));
909 i++,p++;
910 printf("=]");
911 }
912 length = *workp++; /* the length of char_range */
913 for (i=0 ; i<length ; i++)
914 {
915 wchar_t range_start = *p++;
916 wchar_t range_end = *p++;
917 printf("%C-%C", range_start, range_end);
918 }
919 length = *workp++; /* the length of char */
920 for (i=0 ; i<length ; i++)
921 printf("%C", *p++);
922 putchar (']');
923 # else
924 register int c, last = -100;
925 register int in_range = 0;
926
927 printf ("/charset [%s",
928 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
929
930 assert (p + *p < pend);
931
932 for (c = 0; c < 256; c++)
933 if (c / 8 < *p
934 && (p[1 + (c/8)] & (1 << (c % 8))))
935 {
936 /* Are we starting a range? */
937 if (last + 1 == c && ! in_range)
938 {
939 putchar ('-');
940 in_range = 1;
941 }
942 /* Have we broken a range? */
943 else if (last + 1 != c && in_range)
944 {
945 putchar (last);
946 in_range = 0;
947 }
948
949 if (! in_range)
950 putchar (c);
951
952 last = c;
953 }
954
955 if (in_range)
956 putchar (last);
957
958 putchar (']');
959
960 p += 1 + *p;
961 # endif /* WCHAR */
962 }
963 break;
964
965 case begline:
966 printf ("/begline");
967 break;
968
969 case endline:
970 printf ("/endline");
971 break;
972
973 case on_failure_jump:
974 PREFIX(extract_number_and_incr) (&mcnt, &p);
975 # ifdef _LIBC
976 printf ("/on_failure_jump to %td", p + mcnt - start);
977 # else
978 printf ("/on_failure_jump to %ld", (long int) (p + mcnt - start));
979 # endif
980 break;
981
982 case on_failure_keep_string_jump:
983 PREFIX(extract_number_and_incr) (&mcnt, &p);
984 # ifdef _LIBC
985 printf ("/on_failure_keep_string_jump to %td", p + mcnt - start);
986 # else
987 printf ("/on_failure_keep_string_jump to %ld",
988 (long int) (p + mcnt - start));
989 # endif
990 break;
991
992 case dummy_failure_jump:
993 PREFIX(extract_number_and_incr) (&mcnt, &p);
994 # ifdef _LIBC
995 printf ("/dummy_failure_jump to %td", p + mcnt - start);
996 # else
997 printf ("/dummy_failure_jump to %ld", (long int) (p + mcnt - start));
998 # endif
999 break;
1000
1001 case push_dummy_failure:
1002 printf ("/push_dummy_failure");
1003 break;
1004
1005 case maybe_pop_jump:
1006 PREFIX(extract_number_and_incr) (&mcnt, &p);
1007 # ifdef _LIBC
1008 printf ("/maybe_pop_jump to %td", p + mcnt - start);
1009 # else
1010 printf ("/maybe_pop_jump to %ld", (long int) (p + mcnt - start));
1011 # endif
1012 break;
1013
1014 case pop_failure_jump:
1015 PREFIX(extract_number_and_incr) (&mcnt, &p);
1016 # ifdef _LIBC
1017 printf ("/pop_failure_jump to %td", p + mcnt - start);
1018 # else
1019 printf ("/pop_failure_jump to %ld", (long int) (p + mcnt - start));
1020 # endif
1021 break;
1022
1023 case jump_past_alt:
1024 PREFIX(extract_number_and_incr) (&mcnt, &p);
1025 # ifdef _LIBC
1026 printf ("/jump_past_alt to %td", p + mcnt - start);
1027 # else
1028 printf ("/jump_past_alt to %ld", (long int) (p + mcnt - start));
1029 # endif
1030 break;
1031
1032 case jump:
1033 PREFIX(extract_number_and_incr) (&mcnt, &p);
1034 # ifdef _LIBC
1035 printf ("/jump to %td", p + mcnt - start);
1036 # else
1037 printf ("/jump to %ld", (long int) (p + mcnt - start));
1038 # endif
1039 break;
1040
1041 case succeed_n:
1042 PREFIX(extract_number_and_incr) (&mcnt, &p);
1043 p1 = p + mcnt;
1044 PREFIX(extract_number_and_incr) (&mcnt2, &p);
1045 # ifdef _LIBC
1046 printf ("/succeed_n to %td, %d times", p1 - start, mcnt2);
1047 # else
1048 printf ("/succeed_n to %ld, %d times",
1049 (long int) (p1 - start), mcnt2);
1050 # endif
1051 break;
1052
1053 case jump_n:
1054 PREFIX(extract_number_and_incr) (&mcnt, &p);
1055 p1 = p + mcnt;
1056 PREFIX(extract_number_and_incr) (&mcnt2, &p);
1057 printf ("/jump_n to %d, %d times", p1 - start, mcnt2);
1058 break;
1059
1060 case set_number_at:
1061 PREFIX(extract_number_and_incr) (&mcnt, &p);
1062 p1 = p + mcnt;
1063 PREFIX(extract_number_and_incr) (&mcnt2, &p);
1064 # ifdef _LIBC
1065 printf ("/set_number_at location %td to %d", p1 - start, mcnt2);
1066 # else
1067 printf ("/set_number_at location %ld to %d",
1068 (long int) (p1 - start), mcnt2);
1069 # endif
1070 break;
1071
1072 case wordbound:
1073 printf ("/wordbound");
1074 break;
1075
1076 case notwordbound:
1077 printf ("/notwordbound");
1078 break;
1079
1080 case wordbeg:
1081 printf ("/wordbeg");
1082 break;
1083
1084 case wordend:
1085 printf ("/wordend");
1086 break;
1087
1088 # ifdef emacs
1089 case before_dot:
1090 printf ("/before_dot");
1091 break;
1092
1093 case at_dot:
1094 printf ("/at_dot");
1095 break;
1096
1097 case after_dot:
1098 printf ("/after_dot");
1099 break;
1100
1101 case syntaxspec:
1102 printf ("/syntaxspec");
1103 mcnt = *p++;
1104 printf ("/%d", mcnt);
1105 break;
1106
1107 case notsyntaxspec:
1108 printf ("/notsyntaxspec");
1109 mcnt = *p++;
1110 printf ("/%d", mcnt);
1111 break;
1112 # endif /* emacs */
1113
1114 case wordchar:
1115 printf ("/wordchar");
1116 break;
1117
1118 case notwordchar:
1119 printf ("/notwordchar");
1120 break;
1121
1122 case begbuf:
1123 printf ("/begbuf");
1124 break;
1125
1126 case endbuf:
1127 printf ("/endbuf");
1128 break;
1129
1130 default:
1131 printf ("?%ld", (long int) *(p-1));
1132 }
1133
1134 putchar ('\n');
1135 }
1136
1137 # ifdef _LIBC
1138 printf ("%td:\tend of pattern.\n", p - start);
1139 # else
1140 printf ("%ld:\tend of pattern.\n", (long int) (p - start));
1141 # endif
1142 }
1143
1144
1145 void
1146 PREFIX(print_compiled_pattern) (struct re_pattern_buffer *bufp)
1147 {
1148 UCHAR_T *buffer = (UCHAR_T*) bufp->buffer;
1149
1150 PREFIX(print_partial_compiled_pattern) (buffer, buffer
1151 + bufp->used / sizeof(UCHAR_T));
1152 printf ("%ld bytes used/%ld bytes allocated.\n",
1153 bufp->used, bufp->allocated);
1154
1155 if (bufp->fastmap_accurate && bufp->fastmap)
1156 {
1157 printf ("fastmap: ");
1158 print_fastmap (bufp->fastmap);
1159 }
1160
1161 # ifdef _LIBC
1162 printf ("re_nsub: %Zd\t", bufp->re_nsub);
1163 # else
1164 printf ("re_nsub: %ld\t", (long int) bufp->re_nsub);
1165 # endif
1166 printf ("regs_alloc: %d\t", bufp->regs_allocated);
1167 printf ("can_be_null: %d\t", bufp->can_be_null);
1168 printf ("newline_anchor: %d\n", bufp->newline_anchor);
1169 printf ("no_sub: %d\t", bufp->no_sub);
1170 printf ("not_bol: %d\t", bufp->not_bol);
1171 printf ("not_eol: %d\t", bufp->not_eol);
1172 printf ("syntax: %lx\n", bufp->syntax);
1173 /* Perhaps we should print the translate table? */
1174 }
1175
1176
1177 void
1178 PREFIX(print_double_string) (const CHAR_T *where, const CHAR_T *string1,
1179 int size1, const CHAR_T *string2, int size2)
1180 {
1181 int this_char;
1182
1183 if (where == NULL)
1184 printf ("(null)");
1185 else
1186 {
1187 int cnt;
1188
1189 if (FIRST_STRING_P (where))
1190 {
1191 for (this_char = where - string1; this_char < size1; this_char++)
1192 PUT_CHAR (string1[this_char]);
1193
1194 where = string2;
1195 }
1196
1197 cnt = 0;
1198 for (this_char = where - string2; this_char < size2; this_char++)
1199 {
1200 PUT_CHAR (string2[this_char]);
1201 if (++cnt > 100)
1202 {
1203 fputs ("...", stdout);
1204 break;
1205 }
1206 }
1207 }
1208 }
1209
1210 # ifndef DEFINED_ONCE
/* Write the single character C to stderr.  */
void
printchar (int c)
{
  fputc (c, stderr);
}
1216 # endif
1217
1218 # else /* not DEBUG */
1219
/* Without DEBUG every debugging hook expands to nothing.  */
#  ifndef DEFINED_ONCE
/* Assertions are compiled out as well.  */
#   undef assert
#   define assert(e)

#   define DEBUG_STATEMENT(e)
#   define DEBUG_PRINT1(x)
#   define DEBUG_PRINT2(x1, x2)
#   define DEBUG_PRINT3(x1, x2, x3)
#   define DEBUG_PRINT4(x1, x2, x3, x4)
#  endif /* not DEFINED_ONCE */

/* These two sit outside the DEFINED_ONCE guard, so they are defined
   each time this section is processed.  */
#  define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
#  define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
1232
1233 # endif /* not DEBUG */
1234
1235
1236
1238 # ifdef WCHAR
1239 /* This convert a multibyte string to a wide character string.
1240 And write their correspondances to offset_buffer(see below)
1241 and write whether each wchar_t is binary data to is_binary.
1242 This assume invalid multibyte sequences as binary data.
1243 We assume offset_buffer and is_binary is already allocated
1244 enough space. */
1245
1246 static size_t convert_mbs_to_wcs (CHAR_T *dest, const unsigned char* src,
1247 size_t len, int *offset_buffer,
1248 char *is_binary);
1249 static size_t
1250 convert_mbs_to_wcs (CHAR_T *dest, const unsigned char*src, size_t len,
1251 int *offset_buffer, char *is_binary)
1252 /* It hold correspondances between src(char string) and
1253 dest(wchar_t string) for optimization.
1254 e.g. src = "xxxyzz"
1255 dest = {'X', 'Y', 'Z'}
1256 (each "xxx", "y" and "zz" represent one multibyte character
1257 corresponding to 'X', 'Y' and 'Z'.)
1258 offset_buffer = {0, 0+3("xxx"), 0+3+1("y"), 0+3+1+2("zz")}
1259 = {0, 3, 4, 6}
1260 */
1261 {
1262 wchar_t *pdest = dest;
1263 const unsigned char *psrc = src;
1264 size_t wc_count = 0;
1265
1266 mbstate_t mbs;
1267 int i, consumed;
1268 size_t mb_remain = len;
1269 size_t mb_count = 0;
1270
1271 /* Initialize the conversion state. */
1272 memset (&mbs, 0, sizeof (mbstate_t));
1273
1274 offset_buffer[0] = 0;
1275 for( ; mb_remain > 0 ; ++wc_count, ++pdest, mb_remain -= consumed,
1276 psrc += consumed)
1277 {
1278 #ifdef _LIBC
1279 consumed = __mbrtowc (pdest, psrc, mb_remain, &mbs);
1280 #else
1281 consumed = mbrtowc (pdest, psrc, mb_remain, &mbs);
1282 #endif
1283
1284 if (consumed <= 0)
1285 /* failed to convert. maybe src contains binary data.
1286 So we consume 1 byte manualy. */
1287 {
1288 *pdest = *psrc;
1289 consumed = 1;
1290 is_binary[wc_count] = TRUE;
1291 }
1292 else
1293 is_binary[wc_count] = FALSE;
1294 /* In sjis encoding, we use yen sign as escape character in
1295 place of reverse solidus. So we convert 0x5c(yen sign in
1296 sjis) to not 0xa5(yen sign in UCS2) but 0x5c(reverse
1297 solidus in UCS2). */
1298 if (consumed == 1 && (int) *psrc == 0x5c && (int) *pdest == 0xa5)
1299 *pdest = (wchar_t) *psrc;
1300
1301 offset_buffer[wc_count + 1] = mb_count += consumed;
1302 }
1303
1304 /* Fill remain of the buffer with sentinel. */
1305 for (i = wc_count + 1 ; i <= len ; i++)
1306 offset_buffer[i] = mb_count + 1;
1307
1308 return wc_count;
1309 }
1310
1311 # endif /* WCHAR */
1312
1313 #else /* not INSIDE_RECURSION */
1314
1315 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1316 also be assigned to arbitrarily: each pattern buffer stores its own
1317 syntax, so it can be changed between regex compilations. */
1318 /* This has no initializer because initialized variables in Emacs
1319 become read-only after dumping. */
1320 reg_syntax_t re_syntax_options;
1321
1322
1323 /* Specify the precise syntax of regexps for compilation. This provides
1324 for compatibility for various utilities which historically have
1325 different, incompatible syntaxes.
1326
1327 The argument SYNTAX is a bit mask comprised of the various bits
1328 defined in regex.h. We return the old syntax. */
1329
1330 reg_syntax_t
1331 re_set_syntax (reg_syntax_t syntax)
1332 {
1333 reg_syntax_t ret = re_syntax_options;
1334
1335 re_syntax_options = syntax;
1336 # ifdef DEBUG
1337 if (syntax & RE_DEBUG)
1338 debug = 1;
1339 else if (debug) /* was on but now is not */
1340 debug = 0;
1341 # endif /* DEBUG */
1342 return ret;
1343 }
1344 # ifdef _LIBC
1345 weak_alias (__re_set_syntax, re_set_syntax)
1346 # endif
1347
1348 /* This table gives an error message for each of the error codes listed
1350 in regex.h. Obviously the order here has to be same as there.
1351 POSIX doesn't require that we do anything for REG_NOERROR,
1352 but why not be nice? */
1353
1354 static const char *re_error_msgid[] =
1355 {
1356 gettext_noop ("Success"), /* REG_NOERROR */
1357 gettext_noop ("No match"), /* REG_NOMATCH */
1358 gettext_noop ("Invalid regular expression"), /* REG_BADPAT */
1359 gettext_noop ("Invalid collation character"), /* REG_ECOLLATE */
1360 gettext_noop ("Invalid character class name"), /* REG_ECTYPE */
1361 gettext_noop ("Trailing backslash"), /* REG_EESCAPE */
1362 gettext_noop ("Invalid back reference"), /* REG_ESUBREG */
1363 gettext_noop ("Unmatched [ or [^"), /* REG_EBRACK */
1364 gettext_noop ("Unmatched ( or \\("), /* REG_EPAREN */
1365 gettext_noop ("Unmatched \\{"), /* REG_EBRACE */
1366 gettext_noop ("Invalid content of \\{\\}"), /* REG_BADBR */
1367 gettext_noop ("Invalid range end"), /* REG_ERANGE */
1368 gettext_noop ("Memory exhausted"), /* REG_ESPACE */
1369 gettext_noop ("Invalid preceding regular expression"), /* REG_BADRPT */
1370 gettext_noop ("Premature end of regular expression"), /* REG_EEND */
1371 gettext_noop ("Regular expression too big"), /* REG_ESIZE */
1372 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
1373 };
1374
1375 #endif /* INSIDE_RECURSION */
1377
#ifndef DEFINED_ONCE
/* Avoiding alloca during matching, to placate r_alloc.  */

/* MATCH_MAY_ALLOCATE is defined unless the searching and matching
   functions must be kept from calling alloca.  On some systems alloca
   is implemented on top of malloc; with the relocating allocator
   routines in use, malloc could then trigger a relocation which might
   (if the strings being searched live in the ralloc heap) move the data
   out from underneath the regexp routines.

   A second reason to avoid allocation: Emacs processes input from X in
   a signal handler, and processing X input may call malloc.  If input
   arrives while a matching routine is itself inside malloc, we are
   scrod.  Emacs cannot simply block input while matching, because then
   interrupts would go unnoticed.  So Emacs blocks input around all
   regexp calls except the matching calls, which it leaves unprotected
   in the faith that they will not malloc.  */

/* The normal case: allocation is fine.  */
# define MATCH_MAY_ALLOCATE

/* With GNU C we are not REALLY using the C alloca, whatever config.h
   may claim, so no precautions are needed for it.  */
# ifdef __GNUC__
#  undef C_ALLOCA
# endif

/* The match routines may not allocate when (1) they would do so with
   malloc and (2) it is not safe for them to use malloc.
   Note that with REL_ALLOC defined, matching would not use malloc for
   the failure stack but would still use it for the register vectors,
   so REL_ALLOC should not affect this.  */
# if defined emacs && (defined C_ALLOCA || defined REGEX_MALLOC)
#  undef MATCH_MAY_ALLOCATE
# endif
#endif /* not DEFINED_ONCE */
1416
1417 #ifdef INSIDE_RECURSION
/* Failure stack declarations and macros.  Both re_compile_fastmap and
   re_match_2 use a failure stack.  These must be macros because of
   REGEX_ALLOCATE_STACK.  */


/* How many failure-stack slots to allocate initially when matching.
   More space is allocated once this is exceeded, so it is not a hard
   limit.  A definition supplied earlier takes precedence.  */
# ifndef INIT_FAILURE_ALLOC
#  define INIT_FAILURE_ALLOC 5
# endif
1430
1431 /* Roughly the maximum number of failure points on the stack. Would be
1432 exactly that if always used MAX_FAILURE_ITEMS items each time we failed.
1433 This is a variable only so users of regex can assign to it; we never
1434 change it ourselves. */
1435
1436 # ifdef INT_IS_16BIT
1437
1438 # ifndef DEFINED_ONCE
1439 # if defined MATCH_MAY_ALLOCATE
1440 /* 4400 was enough to cause a crash on Alpha OSF/1,
1441 whose default stack limit is 2mb. */
1442 long int re_max_failures = 4000;
1443 # else
1444 long int re_max_failures = 2000;
1445 # endif
1446 # endif
1447
1448 union PREFIX(fail_stack_elt)
1449 {
1450 UCHAR_T *pointer;
1451 long int integer;
1452 };
1453
1454 typedef union PREFIX(fail_stack_elt) PREFIX(fail_stack_elt_t);
1455
1456 typedef struct
1457 {
1458 PREFIX(fail_stack_elt_t) *stack;
1459 unsigned long int size;
1460 unsigned long int avail; /* Offset of next open position. */
1461 } PREFIX(fail_stack_type);
1462
1463 # else /* not INT_IS_16BIT */
1464
1465 # ifndef DEFINED_ONCE
1466 # if defined MATCH_MAY_ALLOCATE
1467 /* 4400 was enough to cause a crash on Alpha OSF/1,
1468 whose default stack limit is 2mb. */
1469 int re_max_failures = 4000;
1470 # else
1471 int re_max_failures = 2000;
1472 # endif
1473 # endif
1474
1475 union PREFIX(fail_stack_elt)
1476 {
1477 UCHAR_T *pointer;
1478 int integer;
1479 };
1480
1481 typedef union PREFIX(fail_stack_elt) PREFIX(fail_stack_elt_t);
1482
1483 typedef struct
1484 {
1485 PREFIX(fail_stack_elt_t) *stack;
1486 unsigned size;
1487 unsigned avail; /* Offset of next open position. */
1488 } PREFIX(fail_stack_type);
1489
1490 # endif /* INT_IS_16BIT */
1491
/* Simple state tests; they operate on the variable `fail_stack' (or on
   the pointer `fail_stack_ptr').  */
# ifndef DEFINED_ONCE
#  define FAIL_STACK_EMPTY()     (0 == fail_stack.avail)
#  define FAIL_STACK_PTR_EMPTY() (0 == fail_stack_ptr->avail)
#  define FAIL_STACK_FULL()      (fail_stack.size == fail_stack.avail)
# endif


/* Macros that set up and release the failure stack.
   INIT_FAIL_STACK does `return -2' from the enclosing function if the
   allocation fails.  */

# ifdef MATCH_MAY_ALLOCATE
#  define INIT_FAIL_STACK()						\
  do {									\
    fail_stack.stack = (PREFIX(fail_stack_elt_t) *)			\
      REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC				\
			    * sizeof (PREFIX(fail_stack_elt_t)));	\
									\
    if (!fail_stack.stack)						\
      return -2;							\
									\
    fail_stack.avail = 0;						\
    fail_stack.size = INIT_FAILURE_ALLOC;				\
  } while (0)

#  define RESET_FAIL_STACK()  REGEX_FREE_STACK (fail_stack.stack)
# else
/* No allocation is done here; only the fill level is reset.  */
#  define INIT_FAIL_STACK()						\
  do {									\
    fail_stack.avail = 0;						\
  } while (0)

#  define RESET_FAIL_STACK()
# endif
1524
1525
/* Grow FAIL_STACK to twice its size, up to approximately
   `re_max_failures' items.

   The value is 1 on success, and 0 when the stack was already too
   large or the reallocation returned a null pointer.

   REGEX_REALLOCATE_STACK requires `destination' be declared.  */

# define DOUBLE_FAIL_STACK(fail_stack)					\
  ((fail_stack).size > (unsigned) (re_max_failures * MAX_FAILURE_ITEMS)	\
   ? 0									\
   : ((fail_stack).stack = (PREFIX(fail_stack_elt_t) *)			\
	REGEX_REALLOCATE_STACK ((fail_stack).stack,			\
	  (fail_stack).size * sizeof (PREFIX(fail_stack_elt_t)),	\
	  ((fail_stack).size * 2) * sizeof (PREFIX(fail_stack_elt_t))),	\
									\
      (fail_stack).stack != NULL					\
      ? ((fail_stack).size *= 2,					\
	 1)								\
      : 0))
1545
1546
/* Store POINTER in the next free slot of FAIL_STACK, doubling the stack
   first when it is full.
   The value is 1 when the pointer was pushed and 0 when the stack was
   full and could not be doubled.  */
# define PUSH_PATTERN_OP(POINTER, FAIL_STACK)				\
  ((!FAIL_STACK_FULL ()							\
    || DOUBLE_FAIL_STACK (FAIL_STACK))					\
   ? ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER,	\
      1)								\
   : 0)
1556
/* The three PUSH_FAILURE_... macros below store one item in the next
   free slot of the variable `fail_stack' and advance `avail'.  None of
   them checks for room.  They should probably only be used from within
   `PUSH_FAILURE_POINT'.  */

/* Push ITEM as a pointer value.  */
# define PUSH_FAILURE_POINTER(item)					\
  (fail_stack.stack[fail_stack.avail++].pointer = (UCHAR_T *) (item))

/* Push ITEM as an integer value.  */
# define PUSH_FAILURE_INT(item)						\
  (fail_stack.stack[fail_stack.avail++].integer = (item))

/* Push ITEM, a complete fail_stack_elt_t.  */
# define PUSH_FAILURE_ELT(item)						\
  (fail_stack.stack[fail_stack.avail++] = (item))

/* The matching POP_... operations.  All of them assume that
   `fail_stack' is nonempty.  */
# define POP_FAILURE_POINTER() (fail_stack.stack[--fail_stack.avail].pointer)
# define POP_FAILURE_INT() (fail_stack.stack[--fail_stack.avail].integer)
# define POP_FAILURE_ELT() (fail_stack.stack[--fail_stack.avail])

/* Failure point ids are pushed and popped only when debugging.  */
# ifdef DEBUG
#  define DEBUG_PUSH PUSH_FAILURE_INT
#  define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_INT ()
# else
#  define DEBUG_PUSH(item)
#  define DEBUG_POP(item_addr)
# endif
1589
1590
/* Save on the failure stack everything needed to resume matching from
   this point should we later fail back to it.  Pushed, in order: for
   each register from lowest_active_reg to highest_active_reg its start,
   end and info word; then lowest_active_reg, highest_active_reg,
   PATTERN_PLACE and STRING_PLACE; and, when debugging, the failure id.

   Requires variables fail_stack, regstart, regend, reg_info, and
   num_regs_pushed be declared.  DOUBLE_FAIL_STACK requires
   `destination' be declared, which this macro does itself.

   Does `return FAILURE_CODE' if the stack cannot be grown.  */

# define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code)	\
  do {									\
    /* For REGEX_REALLOCATE_STACK, via DOUBLE_FAIL_STACK.  */		\
    char *destination;							\
    /* Not an int: nothing guarantees that int is wide enough for the	\
       register numbers assigned to it.  */				\
    active_reg_t this_reg;						\
									\
    DEBUG_STATEMENT (failure_id++);					\
    DEBUG_STATEMENT (nfailure_points_pushed++);				\
    DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id);		\
    DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail);\
    DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\
									\
    DEBUG_PRINT2 (" slots needed: %ld\n", NUM_FAILURE_ITEMS);		\
    DEBUG_PRINT2 (" available: %d\n", REMAINING_AVAIL_SLOTS);	\
									\
    /* Grow the stack until everything we are about to push fits.  */	\
    while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS)			\
      {									\
	if (!DOUBLE_FAIL_STACK (fail_stack))				\
	  return failure_code;						\
									\
	DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n",		\
		      (fail_stack).size);				\
	DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\
      }									\
									\
    /* The registers go first.  */					\
    DEBUG_PRINT1 ("\n");						\
									\
    for (this_reg = lowest_active_reg; this_reg <= highest_active_reg;	\
	 this_reg++)							\
      {									\
	DEBUG_PRINT2 (" Pushing reg: %lu\n", this_reg);		\
	DEBUG_STATEMENT (num_regs_pushed++);				\
									\
	DEBUG_PRINT2 (" start: %p\n", regstart[this_reg]);		\
	PUSH_FAILURE_POINTER (regstart[this_reg]);			\
									\
	DEBUG_PRINT2 (" end: %p\n", regend[this_reg]);		\
	PUSH_FAILURE_POINTER (regend[this_reg]);			\
									\
	DEBUG_PRINT2 (" info: %p\n ",				\
		      reg_info[this_reg].word.pointer);			\
	DEBUG_PRINT2 (" match_null=%d",					\
		      REG_MATCH_NULL_STRING_P (reg_info[this_reg]));	\
	DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg]));	\
	DEBUG_PRINT2 (" matched_something=%d",				\
		      MATCHED_SOMETHING (reg_info[this_reg]));		\
	DEBUG_PRINT2 (" ever_matched=%d",				\
		      EVER_MATCHED_SOMETHING (reg_info[this_reg]));	\
	DEBUG_PRINT1 ("\n");						\
	PUSH_FAILURE_ELT (reg_info[this_reg].word);			\
      }									\
									\
    DEBUG_PRINT2 (" Pushing low active reg: %ld\n", lowest_active_reg);\
    PUSH_FAILURE_INT (lowest_active_reg);				\
									\
    DEBUG_PRINT2 (" Pushing high active reg: %ld\n", highest_active_reg);\
    PUSH_FAILURE_INT (highest_active_reg);				\
									\
    DEBUG_PRINT2 (" Pushing pattern %p:\n", pattern_place);		\
    DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend);		\
    PUSH_FAILURE_POINTER (pattern_place);				\
									\
    DEBUG_PRINT2 (" Pushing string %p: `", string_place);		\
    DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2,	\
			       size2);					\
    DEBUG_PRINT1 ("'\n");						\
    PUSH_FAILURE_POINTER (string_place);				\
									\
    DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id);		\
    DEBUG_PUSH (failure_id);						\
  } while (0)
1678
# ifndef DEFINED_ONCE
/* Number of stack items pushed and popped for each register.  */
#  define NUM_REG_ITEMS  3

/* Number of items pushed besides the registers.  */
#  ifdef DEBUG
#   define NUM_NONREG_ITEMS 5 /* Includes failure point id.  */
#  else
#   define NUM_NONREG_ITEMS 4
#  endif

/* Upper bound on the items pushed per failure point.
   This used to be based on (num_regs - 1), the number of registers the
   regexp will save, but was changed to 5 to avoid stack overflow for a
   regexp with lots of parens.  */
#  define MAX_FAILURE_ITEMS (5 * NUM_REG_ITEMS + NUM_NONREG_ITEMS)

/* Number of items actually pushed for the currently active registers.  */
#  define NUM_FAILURE_ITEMS						\
  (((highest_active_reg - lowest_active_reg + 1)			\
    * NUM_REG_ITEMS)							\
   + NUM_NONREG_ITEMS)

/* Number of free slots left on the stack before it overflows.  */
#  define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
# endif /* not DEFINED_ONCE */
1707
1708
/* Pops what PUSH_FAILURE_POINT pushes, in the reverse order.

   We restore into the parameters, all of which should be lvalues:
     STR -- the saved data position (left untouched when the saved
	    value is NULL).
     PAT -- the saved pattern position.
     LOW_REG, HIGH_REG -- the lowest and highest active registers.
     REGSTART, REGEND -- arrays of string positions.
     REG_INFO -- array of information about each subexpression.

   Also assumes the variables `fail_stack' and `set_regs_matched_done',
   and (if debugging) `bufp', `pend', `string1', `size1', `string2',
   and `size2'.  */
# define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info)\
{									\
  DEBUG_STATEMENT (unsigned failure_id;)				\
  active_reg_t this_reg;						\
  const UCHAR_T *string_temp;						\
									\
  assert (!FAIL_STACK_EMPTY ());					\
									\
  /* Remove failure points and point to how many regs pushed.  */	\
  DEBUG_PRINT1 ("POP_FAILURE_POINT:\n");				\
  DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail);	\
  DEBUG_PRINT2 (" size: %d\n", fail_stack.size);	\
									\
  assert (fail_stack.avail >= NUM_NONREG_ITEMS);			\
									\
  DEBUG_POP (&failure_id);						\
  DEBUG_PRINT2 (" Popping failure id: %u\n", failure_id);		\
									\
  /* A NULL saved string position came from an				\
     on_failure_keep_string_jump opcode.  Throw the saved NULL away,	\
     thus keeping the current position in the string.  */		\
  string_temp = POP_FAILURE_POINTER ();					\
  if (string_temp != NULL)						\
    str = (const CHAR_T *) string_temp;					\
									\
  DEBUG_PRINT2 (" Popping string %p: `", str);			\
  DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2);	\
  DEBUG_PRINT1 ("'\n");							\
									\
  pat = (UCHAR_T *) POP_FAILURE_POINTER ();				\
  DEBUG_PRINT2 (" Popping pattern %p:\n", pat);			\
  DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend);			\
									\
  /* Restore register info.  */						\
  high_reg = (active_reg_t) POP_FAILURE_INT ();				\
  DEBUG_PRINT2 (" Popping high active reg: %ld\n", high_reg);		\
									\
  low_reg = (active_reg_t) POP_FAILURE_INT ();				\
  DEBUG_PRINT2 (" Popping low active reg: %ld\n", low_reg);		\
									\
  /* Registers come off in the reverse of the order pushed.  */		\
  for (this_reg = high_reg; this_reg >= low_reg; this_reg--)		\
    {									\
      DEBUG_PRINT2 (" Popping reg: %ld\n", this_reg);		\
									\
      reg_info[this_reg].word = POP_FAILURE_ELT ();			\
      DEBUG_PRINT2 (" info: %p\n",				\
		    reg_info[this_reg].word.pointer);			\
									\
      regend[this_reg] = (const CHAR_T *) POP_FAILURE_POINTER ();	\
      DEBUG_PRINT2 (" end: %p\n", regend[this_reg]);		\
									\
      regstart[this_reg] = (const CHAR_T *) POP_FAILURE_POINTER ();	\
      DEBUG_PRINT2 (" start: %p\n", regstart[this_reg]);		\
    }									\
									\
  set_regs_matched_done = 0;						\
  DEBUG_STATEMENT (nfailure_points_popped++);				\
} /* POP_FAILURE_POINT */
1789
1790 /* Structure for per-register (a.k.a. per-group) information.
1792 Other register information, such as the
1793 starting and ending positions (which are addresses), and the list of
1794 inner groups (which is a bits list) are maintained in separate
1795 variables.
1796
1797 We are making a (strictly speaking) nonportable assumption here: that
1798 the compiler will pack our bit fields into something that fits into
1799 the type of `word', i.e., is something that fits into one item on the
1800 failure stack. */
1801
1802
1803 /* Declarations and macros for re_match_2. */
1804
1805 typedef union
1806 {
1807 PREFIX(fail_stack_elt_t) word;
1808 struct
1809 {
1810 /* This field is one if this group can match the empty string,
1811 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */
1812 # define MATCH_NULL_UNSET_VALUE 3
1813 unsigned match_null_string_p : 2;
1814 unsigned is_active : 1;
1815 unsigned matched_something : 1;
1816 unsigned ever_matched_something : 1;
1817 } bits;
1818 } PREFIX(register_info_type);
1819
# ifndef DEFINED_ONCE
/* Accessors for the flag bits of a register_info_type R.  */
#  define REG_MATCH_NULL_STRING_P(R)  ((R).bits.match_null_string_p)
#  define IS_ACTIVE(R)  ((R).bits.is_active)
#  define MATCHED_SOMETHING(R)  ((R).bits.matched_something)
#  define EVER_MATCHED_SOMETHING(R)  ((R).bits.ever_matched_something)


/* To be called once a real character has been matched.  Sets both the
   `matched_something' and `ever_matched_something' flags of every
   register from lowest_active_reg to highest_active_reg, i.e. of the
   subexpressions we are currently inside.  Nothing is done while
   `set_regs_matched_done' is still set from an earlier call.  */
#  define SET_REGS_MATCHED()						\
  do									\
    {									\
      if (!set_regs_matched_done)					\
	{								\
	  active_reg_t this_reg;					\
	  set_regs_matched_done = 1;					\
	  for (this_reg = lowest_active_reg;				\
	       this_reg <= highest_active_reg;				\
	       this_reg++)						\
	    {								\
	      EVER_MATCHED_SOMETHING (reg_info[this_reg]) = 1;		\
	      MATCHED_SOMETHING (reg_info[this_reg]) = 1;		\
	    }								\
	}								\
    }									\
  while (0)
# endif /* not DEFINED_ONCE */
1847
1848 /* Registers are set to a sentinel when they haven't yet matched. */
1849 static CHAR_T PREFIX(reg_unset_dummy);
1850 # define REG_UNSET_VALUE (&PREFIX(reg_unset_dummy))
1851 # define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
1852
1853 /* Subroutine declarations and macros for regex_compile. */
1854 static void PREFIX(store_op1) (re_opcode_t op, UCHAR_T *loc, int arg);
1855 static void PREFIX(store_op2) (re_opcode_t op, UCHAR_T *loc,
1856 int arg1, int arg2);
1857 static void PREFIX(insert_op1) (re_opcode_t op, UCHAR_T *loc,
1858 int arg, UCHAR_T *end);
1859 static void PREFIX(insert_op2) (re_opcode_t op, UCHAR_T *loc,
1860 int arg1, int arg2, UCHAR_T *end);
1861 static boolean PREFIX(at_begline_loc_p) (const CHAR_T *pattern,
1862 const CHAR_T *p,
1863 reg_syntax_t syntax);
1864 static boolean PREFIX(at_endline_loc_p) (const CHAR_T *p,
1865 const CHAR_T *pend,
1866 reg_syntax_t syntax);
1867 # ifdef WCHAR
1868 static reg_errcode_t wcs_compile_range (CHAR_T range_start,
1869 const CHAR_T **p_ptr,
1870 const CHAR_T *pend,
1871 char *translate,
1872 reg_syntax_t syntax,
1873 UCHAR_T *b,
1874 CHAR_T *char_set);
1875 static void insert_space (int num, CHAR_T *loc, CHAR_T *end);
1876 # else /* BYTE */
1877 static reg_errcode_t byte_compile_range (unsigned int range_start,
1878 const char **p_ptr,
1879 const char *pend,
1880 char *translate,
1881 reg_syntax_t syntax,
1882 unsigned char *b);
1883 # endif /* WCHAR */
1884
/* Read the next character of the uncompiled pattern into C, advancing
   `p' and applying `translate' when one is set.  Does
   `return REG_EEND' when the pattern is exhausted.  The character is
   also cast from the possibly signed char of the user's string to an
   unsigned type usable as an array index (e.g. into `translate').  */
/* With MBS_SUPPORT, only characters <= 0xff are translated, because it
   is impossible to allocate a 4GB array for encodings with a 4 byte
   character set such as UCS4.  */
# ifndef PATFETCH
#  ifdef WCHAR
#   define PATFETCH(c)							\
  do {									\
    if (p == pend)							\
      return REG_EEND;							\
    c = (UCHAR_T) *p++;							\
    if (translate && (c <= 0xff))					\
      c = (UCHAR_T) translate[c];					\
  } while (0)
#  else /* BYTE */
#   define PATFETCH(c)							\
  do {									\
    if (p == pend)							\
      return REG_EEND;							\
    c = (unsigned char) *p++;						\
    if (translate)							\
      c = (unsigned char) translate[c];					\
  } while (0)
#  endif /* WCHAR */
# endif

/* Same as PATFETCH, but without any translation.  */
# define PATFETCH_RAW(c)						\
  do {									\
    if (p == pend)							\
      return REG_EEND;							\
    c = (UCHAR_T) *p++;							\
  } while (0)

/* Step back one character in the pattern.  */
# define PATUNFETCH p--
1917
1918
/* Yield translate[D] when `translate' is non-null, and plain D
   otherwise.  The subscript is cast because some data is declared as
   `char *' (to avoid warnings when a string constant is passed), while
   a character used as a subscript must be unsigned.  */
/* With MBS_SUPPORT, only characters <= 0xff are translated, because it
   is impossible to allocate a 4GB array for encodings with a 4 byte
   character set such as UCS4.  */

# ifndef TRANSLATE
#  ifdef WCHAR
#   define TRANSLATE(d)							\
  (!(translate && ((UCHAR_T) (d)) <= 0xff)				\
   ? (d) : (char) translate[(unsigned char) (d)])
#  else /* BYTE */
#   define TRANSLATE(d)							\
  (!translate ? (char) (d) : (char) translate[(unsigned char) (d)])
#  endif /* WCHAR */
# endif
1937
1938
/* Macros that emit the compiled pattern into `buffer'.  */

/* Size used for the buffer when the caller did not supply one.  */
# define INIT_BUF_SIZE  (32 * sizeof(UCHAR_T))

/* Keep extending the buffer until N more elements fit at `b'.  */
# ifdef WCHAR
#  define GET_BUFFER_SPACE(n)						\
    while (((unsigned long)b - (unsigned long)COMPILED_BUFFER_VAR	\
	    + (n)*sizeof(CHAR_T)) > bufp->allocated)			\
      EXTEND_BUFFER ()
# else /* BYTE */
#  define GET_BUFFER_SPACE(n)						\
    while ((unsigned long) (b - bufp->buffer + (n)) > bufp->allocated)	\
      EXTEND_BUFFER ()
# endif /* WCHAR */

/* Reserve room for one element, then append C.  */
# define BUF_PUSH(c)							\
  do									\
    {									\
      GET_BUFFER_SPACE (1);						\
      *b++ = (UCHAR_T) (c);						\
    }									\
  while (0)


/* Reserve room for two elements, then append C1 and C2.  */
# define BUF_PUSH_2(c1, c2)						\
  do									\
    {									\
      GET_BUFFER_SPACE (2);						\
      *b++ = (UCHAR_T) (c1);						\
      *b++ = (UCHAR_T) (c2);						\
    }									\
  while (0)


/* Reserve room for three elements, then append C1, C2 and C3.  */
# define BUF_PUSH_3(c1, c2, c3)						\
  do									\
    {									\
      GET_BUFFER_SPACE (3);						\
      *b++ = (UCHAR_T) (c1);						\
      *b++ = (UCHAR_T) (c2);						\
      *b++ = (UCHAR_T) (c3);						\
    }									\
  while (0)
1981
/* Relative offset stored for a jump located at LOC whose target is TO:
   the distance between them, less the space the jump itself occupies
   (one opcode plus OFFSET_ADDRESS_SIZE).  */
# define JUMP_RELATIVE_OFFSET(loc, to)					\
  ((int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)))

/* Store a jump with opcode OP at LOC to location TO.  */
# define STORE_JUMP(op, loc, to)					\
  PREFIX(store_op1) (op, loc, JUMP_RELATIVE_OFFSET (loc, to))

/* The same for a jump that carries the extra argument ARG.  */
# define STORE_JUMP2(op, loc, to, arg)					\
  PREFIX(store_op2) (op, loc, JUMP_RELATIVE_OFFSET (loc, to), arg)

/* Like `STORE_JUMP', but for inserting.  Assume `b' is the buffer end.  */
# define INSERT_JUMP(op, loc, to)					\
  PREFIX(insert_op1) (op, loc, JUMP_RELATIVE_OFFSET (loc, to), b)

/* Like `STORE_JUMP2', but for inserting.  Assume `b' is the buffer end.  */
# define INSERT_JUMP2(op, loc, to, arg)					\
  PREFIX(insert_op2) (op, loc, JUMP_RELATIVE_OFFSET (loc, to),		\
		      arg, b)
1999
2000 /* This is not an arbitrary limit: the arguments which represent offsets
2001 into the pattern are two bytes long. So if 2^16 bytes turns out to
2002 be too small, many things would have to change. */
2003 /* Any other compiler which, like MSC, has allocation limit below 2^16
2004 bytes will have to use approach similar to what was done below for
2005 MSC and drop MAX_BUF_SIZE a bit. Otherwise you may end up
2006 reallocating to 0 bytes. Such thing is not going to work too well.
2007 You have been warned!! */
# ifndef DEFINED_ONCE
#  if defined (_MSC_VER) && !defined (WIN32)
/* 16-bit Microsoft C cannot malloc more than roughly 65512 bytes, so the
   ceiling is lowered.  The size_t cast in REALLOC only silences
   conversion warnings; it is not needed for correctness.  */
#   define MAX_BUF_SIZE 65500L
#   define REALLOC(p,s) realloc ((p), (size_t) (s))
#  else
#   define MAX_BUF_SIZE (1L << 16)
#   define REALLOC(p,s) realloc ((p), (s))
#  endif

/* Support macros for EXTEND_BUFFER, which doubles the compiled-pattern
   buffer (capped at MAX_BUF_SIZE) and then re-aims every pointer that
   referred to the old block.  They rely on names that EXTEND_BUFFER and
   its caller provide: `incr', `bufp', `b', `begalt', `fixup_alt_jump',
   `laststart' and `pending_exact'.  */
#  if __BOUNDED_POINTERS__
/* Reset P's upper bound to the end of the (re)allocated buffer.  */
#   define SET_HIGH_BOUND(P) (__ptrhigh (P) = __ptrlow (P) + bufp->allocated)
/* Shift P's low bound and value by `incr' and refresh its high bound.  */
#   define MOVE_BUFFER_POINTER(P) \
  (__ptrlow (P) += incr, SET_HIGH_BOUND (P), __ptrvalue (P) += incr)
/* `else' arm for the "buffer moved?" test in EXTEND_BUFFER: the block
   stayed in place, so only the high bounds need refreshing.  */
#   define ELSE_EXTEND_BUFFER_HIGH_BOUND \
  else \
    { \
      SET_HIGH_BOUND (b); \
      SET_HIGH_BOUND (begalt); \
      if (fixup_alt_jump) \
	SET_HIGH_BOUND (fixup_alt_jump); \
      if (laststart) \
	SET_HIGH_BOUND (laststart); \
      if (pending_exact) \
	SET_HIGH_BOUND (pending_exact); \
    }
#  else
/* Plain pointers: relocation is a simple displacement and there is
   nothing to do when the block did not move.  */
#   define MOVE_BUFFER_POINTER(P) (P) += incr
#   define ELSE_EXTEND_BUFFER_HIGH_BOUND
#  endif
# endif /* not DEFINED_ONCE */
2045
2046 # ifdef WCHAR
/* Wide-character build: grow the compiled-pattern buffer.

   The byte size is doubled, capped at MAX_BUF_SIZE and rounded down to a
   whole number of UCHAR_T elements (never fewer than one).  On success
   COMPILED_BUFFER_VAR, bufp->buffer and bufp->allocated describe the new
   block, and `b', `begalt' and the non-null ones among `fixup_alt_jump',
   `laststart' and `pending_exact' are re-aimed into it.

   Makes the enclosing function return REG_ESIZE when the buffer cannot
   grow any further, or REG_ESPACE when the reallocation fails.  On either
   failure the old block and its recorded size are left untouched, so the
   owner of BUFP can still free it (realloc does not release the block
   when it fails).

   The relocation distance is held in a `long' rather than an `int' so it
   is not truncated where pointers are wider than int.

   NOTE(review): these early returns do not release compile_stack.stack or
   the converted pattern; FREE_STACK_RETURN may be the right exit, but
   confirm every expansion site before switching.  */
# define EXTEND_BUFFER() \
  do { \
    UCHAR_T *old_buffer = COMPILED_BUFFER_VAR; \
    UCHAR_T *new_buffer; \
    unsigned long new_allocated; \
    unsigned long wchar_count; \
    if (bufp->allocated + sizeof(UCHAR_T) > MAX_BUF_SIZE) \
      return REG_ESIZE; \
    new_allocated = bufp->allocated << 1; \
    if (new_allocated > MAX_BUF_SIZE) \
      new_allocated = MAX_BUF_SIZE; \
    /* How many characters can the new buffer hold? */ \
    wchar_count = new_allocated / sizeof(UCHAR_T); \
    if (wchar_count == 0) wchar_count = 1; \
    /* Truncate the byte size to a whole number of elements. */ \
    new_allocated = wchar_count * sizeof(UCHAR_T); \
    new_buffer = (UCHAR_T *) REALLOC (old_buffer, new_allocated); \
    if (new_buffer == NULL) \
      return REG_ESPACE; \
    /* Commit only once the allocation is known to have succeeded. */ \
    COMPILED_BUFFER_VAR = new_buffer; \
    bufp->buffer = (char *) COMPILED_BUFFER_VAR; \
    bufp->allocated = new_allocated; \
    /* If the buffer moved, move all the pointers into it. */ \
    if (old_buffer != COMPILED_BUFFER_VAR) \
      { \
	long incr = COMPILED_BUFFER_VAR - old_buffer; \
	MOVE_BUFFER_POINTER (b); \
	MOVE_BUFFER_POINTER (begalt); \
	if (fixup_alt_jump) \
	  MOVE_BUFFER_POINTER (fixup_alt_jump); \
	if (laststart) \
	  MOVE_BUFFER_POINTER (laststart); \
	if (pending_exact) \
	  MOVE_BUFFER_POINTER (pending_exact); \
      } \
    ELSE_EXTEND_BUFFER_HIGH_BOUND \
  } while (0)
2080 # else /* BYTE */
/* Byte build: grow the compiled-pattern buffer.

   The size is doubled and capped at MAX_BUF_SIZE.  On success bufp->buffer
   and bufp->allocated describe the new block, and `b', `begalt' and the
   non-null ones among `fixup_alt_jump', `laststart' and `pending_exact'
   are re-aimed into it.

   Makes the enclosing function return REG_ESIZE when the buffer is
   already at (or beyond) MAX_BUF_SIZE, or REG_ESPACE when the
   reallocation fails.  On either failure the old block and its recorded
   size are left untouched, so the owner of BUFP can still free it
   (realloc does not release the block when it fails).

   The relocation distance is held in a `long' rather than an `int' so it
   is not truncated where pointers are wider than int.

   NOTE(review): these early returns do not release compile_stack.stack;
   FREE_STACK_RETURN may be the right exit, but confirm every expansion
   site before switching.  */
# define EXTEND_BUFFER() \
  do { \
    UCHAR_T *old_buffer = COMPILED_BUFFER_VAR; \
    UCHAR_T *new_buffer; \
    unsigned long new_allocated; \
    /* `>=' so an oversized caller buffer is never shrunk first. */ \
    if (bufp->allocated >= MAX_BUF_SIZE) \
      return REG_ESIZE; \
    new_allocated = bufp->allocated << 1; \
    if (new_allocated > MAX_BUF_SIZE) \
      new_allocated = MAX_BUF_SIZE; \
    new_buffer = (UCHAR_T *) REALLOC (old_buffer, new_allocated); \
    if (new_buffer == NULL) \
      return REG_ESPACE; \
    /* Commit only once the allocation is known to have succeeded. */ \
    bufp->buffer = new_buffer; \
    bufp->allocated = new_allocated; \
    /* If the buffer moved, move all the pointers into it. */ \
    if (old_buffer != COMPILED_BUFFER_VAR) \
      { \
	long incr = COMPILED_BUFFER_VAR - old_buffer; \
	MOVE_BUFFER_POINTER (b); \
	MOVE_BUFFER_POINTER (begalt); \
	if (fixup_alt_jump) \
	  MOVE_BUFFER_POINTER (fixup_alt_jump); \
	if (laststart) \
	  MOVE_BUFFER_POINTER (laststart); \
	if (pending_exact) \
	  MOVE_BUFFER_POINTER (pending_exact); \
      } \
    ELSE_EXTEND_BUFFER_HIGH_BOUND \
  } while (0)
2108 # endif /* WCHAR */
2109
# ifndef DEFINED_ONCE
/* The register-number operand of {start,stop}_memory is a single byte,
   so this is the highest group number that can be reported on.  */
#  define MAX_REGNUM 255

/* Group counter.  A pattern may contain more than MAX_REGNUM groups; the
   surplus ones are simply ignored.  */
typedef unsigned int regnum_t;


/* Compile-stack support.  */

/* Signed offset into the compiled pattern.  Offsets run both forwards and
   backwards, so the range -(MAX_BUF_SIZE - 1) .. MAX_BUF_SIZE - 1 must be
   representable; `int' is too narrow for that when sizeof (int) == 2.  */
typedef long int pattern_offset_t;

/* One compile-stack entry: four pattern offsets plus the number of the
   group they were recorded for.  */
typedef struct
{
  pattern_offset_t begalt_offset;
  pattern_offset_t fixup_alt_jump;
  pattern_offset_t inner_group_offset;
  pattern_offset_t laststart_offset;
  regnum_t regnum;
} compile_stack_elt_t;


/* The compile stack itself: an array of entries with its capacity and
   fill level.  */
typedef struct
{
  compile_stack_elt_t *stack;
  unsigned int size;
  unsigned int avail;	/* Index of the next free slot.  */
} compile_stack_type;


/* Number of entries the compile stack starts out with.  */
#  define INIT_COMPILE_STACK_SIZE 32

/* Predicates on the variable named `compile_stack' in the current scope.  */
#  define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
#  define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)

/* The slot at index `avail', i.e. the next one to be filled.  */
#  define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])

# endif /* not DEFINED_ONCE */
2155
/* Set the bit for character C in the bitmap that starts at `b'.
   C is reduced to `unsigned char' first; byte C / BYTEWIDTH of the map
   receives bit C % BYTEWIDTH.  Both uses of the argument are
   parenthesized so that an expression argument is converted as a whole
   (the cast would otherwise bind to its first operand only).  C is
   evaluated twice, so it must be free of side effects.  */
# ifndef DEFINED_ONCE
#  define SET_LIST_BIT(c) \
  (b[((unsigned char) (c)) / BYTEWIDTH] \
   |= 1 << (((unsigned char) (c)) % BYTEWIDTH))
# endif /* DEFINED_ONCE */
2162
/* Read an unsigned decimal number from the uncompiled pattern into NUM.

   Characters are fetched with PATFETCH into `c' until `p' reaches `pend'
   or a non-digit has been fetched; that non-digit stays in `c' and is
   consumed.  NUM is left untouched when no digit is read; a negative NUM
   is reset to 0 before the first digit is added.  Once NUM exceeds
   RE_DUP_MAX further digits are still consumed but no longer accumulated,
   which keeps NUM from overflowing while leaving it out of range.

   NUM must be a side-effect-free lvalue (it is evaluated several times).
   It is parenthesized at every use so any lvalue expression works.  The
   expansion is a bare braced block, as before, so existing call sites
   are unaffected.  */
# define GET_UNSIGNED_NUMBER(num) \
  { \
    while (p != pend) \
      { \
	PATFETCH (c); \
	if (c < '0' || c > '9') \
	  break; \
	if ((num) <= RE_DUP_MAX) \
	  { \
	    if ((num) < 0) \
	      (num) = 0; \
	    (num) = (num) * 10 + c - '0'; \
	  } \
      } \
  }
2179
# ifndef DEFINED_ONCE
#  if defined _LIBC || WIDE_CHAR_SUPPORT
/* With the GNU C library, or wherever the ISO C Amendment 1 wide-character
   functions are available, character-class names (including user-defined
   ones) are resolved by the library.  */
#   ifdef CHARCLASS_NAME_MAX
#    define CHAR_CLASS_MAX_LENGTH CHARCLASS_NAME_MAX
#   else
/* CHARCLASS_NAME_MAX ought to exist, but some implementations still lack
   it; fall back to a generous limit.  */
#    define CHAR_CLASS_MAX_LENGTH 256
#   endif

#   ifdef _LIBC
#    define IS_CHAR_CLASS(string) __wctype (string)
#   else
#    define IS_CHAR_CLASS(string) wctype (string)
#   endif
#  else
/* No library support: only the twelve names listed below are accepted.
   The longest of them, `xdigit', has six characters.  */
#   define CHAR_CLASS_MAX_LENGTH 6

#   define IS_CHAR_CLASS(string) \
  (STREQ (string, "alpha") \
   || STREQ (string, "upper") \
   || STREQ (string, "lower") \
   || STREQ (string, "digit") \
   || STREQ (string, "alnum") \
   || STREQ (string, "xdigit") \
   || STREQ (string, "space") \
   || STREQ (string, "print") \
   || STREQ (string, "punct") \
   || STREQ (string, "graph") \
   || STREQ (string, "cntrl") \
   || STREQ (string, "blank"))
#  endif
# endif /* DEFINED_ONCE */
2209
2210 # ifndef MATCH_MAY_ALLOCATE
2212
2213 /* If we cannot allocate large objects within re_match_2_internal,
2214 we make the fail stack and register vectors global.
2215 The fail stack, we grow to the maximum size when a regexp
2216 is compiled.
2217 The register vectors, we adjust in size each time we
2218 compile a regexp, according to the number of registers it needs. */
2219
2220 static PREFIX(fail_stack_type) fail_stack;
2221
2222 /* Size with which the following vectors are currently allocated.
2223 That is so we can make them bigger as needed,
2224 but never make them smaller. */
2225 # ifdef DEFINED_ONCE
2226 static int regs_allocated_size;
2227
2228 static const char ** regstart, ** regend;
2229 static const char ** old_regstart, ** old_regend;
2230 static const char **best_regstart, **best_regend;
2231 static const char **reg_dummy;
2232 # endif /* DEFINED_ONCE */
2233
2234 static PREFIX(register_info_type) *PREFIX(reg_info);
2235 static PREFIX(register_info_type) *PREFIX(reg_info_dummy);
2236
2237 /* Make the register vectors big enough for NUM_REGS registers,
2238 but don't make them smaller. */
2239
2240 static void
2241 PREFIX(regex_grow_registers) (int num_regs)
2242 {
2243 if (num_regs > regs_allocated_size)
2244 {
2245 RETALLOC_IF (regstart, num_regs, const char *);
2246 RETALLOC_IF (regend, num_regs, const char *);
2247 RETALLOC_IF (old_regstart, num_regs, const char *);
2248 RETALLOC_IF (old_regend, num_regs, const char *);
2249 RETALLOC_IF (best_regstart, num_regs, const char *);
2250 RETALLOC_IF (best_regend, num_regs, const char *);
2251 RETALLOC_IF (PREFIX(reg_info), num_regs, PREFIX(register_info_type));
2252 RETALLOC_IF (reg_dummy, num_regs, const char *);
2253 RETALLOC_IF (PREFIX(reg_info_dummy), num_regs, PREFIX(register_info_type));
2254
2255 regs_allocated_size = num_regs;
2256 }
2257 }
2258
2259 # endif /* not MATCH_MAY_ALLOCATE */
2260
2261 # ifndef DEFINED_ONCE
2263 static boolean group_in_compile_stack (compile_stack_type compile_stack,
2264 regnum_t regnum);
2265 # endif /* not DEFINED_ONCE */
2266
2267 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
2268 Returns one of error codes defined in `regex.h', or zero for success.
2269
2270 Assumes the `allocated' (and perhaps `buffer') and `translate'
2271 fields are set in BUFP on entry.
2272
2273 If it succeeds, results are put in BUFP (if it returns an error, the
2274 contents of BUFP are undefined):
2275 `buffer' is the compiled pattern;
2276 `syntax' is set to SYNTAX;
2277 `used' is set to the length of the compiled pattern;
2278 `fastmap_accurate' is zero;
2279 `re_nsub' is the number of subexpressions in PATTERN;
2280 `not_bol' and `not_eol' are zero;
2281
2282 The `fastmap' and `newline_anchor' fields are neither
2283 examined nor set. */
2284
/* Leave the enclosing function with VALUE after releasing the storage it
   allocated: the compile stack and, in the wide-character build, also the
   converted pattern with its `mbs_offset' and `is_binary' companions.  */
# ifdef WCHAR
#  define FREE_STACK_RETURN(value) \
  return (free(pattern), \
	  free(mbs_offset), \
	  free(is_binary), \
	  free (compile_stack.stack), \
	  (value))
# else
#  define FREE_STACK_RETURN(value) \
  return (free (compile_stack.stack), (value))
# endif /* WCHAR */
2293
2294 static reg_errcode_t
2295 PREFIX(regex_compile) (const char *ARG_PREFIX(pattern),
2296 size_t ARG_PREFIX(size), reg_syntax_t syntax,
2297 struct re_pattern_buffer *bufp)
2298 {
2299 /* We fetch characters from PATTERN here. Even though PATTERN is
2300 `char *' (i.e., signed), we declare these variables as unsigned, so
2301 they can be reliably used as array indices. */
2302 register UCHAR_T c, c1;
2303
2304 #ifdef WCHAR
2305 /* A temporary space to keep wchar_t pattern and compiled pattern. */
2306 CHAR_T *pattern, *COMPILED_BUFFER_VAR;
2307 size_t size;
2308 /* offset buffer for optimization. See convert_mbs_to_wc. */
2309 int *mbs_offset = NULL;
2310 /* It hold whether each wchar_t is binary data or not. */
2311 char *is_binary = NULL;
2312 /* A flag whether exactn is handling binary data or not. */
2313 char is_exactn_bin = FALSE;
2314 #endif /* WCHAR */
2315
2316 /* A random temporary spot in PATTERN. */
2317 const CHAR_T *p1;
2318
2319 /* Points to the end of the buffer, where we should append. */
2320 register UCHAR_T *b;
2321
2322 /* Keeps track of unclosed groups. */
2323 compile_stack_type compile_stack;
2324
2325 /* Points to the current (ending) position in the pattern. */
2326 #ifdef WCHAR
2327 const CHAR_T *p;
2328 const CHAR_T *pend;
2329 #else /* BYTE */
2330 const CHAR_T *p = pattern;
2331 const CHAR_T *pend = pattern + size;
2332 #endif /* WCHAR */
2333
2334 /* How to translate the characters in the pattern. */
2335 RE_TRANSLATE_TYPE translate = bufp->translate;
2336
2337 /* Address of the count-byte of the most recently inserted `exactn'
2338 command. This makes it possible to tell if a new exact-match
2339 character can be added to that command or if the character requires
2340 a new `exactn' command. */
2341 UCHAR_T *pending_exact = 0;
2342
2343 /* Address of start of the most recently finished expression.
2344 This tells, e.g., postfix * where to find the start of its
2345 operand. Reset at the beginning of groups and alternatives. */
2346 UCHAR_T *laststart = 0;
2347
2348 /* Address of beginning of regexp, or inside of last group. */
2349 UCHAR_T *begalt;
2350
2351 /* Address of the place where a forward jump should go to the end of
2352 the containing expression. Each alternative of an `or' -- except the
2353 last -- ends with a forward jump of this sort. */
2354 UCHAR_T *fixup_alt_jump = 0;
2355
2356 /* Counts open-groups as they are encountered. Remembered for the
2357 matching close-group on the compile stack, so the same register
2358 number is put in the stop_memory as the start_memory. */
2359 regnum_t regnum = 0;
2360
2361 #ifdef WCHAR
2362 /* Initialize the wchar_t PATTERN and offset_buffer. */
2363 p = pend = pattern = TALLOC(csize + 1, CHAR_T);
2364 mbs_offset = TALLOC(csize + 1, int);
2365 is_binary = TALLOC(csize + 1, char);
2366 if (pattern == NULL || mbs_offset == NULL || is_binary == NULL)
2367 {
2368 free(pattern);
2369 free(mbs_offset);
2370 free(is_binary);
2371 return REG_ESPACE;
2372 }
2373 pattern[csize] = L'\0'; /* sentinel */
2374 size = convert_mbs_to_wcs(pattern, cpattern, csize, mbs_offset, is_binary);
2375 pend = p + size;
2376 if (size < 0)
2377 {
2378 free(pattern);
2379 free(mbs_offset);
2380 free(is_binary);
2381 return REG_BADPAT;
2382 }
2383 #endif
2384
2385 #ifdef DEBUG
2386 DEBUG_PRINT1 ("\nCompiling pattern: ");
2387 if (debug)
2388 {
2389 unsigned debug_count;
2390
2391 for (debug_count = 0; debug_count < size; debug_count++)
2392 PUT_CHAR (pattern[debug_count]);
2393 putchar ('\n');
2394 }
2395 #endif /* DEBUG */
2396
2397 /* Initialize the compile stack. */
2398 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
2399 if (compile_stack.stack == NULL)
2400 {
2401 #ifdef WCHAR
2402 free(pattern);
2403 free(mbs_offset);
2404 free(is_binary);
2405 #endif
2406 return REG_ESPACE;
2407 }
2408
2409 compile_stack.size = INIT_COMPILE_STACK_SIZE;
2410 compile_stack.avail = 0;
2411
2412 /* Initialize the pattern buffer. */
2413 bufp->syntax = syntax;
2414 bufp->fastmap_accurate = 0;
2415 bufp->not_bol = bufp->not_eol = 0;
2416
2417 /* Set `used' to zero, so that if we return an error, the pattern
2418 printer (for debugging) will think there's no pattern. We reset it
2419 at the end. */
2420 bufp->used = 0;
2421
2422 /* Always count groups, whether or not bufp->no_sub is set. */
2423 bufp->re_nsub = 0;
2424
2425 #if !defined emacs && !defined SYNTAX_TABLE
2426 /* Initialize the syntax table. */
2427 init_syntax_once ();
2428 #endif
2429
2430 if (bufp->allocated == 0)
2431 {
2432 if (bufp->buffer)
2433 { /* If zero allocated, but buffer is non-null, try to realloc
2434 enough space. This loses if buffer's address is bogus, but
2435 that is the user's responsibility. */
2436 #ifdef WCHAR
2437 /* Free bufp->buffer and allocate an array for wchar_t pattern
2438 buffer. */
2439 free(bufp->buffer);
2440 COMPILED_BUFFER_VAR = TALLOC (INIT_BUF_SIZE/sizeof(UCHAR_T),
2441 UCHAR_T);
2442 #else
2443 RETALLOC (COMPILED_BUFFER_VAR, INIT_BUF_SIZE, UCHAR_T);
2444 #endif /* WCHAR */
2445 }
2446 else
2447 { /* Caller did not allocate a buffer. Do it for them. */
2448 COMPILED_BUFFER_VAR = TALLOC (INIT_BUF_SIZE / sizeof(UCHAR_T),
2449 UCHAR_T);
2450 }
2451
2452 if (!COMPILED_BUFFER_VAR) FREE_STACK_RETURN (REG_ESPACE);
2453 #ifdef WCHAR
2454 bufp->buffer = (char*)COMPILED_BUFFER_VAR;
2455 #endif /* WCHAR */
2456 bufp->allocated = INIT_BUF_SIZE;
2457 }
2458 #ifdef WCHAR
2459 else
2460 COMPILED_BUFFER_VAR = (UCHAR_T*) bufp->buffer;
2461 #endif
2462
2463 begalt = b = COMPILED_BUFFER_VAR;
2464
2465 /* Loop through the uncompiled pattern until we're at the end. */
2466 while (p != pend)
2467 {
2468 PATFETCH (c);
2469
2470 switch (c)
2471 {
2472 case '^':
2473 {
2474 if ( /* If at start of pattern, it's an operator. */
2475 p == pattern + 1
2476 /* If context independent, it's an operator. */
2477 || syntax & RE_CONTEXT_INDEP_ANCHORS
2478 /* Otherwise, depends on what's come before. */
2479 || PREFIX(at_begline_loc_p) (pattern, p, syntax))
2480 BUF_PUSH (begline);
2481 else
2482 goto normal_char;
2483 }
2484 break;
2485
2486
2487 case '$':
2488 {
2489 if ( /* If at end of pattern, it's an operator. */
2490 p == pend
2491 /* If context independent, it's an operator. */
2492 || syntax & RE_CONTEXT_INDEP_ANCHORS
2493 /* Otherwise, depends on what's next. */
2494 || PREFIX(at_endline_loc_p) (p, pend, syntax))
2495 BUF_PUSH (endline);
2496 else
2497 goto normal_char;
2498 }
2499 break;
2500
2501
2502 case '+':
2503 case '?':
2504 if ((syntax & RE_BK_PLUS_QM)
2505 || (syntax & RE_LIMITED_OPS))
2506 goto normal_char;
2507 handle_plus:
2508 case '*':
2509 /* If there is no previous pattern... */
2510 if (!laststart)
2511 {
2512 if (syntax & RE_CONTEXT_INVALID_OPS)
2513 FREE_STACK_RETURN (REG_BADRPT);
2514 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
2515 goto normal_char;
2516 }
2517
2518 {
2519 /* Are we optimizing this jump? */
2520 boolean keep_string_p = false;
2521
2522 /* 1 means zero (many) matches is allowed. */
2523 char zero_times_ok = 0, many_times_ok = 0;
2524
2525 /* If there is a sequence of repetition chars, collapse it
2526 down to just one (the right one). We can't combine
2527 interval operators with these because of, e.g., `a{2}*',
2528 which should only match an even number of `a's. */
2529
2530 for (;;)
2531 {
2532 zero_times_ok |= c != '+';
2533 many_times_ok |= c != '?';
2534
2535 if (p == pend)
2536 break;
2537
2538 PATFETCH (c);
2539
2540 if (c == '*'
2541 || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?')))
2542 ;
2543
2544 else if (syntax & RE_BK_PLUS_QM && c == '\\')
2545 {
2546 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2547
2548 PATFETCH (c1);
2549 if (!(c1 == '+' || c1 == '?'))
2550 {
2551 PATUNFETCH;
2552 PATUNFETCH;
2553 break;
2554 }
2555
2556 c = c1;
2557 }
2558 else
2559 {
2560 PATUNFETCH;
2561 break;
2562 }
2563
2564 /* If we get here, we found another repeat character. */
2565 }
2566
2567 /* Star, etc. applied to an empty pattern is equivalent
2568 to an empty pattern. */
2569 if (!laststart)
2570 break;
2571
2572 /* Now we know whether or not zero matches is allowed
2573 and also whether or not two or more matches is allowed. */
2574 if (many_times_ok)
2575 { /* More than one repetition is allowed, so put in at the
2576 end a backward relative jump from `b' to before the next
2577 jump we're going to put in below (which jumps from
2578 laststart to after this jump).
2579
2580 But if we are at the `*' in the exact sequence `.*\n',
2581 insert an unconditional jump backwards to the .,
2582 instead of the beginning of the loop. This way we only
2583 push a failure point once, instead of every time
2584 through the loop. */
2585 assert (p - 1 > pattern);
2586
2587 /* Allocate the space for the jump. */
2588 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
2589
2590 /* We know we are not at the first character of the pattern,
2591 because laststart was nonzero. And we've already
2592 incremented `p', by the way, to be the character after
2593 the `*'. Do we have to do something analogous here
2594 for null bytes, because of RE_DOT_NOT_NULL? */
2595 if (TRANSLATE (*(p - 2)) == TRANSLATE ('.')
2596 && zero_times_ok
2597 && p < pend && TRANSLATE (*p) == TRANSLATE ('\n')
2598 && !(syntax & RE_DOT_NEWLINE))
2599 { /* We have .*\n. */
2600 STORE_JUMP (jump, b, laststart);
2601 keep_string_p = true;
2602 }
2603 else
2604 /* Anything else. */
2605 STORE_JUMP (maybe_pop_jump, b, laststart -
2606 (1 + OFFSET_ADDRESS_SIZE));
2607
2608 /* We've added more stuff to the buffer. */
2609 b += 1 + OFFSET_ADDRESS_SIZE;
2610 }
2611
2612 /* On failure, jump from laststart to b + 3, which will be the
2613 end of the buffer after this jump is inserted. */
2614 /* ifdef WCHAR, 'b + 1 + OFFSET_ADDRESS_SIZE' instead of
2615 'b + 3'. */
2616 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
2617 INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump
2618 : on_failure_jump,
2619 laststart, b + 1 + OFFSET_ADDRESS_SIZE);
2620 pending_exact = 0;
2621 b += 1 + OFFSET_ADDRESS_SIZE;
2622
2623 if (!zero_times_ok)
2624 {
2625 /* At least one repetition is required, so insert a
2626 `dummy_failure_jump' before the initial
2627 `on_failure_jump' instruction of the loop. This
2628 effects a skip over that instruction the first time
2629 we hit that loop. */
2630 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
2631 INSERT_JUMP (dummy_failure_jump, laststart, laststart +
2632 2 + 2 * OFFSET_ADDRESS_SIZE);
2633 b += 1 + OFFSET_ADDRESS_SIZE;
2634 }
2635 }
2636 break;
2637
2638
2639 case '.':
2640 laststart = b;
2641 BUF_PUSH (anychar);
2642 break;
2643
2644
2645 case '[':
2646 {
2647 boolean had_char_class = false;
2648 #ifdef WCHAR
2649 CHAR_T range_start = 0xffffffff;
2650 #else
2651 unsigned int range_start = 0xffffffff;
2652 #endif
2653 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2654
2655 #ifdef WCHAR
2656 /* We assume a charset(_not) structure as a wchar_t array.
2657 charset[0] = (re_opcode_t) charset(_not)
2658 charset[1] = l (= length of char_classes)
2659 charset[2] = m (= length of collating_symbols)
2660 charset[3] = n (= length of equivalence_classes)
2661 charset[4] = o (= length of char_ranges)
2662 charset[5] = p (= length of chars)
2663
2664 charset[6] = char_class (wctype_t)
2665 charset[6+CHAR_CLASS_SIZE] = char_class (wctype_t)
2666 ...
2667 charset[l+5] = char_class (wctype_t)
2668
2669 charset[l+6] = collating_symbol (wchar_t)
2670 ...
2671 charset[l+m+5] = collating_symbol (wchar_t)
2672 ifdef _LIBC we use the index if
2673 _NL_COLLATE_SYMB_EXTRAMB instead of
2674 wchar_t string.
2675
2676 charset[l+m+6] = equivalence_classes (wchar_t)
2677 ...
2678 charset[l+m+n+5] = equivalence_classes (wchar_t)
2679 ifdef _LIBC we use the index in
2680 _NL_COLLATE_WEIGHT instead of
2681 wchar_t string.
2682
2683 charset[l+m+n+6] = range_start
2684 charset[l+m+n+7] = range_end
2685 ...
2686 charset[l+m+n+2o+4] = range_start
2687 charset[l+m+n+2o+5] = range_end
2688 ifdef _LIBC we use the value looked up
2689 in _NL_COLLATE_COLLSEQ instead of
2690 wchar_t character.
2691
2692 charset[l+m+n+2o+6] = char
2693 ...
2694 charset[l+m+n+2o+p+5] = char
2695
2696 */
2697
2698 /* We need at least 6 spaces: the opcode, the length of
2699 char_classes, the length of collating_symbols, the length of
2700 equivalence_classes, the length of char_ranges, the length of
2701 chars. */
2702 GET_BUFFER_SPACE (6);
2703
2704 /* Save b as laststart. And We use laststart as the pointer
2705 to the first element of the charset here.
2706 In other words, laststart[i] indicates charset[i]. */
2707 laststart = b;
2708
2709 /* We test `*p == '^' twice, instead of using an if
2710 statement, so we only need one BUF_PUSH. */
2711 BUF_PUSH (*p == '^' ? charset_not : charset);
2712 if (*p == '^')
2713 p++;
2714
2715 /* Push the length of char_classes, the length of
2716 collating_symbols, the length of equivalence_classes, the
2717 length of char_ranges and the length of chars. */
2718 BUF_PUSH_3 (0, 0, 0);
2719 BUF_PUSH_2 (0, 0);
2720
2721 /* Remember the first position in the bracket expression. */
2722 p1 = p;
2723
2724 /* charset_not matches newline according to a syntax bit. */
2725 if ((re_opcode_t) b[-6] == charset_not
2726 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2727 {
2728 BUF_PUSH('\n');
2729 laststart[5]++; /* Update the length of characters */
2730 }
2731
2732 /* Read in characters and ranges, setting map bits. */
2733 for (;;)
2734 {
2735 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2736
2737 PATFETCH (c);
2738
2739 /* \ might escape characters inside [...] and [^...]. */
2740 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2741 {
2742 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2743
2744 PATFETCH (c1);
2745 BUF_PUSH(c1);
2746 laststart[5]++; /* Update the length of chars */
2747 range_start = c1;
2748 continue;
2749 }
2750
2751 /* Could be the end of the bracket expression. If it's
2752 not (i.e., when the bracket expression is `[]' so
2753 far), the ']' character bit gets set way below. */
2754 if (c == ']' && p != p1 + 1)
2755 break;
2756
2757 /* Look ahead to see if it's a range when the last thing
2758 was a character class. */
2759 if (had_char_class && c == '-' && *p != ']')
2760 FREE_STACK_RETURN (REG_ERANGE);
2761
2762 /* Look ahead to see if it's a range when the last thing
2763 was a character: if this is a hyphen not at the
2764 beginning or the end of a list, then it's the range
2765 operator. */
2766 if (c == '-'
2767 && !(p - 2 >= pattern && p[-2] == '[')
2768 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
2769 && *p != ']')
2770 {
2771 reg_errcode_t ret;
2772 /* Allocate the space for range_start and range_end. */
2773 GET_BUFFER_SPACE (2);
2774 /* Update the pointer to indicate end of buffer. */
2775 b += 2;
2776 ret = wcs_compile_range (range_start, &p, pend, translate,
2777 syntax, b, laststart);
2778 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
2779 range_start = 0xffffffff;
2780 }
2781 else if (p[0] == '-' && p[1] != ']')
2782 { /* This handles ranges made up of characters only. */
2783 reg_errcode_t ret;
2784
2785 /* Move past the `-'. */
2786 PATFETCH (c1);
2787 /* Allocate the space for range_start and range_end. */
2788 GET_BUFFER_SPACE (2);
2789 /* Update the pointer to indicate end of buffer. */
2790 b += 2;
2791 ret = wcs_compile_range (c, &p, pend, translate, syntax, b,
2792 laststart);
2793 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
2794 range_start = 0xffffffff;
2795 }
2796
2797 /* See if we're at the beginning of a possible character
2798 class. */
2799 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
2800 { /* Leave room for the null. */
2801 char str[CHAR_CLASS_MAX_LENGTH + 1];
2802
2803 PATFETCH (c);
2804 c1 = 0;
2805
2806 /* If pattern is `[[:'. */
2807 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2808
2809 for (;;)
2810 {
2811 PATFETCH (c);
2812 if ((c == ':' && *p == ']') || p == pend)
2813 break;
2814 if (c1 < CHAR_CLASS_MAX_LENGTH)
2815 str[c1++] = c;
2816 else
2817 /* This is in any case an invalid class name. */
2818 str[0] = '\0';
2819 }
2820 str[c1] = '\0';
2821
2822 /* If isn't a word bracketed by `[:' and `:]':
2823 undo the ending character, the letters, and leave
2824 the leading `:' and `[' (but store them as character). */
2825 if (c == ':' && *p == ']')
2826 {
2827 wctype_t wt;
2828 uintptr_t alignedp;
2829
2830 /* Query the character class as wctype_t. */
2831 wt = IS_CHAR_CLASS (str);
2832 if (wt == 0)
2833 FREE_STACK_RETURN (REG_ECTYPE);
2834
2835 /* Throw away the ] at the end of the character
2836 class. */
2837 PATFETCH (c);
2838
2839 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2840
2841 /* Allocate the space for character class. */
2842 GET_BUFFER_SPACE(CHAR_CLASS_SIZE);
2843 /* Update the pointer to indicate end of buffer. */
2844 b += CHAR_CLASS_SIZE;
2845 /* Move data which follow character classes
2846 not to violate the data. */
2847 insert_space(CHAR_CLASS_SIZE,
2848 laststart + 6 + laststart[1],
2849 b - 1);
2850 alignedp = ((uintptr_t)(laststart + 6 + laststart[1])
2851 + __alignof__(wctype_t) - 1)
2852 & ~(uintptr_t)(__alignof__(wctype_t) - 1);
2853 /* Store the character class. */
2854 *((wctype_t*)alignedp) = wt;
2855 /* Update length of char_classes */
2856 laststart[1] += CHAR_CLASS_SIZE;
2857
2858 had_char_class = true;
2859 }
2860 else
2861 {
2862 c1++;
2863 while (c1--)
2864 PATUNFETCH;
2865 BUF_PUSH ('[');
2866 BUF_PUSH (':');
2867 laststart[5] += 2; /* Update the length of characters */
2868 range_start = ':';
2869 had_char_class = false;
2870 }
2871 }
2872 else if (syntax & RE_CHAR_CLASSES && c == '[' && (*p == '='
2873 || *p == '.'))
2874 {
2875 CHAR_T str[128]; /* Should be large enough. */
2876 CHAR_T delim = *p; /* '=' or '.' */
2877 # ifdef _LIBC
2878 uint32_t nrules =
2879 _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2880 # endif
2881 PATFETCH (c);
2882 c1 = 0;
2883
2884 /* If pattern is `[[=' or '[[.'. */
2885 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2886
2887 for (;;)
2888 {
2889 PATFETCH (c);
2890 if ((c == delim && *p == ']') || p == pend)
2891 break;
2892 if (c1 < sizeof (str) - 1)
2893 str[c1++] = c;
2894 else
2895 /* This is in any case an invalid class name. */
2896 str[0] = '\0';
2897 }
2898 str[c1] = '\0';
2899
2900 if (c == delim && *p == ']' && str[0] != '\0')
2901 {
2902 unsigned int i, offset;
2903 /* If we have no collation data we use the default
2904 collation in which each character is in a class
2905 by itself. It also means that ASCII is the
2906 character set and therefore we cannot have character
2907 with more than one byte in the multibyte
2908 representation. */
2909
2910 /* If not defined _LIBC, we push the name and
2911 `\0' for the sake of matching performance. */
2912 int datasize = c1 + 1;
2913
2914 # ifdef _LIBC
2915 int32_t idx = 0;
2916 if (nrules == 0)
2917 # endif
2918 {
2919 if (c1 != 1)
2920 FREE_STACK_RETURN (REG_ECOLLATE);
2921 }
2922 # ifdef _LIBC
2923 else
2924 {
2925 const int32_t *table;
2926 const int32_t *weights;
2927 const int32_t *extra;
2928 const int32_t *indirect;
2929 wint_t *cp;
2930
2931 /* This #include defines a local function! */
2932 # include <locale/weightwc.h>
2933
2934 if(delim == '=')
2935 {
2936 /* We push the index for equivalence class. */
2937 cp = (wint_t*)str;
2938
2939 table = (const int32_t *)
2940 _NL_CURRENT (LC_COLLATE,
2941 _NL_COLLATE_TABLEWC);
2942 weights = (const int32_t *)
2943 _NL_CURRENT (LC_COLLATE,
2944 _NL_COLLATE_WEIGHTWC);
2945 extra = (const int32_t *)
2946 _NL_CURRENT (LC_COLLATE,
2947 _NL_COLLATE_EXTRAWC);
2948 indirect = (const int32_t *)
2949 _NL_CURRENT (LC_COLLATE,
2950 _NL_COLLATE_INDIRECTWC);
2951
2952 idx = findidx ((const wint_t**)&cp);
2953 if (idx == 0 || cp < (wint_t*) str + c1)
2954 /* This is no valid character. */
2955 FREE_STACK_RETURN (REG_ECOLLATE);
2956
2957 str[0] = (wchar_t)idx;
2958 }
2959 else /* delim == '.' */
2960 {
2961 /* We push collation sequence value
2962 for collating symbol. */
2963 int32_t table_size;
2964 const int32_t *symb_table;
2965 const unsigned char *extra;
2966 int32_t idx;
2967 int32_t elem;
2968 int32_t second;
2969 int32_t hash;
2970 char char_str[c1];
2971
2972 /* We have to convert the name to a single-byte
2973 string. This is possible since the names
2974 consist of ASCII characters and the internal
2975 representation is UCS4. */
2976 for (i = 0; i < c1; ++i)
2977 char_str[i] = str[i];
2978
2979 table_size =
2980 _NL_CURRENT_WORD (LC_COLLATE,
2981 _NL_COLLATE_SYMB_HASH_SIZEMB);
2982 symb_table = (const int32_t *)
2983 _NL_CURRENT (LC_COLLATE,
2984 _NL_COLLATE_SYMB_TABLEMB);
2985 extra = (const unsigned char *)
2986 _NL_CURRENT (LC_COLLATE,
2987 _NL_COLLATE_SYMB_EXTRAMB);
2988
2989 /* Locate the character in the hashing table. */
2990 hash = elem_hash (char_str, c1);
2991
2992 idx = 0;
2993 elem = hash % table_size;
2994 second = hash % (table_size - 2);
2995 while (symb_table[2 * elem] != 0)
2996 {
2997 /* First compare the hashing value. */
2998 if (symb_table[2 * elem] == hash
2999 && c1 == extra[symb_table[2 * elem + 1]]
3000 && memcmp (char_str,
3001 &extra[symb_table[2 * elem + 1]
3002 + 1], c1) == 0)
3003 {
3004 /* Yep, this is the entry. */
3005 idx = symb_table[2 * elem + 1];
3006 idx += 1 + extra[idx];
3007 break;
3008 }
3009
3010 /* Next entry. */
3011 elem += second;
3012 }
3013
3014 if (symb_table[2 * elem] != 0)
3015 {
3016 /* Compute the index of the byte sequence
3017 in the table. */
3018 idx += 1 + extra[idx];
3019 /* Adjust for the alignment. */
3020 idx = (idx + 3) & ~3;
3021
3022 str[0] = (wchar_t) idx + 4;
3023 }
3024 else if (symb_table[2 * elem] == 0 && c1 == 1)
3025 {
3026 /* No valid character. Match it as a
3027 single byte character. */
3028 had_char_class = false;
3029 BUF_PUSH(str[0]);
3030 /* Update the length of characters */
3031 laststart[5]++;
3032 range_start = str[0];
3033
3034 /* Throw away the ] at the end of the
3035 collating symbol. */
3036 PATFETCH (c);
3037 /* exit from the switch block. */
3038 continue;
3039 }
3040 else
3041 FREE_STACK_RETURN (REG_ECOLLATE);
3042 }
3043 datasize = 1;
3044 }
3045 # endif
3046 /* Throw away the ] at the end of the equivalence
3047 class (or collating symbol). */
3048 PATFETCH (c);
3049
3050 /* Allocate the space for the equivalence class
3051 (or collating symbol) (and '\0' if needed). */
3052 GET_BUFFER_SPACE(datasize);
3053 /* Update the pointer to indicate end of buffer. */
3054 b += datasize;
3055
3056 if (delim == '=')
3057 { /* equivalence class */
3058 /* Calculate the offset of char_ranges,
3059 which is next to equivalence_classes. */
3060 offset = laststart[1] + laststart[2]
3061 + laststart[3] +6;
3062 /* Insert space. */
3063 insert_space(datasize, laststart + offset, b - 1);
3064
3065 /* Write the equivalence_class and \0. */
3066 for (i = 0 ; i < datasize ; i++)
3067 laststart[offset + i] = str[i];
3068
3069 /* Update the length of equivalence_classes. */
3070 laststart[3] += datasize;
3071 had_char_class = true;
3072 }
3073 else /* delim == '.' */
3074 { /* collating symbol */
3075 /* Calculate the offset of the equivalence_classes,
3076 which is next to collating_symbols. */
3077 offset = laststart[1] + laststart[2] + 6;
3078 /* Insert space and write the collating_symbol
3079 and \0. */
3080 insert_space(datasize, laststart + offset, b-1);
3081 for (i = 0 ; i < datasize ; i++)
3082 laststart[offset + i] = str[i];
3083
3084 /* In re_match_2_internal if range_start < -1, we
3085 assume -range_start is the offset of the
3086 collating symbol which is specified as
3087 the character of the range start. So we assign
3088 -(laststart[1] + laststart[2] + 6) to
3089 range_start. */
3090 range_start = -(laststart[1] + laststart[2] + 6);
3091 /* Update the length of collating_symbol. */
3092 laststart[2] += datasize;
3093 had_char_class = false;
3094 }
3095 }
3096 else
3097 {
3098 c1++;
3099 while (c1--)
3100 PATUNFETCH;
3101 BUF_PUSH ('[');
3102 BUF_PUSH (delim);
3103 laststart[5] += 2; /* Update the length of characters */
3104 range_start = delim;
3105 had_char_class = false;
3106 }
3107 }
3108 else
3109 {
3110 had_char_class = false;
3111 BUF_PUSH(c);
3112 laststart[5]++; /* Update the length of characters */
3113 range_start = c;
3114 }
3115 }
3116
3117 #else /* BYTE */
3118 /* Ensure that we have enough space to push a charset: the
3119 opcode, the length count, and the bitset; 34 bytes in all. */
3120 GET_BUFFER_SPACE (34);
3121
3122 laststart = b;
3123
3124 /* We test `*p == '^' twice, instead of using an if
3125 statement, so we only need one BUF_PUSH. */
3126 BUF_PUSH (*p == '^' ? charset_not : charset);
3127 if (*p == '^')
3128 p++;
3129
3130 /* Remember the first position in the bracket expression. */
3131 p1 = p;
3132
3133 /* Push the number of bytes in the bitmap. */
3134 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
3135
3136 /* Clear the whole map. */
3137 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH);
3138
3139 /* charset_not matches newline according to a syntax bit. */
3140 if ((re_opcode_t) b[-2] == charset_not
3141 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
3142 SET_LIST_BIT ('\n');
3143
3144 /* Read in characters and ranges, setting map bits. */
3145 for (;;)
3146 {
3147 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3148
3149 PATFETCH (c);
3150
3151 /* \ might escape characters inside [...] and [^...]. */
3152 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
3153 {
3154 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
3155
3156 PATFETCH (c1);
3157 SET_LIST_BIT (c1);
3158 range_start = c1;
3159 continue;
3160 }
3161
3162 /* Could be the end of the bracket expression. If it's
3163 not (i.e., when the bracket expression is `[]' so
3164 far), the ']' character bit gets set way below. */
3165 if (c == ']' && p != p1 + 1)
3166 break;
3167
3168 /* Look ahead to see if it's a range when the last thing
3169 was a character class. */
3170 if (had_char_class && c == '-' && *p != ']')
3171 FREE_STACK_RETURN (REG_ERANGE);
3172
3173 /* Look ahead to see if it's a range when the last thing
3174 was a character: if this is a hyphen not at the
3175 beginning or the end of a list, then it's the range
3176 operator. */
3177 if (c == '-'
3178 && !(p - 2 >= pattern && p[-2] == '[')
3179 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
3180 && *p != ']')
3181 {
3182 reg_errcode_t ret
3183 = byte_compile_range (range_start, &p, pend, translate,
3184 syntax, b);
3185 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
3186 range_start = 0xffffffff;
3187 }
3188
3189 else if (p[0] == '-' && p[1] != ']')
3190 { /* This handles ranges made up of characters only. */
3191 reg_errcode_t ret;
3192
3193 /* Move past the `-'. */
3194 PATFETCH (c1);
3195
3196 ret = byte_compile_range (c, &p, pend, translate, syntax, b);
3197 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
3198 range_start = 0xffffffff;
3199 }
3200
3201 /* See if we're at the beginning of a possible character
3202 class. */
3203
3204 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
3205 { /* Leave room for the null. */
3206 char str[CHAR_CLASS_MAX_LENGTH + 1];
3207
3208 PATFETCH (c);
3209 c1 = 0;
3210
3211 /* If pattern is `[[:'. */
3212 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3213
3214 for (;;)
3215 {
3216 PATFETCH (c);
3217 if ((c == ':' && *p == ']') || p == pend)
3218 break;
3219 if (c1 < CHAR_CLASS_MAX_LENGTH)
3220 str[c1++] = c;
3221 else
3222 /* This is in any case an invalid class name. */
3223 str[0] = '\0';
3224 }
3225 str[c1] = '\0';
3226
3227 /* If isn't a word bracketed by `[:' and `:]':
3228 undo the ending character, the letters, and leave
3229 the leading `:' and `[' (but set bits for them). */
3230 if (c == ':' && *p == ']')
3231 {
3232 # if defined _LIBC || WIDE_CHAR_SUPPORT
3233 boolean is_lower = STREQ (str, "lower");
3234 boolean is_upper = STREQ (str, "upper");
3235 wctype_t wt;
3236 int ch;
3237
3238 wt = IS_CHAR_CLASS (str);
3239 if (wt == 0)
3240 FREE_STACK_RETURN (REG_ECTYPE);
3241
3242 /* Throw away the ] at the end of the character
3243 class. */
3244 PATFETCH (c);
3245
3246 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3247
3248 for (ch = 0; ch < 1 << BYTEWIDTH; ++ch)
3249 {
3250 # ifdef _LIBC
3251 if (__iswctype (__btowc (ch), wt))
3252 SET_LIST_BIT (ch);
3253 # else
3254 if (iswctype (btowc (ch), wt))
3255 SET_LIST_BIT (ch);
3256 # endif
3257
3258 if (translate && (is_upper || is_lower)
3259 && (ISUPPER (ch) || ISLOWER (ch)))
3260 SET_LIST_BIT (ch);
3261 }
3262
3263 had_char_class = true;
3264 # else
3265 int ch;
3266 boolean is_alnum = STREQ (str, "alnum");
3267 boolean is_alpha = STREQ (str, "alpha");
3268 boolean is_blank = STREQ (str, "blank");
3269 boolean is_cntrl = STREQ (str, "cntrl");
3270 boolean is_digit = STREQ (str, "digit");
3271 boolean is_graph = STREQ (str, "graph");
3272 boolean is_lower = STREQ (str, "lower");
3273 boolean is_print = STREQ (str, "print");
3274 boolean is_punct = STREQ (str, "punct");
3275 boolean is_space = STREQ (str, "space");
3276 boolean is_upper = STREQ (str, "upper");
3277 boolean is_xdigit = STREQ (str, "xdigit");
3278
3279 if (!IS_CHAR_CLASS (str))
3280 FREE_STACK_RETURN (REG_ECTYPE);
3281
3282 /* Throw away the ] at the end of the character
3283 class. */
3284 PATFETCH (c);
3285
3286 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3287
3288 for (ch = 0; ch < 1 << BYTEWIDTH; ch++)
3289 {
3290 /* This was split into 3 if's to
3291 avoid an arbitrary limit in some compiler. */
3292 if ( (is_alnum && ISALNUM (ch))
3293 || (is_alpha && ISALPHA (ch))
3294 || (is_blank && ISBLANK (ch))
3295 || (is_cntrl && ISCNTRL (ch)))
3296 SET_LIST_BIT (ch);
3297 if ( (is_digit && ISDIGIT (ch))
3298 || (is_graph && ISGRAPH (ch))
3299 || (is_lower && ISLOWER (ch))
3300 || (is_print && ISPRINT (ch)))
3301 SET_LIST_BIT (ch);
3302 if ( (is_punct && ISPUNCT (ch))
3303 || (is_space && ISSPACE (ch))
3304 || (is_upper && ISUPPER (ch))
3305 || (is_xdigit && ISXDIGIT (ch)))
3306 SET_LIST_BIT (ch);
3307 if ( translate && (is_upper || is_lower)
3308 && (ISUPPER (ch) || ISLOWER (ch)))
3309 SET_LIST_BIT (ch);
3310 }
3311 had_char_class = true;
3312 # endif /* libc || wctype.h */
3313 }
3314 else
3315 {
3316 c1++;
3317 while (c1--)
3318 PATUNFETCH;
3319 SET_LIST_BIT ('[');
3320 SET_LIST_BIT (':');
3321 range_start = ':';
3322 had_char_class = false;
3323 }
3324 }
3325 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == '=')
3326 {
3327 unsigned char str[MB_LEN_MAX + 1];
3328 # ifdef _LIBC
3329 uint32_t nrules =
3330 _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3331 # endif
3332
3333 PATFETCH (c);
3334 c1 = 0;
3335
3336 /* If pattern is `[[='. */
3337 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3338
3339 for (;;)
3340 {
3341 PATFETCH (c);
3342 if ((c == '=' && *p == ']') || p == pend)
3343 break;
3344 if (c1 < MB_LEN_MAX)
3345 str[c1++] = c;
3346 else
3347 /* This is in any case an invalid class name. */
3348 str[0] = '\0';
3349 }
3350 str[c1] = '\0';
3351
3352 if (c == '=' && *p == ']' && str[0] != '\0')
3353 {
3354 /* If we have no collation data we use the default
3355 collation in which each character is in a class
3356 by itself. It also means that ASCII is the
3357 character set and therefore we cannot have character
3358 with more than one byte in the multibyte
3359 representation. */
3360 # ifdef _LIBC
3361 if (nrules == 0)
3362 # endif
3363 {
3364 if (c1 != 1)
3365 FREE_STACK_RETURN (REG_ECOLLATE);
3366
3367 /* Throw away the ] at the end of the equivalence
3368 class. */
3369 PATFETCH (c);
3370
3371 /* Set the bit for the character. */
3372 SET_LIST_BIT (str[0]);
3373 }
3374 # ifdef _LIBC
3375 else
3376 {
3377 /* Try to match the byte sequence in `str' against
3378 those known to the collate implementation.
3379 First find out whether the bytes in `str' are
3380 actually from exactly one character. */
3381 const int32_t *table;
3382 const unsigned char *weights;
3383 const unsigned char *extra;
3384 const int32_t *indirect;
3385 int32_t idx;
3386 const unsigned char *cp = str;
3387 int ch;
3388
3389 /* This #include defines a local function! */
3390 # include <locale/weight.h>
3391
3392 table = (const int32_t *)
3393 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3394 weights = (const unsigned char *)
3395 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3396 extra = (const unsigned char *)
3397 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3398 indirect = (const int32_t *)
3399 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3400
3401 idx = findidx (&cp);
3402 if (idx == 0 || cp < str + c1)
3403 /* This is no valid character. */
3404 FREE_STACK_RETURN (REG_ECOLLATE);
3405
3406 /* Throw away the ] at the end of the equivalence
3407 class. */
3408 PATFETCH (c);
3409
3410 /* Now we have to go through the whole table
3411 and find all characters which have the same
3412 first level weight.
3413
3414 XXX Note that this is not entirely correct.
3415 we would have to match multibyte sequences
3416 but this is not possible with the current
3417 implementation. */
3418 for (ch = 1; ch < 256; ++ch)
3419 /* XXX This test would have to be changed if we
3420 would allow matching multibyte sequences. */
3421 if (table[ch] > 0)
3422 {
3423 int32_t idx2 = table[ch];
3424 size_t len = weights[idx2];
3425
3426 /* Test whether the lengths match. */
3427 if (weights[idx] == len)
3428 {
3429 /* They do. Now compare the bytes of
3430 the weight. */
3431 size_t cnt = 0;
3432
3433 while (cnt < len
3434 && (weights[idx + 1 + cnt]
3435 == weights[idx2 + 1 + cnt]))
3436 ++cnt;
3437
3438 if (cnt == len)
3439 /* They match. Mark the character as
3440 acceptable. */
3441 SET_LIST_BIT (ch);
3442 }
3443 }
3444 }
3445 # endif
3446 had_char_class = true;
3447 }
3448 else
3449 {
3450 c1++;
3451 while (c1--)
3452 PATUNFETCH;
3453 SET_LIST_BIT ('[');
3454 SET_LIST_BIT ('=');
3455 range_start = '=';
3456 had_char_class = false;
3457 }
3458 }
3459 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == '.')
3460 {
3461 unsigned char str[128]; /* Should be large enough. */
3462 # ifdef _LIBC
3463 uint32_t nrules =
3464 _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3465 # endif
3466
3467 PATFETCH (c);
3468 c1 = 0;
3469
3470 /* If pattern is `[[.'. */
3471 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3472
3473 for (;;)
3474 {
3475 PATFETCH (c);
3476 if ((c == '.' && *p == ']') || p == pend)
3477 break;
3478 if (c1 < sizeof (str))
3479 str[c1++] = c;
3480 else
3481 /* This is in any case an invalid class name. */
3482 str[0] = '\0';
3483 }
3484 str[c1] = '\0';
3485
3486 if (c == '.' && *p == ']' && str[0] != '\0')
3487 {
3488 /* If we have no collation data we use the default
3489 collation in which each character is the name
3490 for its own class which contains only the one
3491 character. It also means that ASCII is the
3492 character set and therefore we cannot have character
3493 with more than one byte in the multibyte
3494 representation. */
3495 # ifdef _LIBC
3496 if (nrules == 0)
3497 # endif
3498 {
3499 if (c1 != 1)
3500 FREE_STACK_RETURN (REG_ECOLLATE);
3501
3502 /* Throw away the ] at the end of the collating
3503 symbol. */
3504 PATFETCH (c);
3505
3506 /* Set the bit for the character. */
3507 SET_LIST_BIT (str[0]);
3508 range_start = ((const unsigned char *) str)[0];
3509 }
3510 # ifdef _LIBC
3511 else
3512 {
3513 /* Try to match the byte sequence in `str' against
3514 those known to the collate implementation.
3515 First find out whether the bytes in `str' are
3516 actually from exactly one character. */
3517 int32_t table_size;
3518 const int32_t *symb_table;
3519 const unsigned char *extra;
3520 int32_t idx;
3521 int32_t elem;
3522 int32_t second;
3523 int32_t hash;
3524
3525 table_size =
3526 _NL_CURRENT_WORD (LC_COLLATE,
3527 _NL_COLLATE_SYMB_HASH_SIZEMB);
3528 symb_table = (const int32_t *)
3529 _NL_CURRENT (LC_COLLATE,
3530 _NL_COLLATE_SYMB_TABLEMB);
3531 extra = (const unsigned char *)
3532 _NL_CURRENT (LC_COLLATE,
3533 _NL_COLLATE_SYMB_EXTRAMB);
3534
3535 /* Locate the character in the hashing table. */
3536 hash = elem_hash (str, c1);
3537
3538 idx = 0;
3539 elem = hash % table_size;
3540 second = hash % (table_size - 2);
3541 while (symb_table[2 * elem] != 0)
3542 {
3543 /* First compare the hashing value. */
3544 if (symb_table[2 * elem] == hash
3545 && c1 == extra[symb_table[2 * elem + 1]]
3546 && memcmp (str,
3547 &extra[symb_table[2 * elem + 1]
3548 + 1],
3549 c1) == 0)
3550 {
3551 /* Yep, this is the entry. */
3552 idx = symb_table[2 * elem + 1];
3553 idx += 1 + extra[idx];
3554 break;
3555 }
3556
3557 /* Next entry. */
3558 elem += second;
3559 }
3560
3561 if (symb_table[2 * elem] == 0)
3562 /* This is no valid character. */
3563 FREE_STACK_RETURN (REG_ECOLLATE);
3564
3565 /* Throw away the ] at the end of the collating
3566 symbol. */
3567 PATFETCH (c);
3568
3569 /* Now add the multibyte character(s) we found
3570 to the accept list.
3571
3572 XXX Note that this is not entirely correct.
3573 we would have to match multibyte sequences
3574 but this is not possible with the current
3575 implementation. Also, we have to match
3576 collating symbols, which expand to more than
3577 one file, as a whole and not allow the
3578 individual bytes. */
3579 c1 = extra[idx++];
3580 if (c1 == 1)
3581 range_start = extra[idx];
3582 while (c1-- > 0)
3583 {
3584 SET_LIST_BIT (extra[idx]);
3585 ++idx;
3586 }
3587 }
3588 # endif
3589 had_char_class = false;
3590 }
3591 else
3592 {
3593 c1++;
3594 while (c1--)
3595 PATUNFETCH;
3596 SET_LIST_BIT ('[');
3597 SET_LIST_BIT ('.');
3598 range_start = '.';
3599 had_char_class = false;
3600 }
3601 }
3602 else
3603 {
3604 had_char_class = false;
3605 SET_LIST_BIT (c);
3606 range_start = c;
3607 }
3608 }
3609
3610 /* Discard any (non)matching list bytes that are all 0 at the
3611 end of the map. Decrease the map-length byte too. */
3612 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
3613 b[-1]--;
3614 b += b[-1];
3615 #endif /* WCHAR */
3616 }
3617 break;
3618
3619
3620 case '(':
3621 if (syntax & RE_NO_BK_PARENS)
3622 goto handle_open;
3623 else
3624 goto normal_char;
3625
3626
3627 case ')':
3628 if (syntax & RE_NO_BK_PARENS)
3629 goto handle_close;
3630 else
3631 goto normal_char;
3632
3633
3634 case '\n':
3635 if (syntax & RE_NEWLINE_ALT)
3636 goto handle_alt;
3637 else
3638 goto normal_char;
3639
3640
3641 case '|':
3642 if (syntax & RE_NO_BK_VBAR)
3643 goto handle_alt;
3644 else
3645 goto normal_char;
3646
3647
3648 case '{':
3649 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
3650 goto handle_interval;
3651 else
3652 goto normal_char;
3653
3654
3655 case '\\':
3656 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
3657
3658 /* Do not translate the character after the \, so that we can
3659 distinguish, e.g., \B from \b, even if we normally would
3660 translate, e.g., B to b. */
3661 PATFETCH_RAW (c);
3662
3663 switch (c)
3664 {
3665 case '(':
3666 if (syntax & RE_NO_BK_PARENS)
3667 goto normal_backslash;
3668
3669 handle_open:
3670 bufp->re_nsub++;
3671 regnum++;
3672
3673 if (COMPILE_STACK_FULL)
3674 {
3675 RETALLOC (compile_stack.stack, compile_stack.size << 1,
3676 compile_stack_elt_t);
3677 if (compile_stack.stack == NULL) return REG_ESPACE;
3678
3679 compile_stack.size <<= 1;
3680 }
3681
3682 /* These are the values to restore when we hit end of this
3683 group. They are all relative offsets, so that if the
3684 whole pattern moves because of realloc, they will still
3685 be valid. */
3686 COMPILE_STACK_TOP.begalt_offset = begalt - COMPILED_BUFFER_VAR;
3687 COMPILE_STACK_TOP.fixup_alt_jump
3688 = fixup_alt_jump ? fixup_alt_jump - COMPILED_BUFFER_VAR + 1 : 0;
3689 COMPILE_STACK_TOP.laststart_offset = b - COMPILED_BUFFER_VAR;
3690 COMPILE_STACK_TOP.regnum = regnum;
3691
3692 /* We will eventually replace the 0 with the number of
3693 groups inner to this one. But do not push a
3694 start_memory for groups beyond the last one we can
3695 represent in the compiled pattern. */
3696 if (regnum <= MAX_REGNUM)
3697 {
3698 COMPILE_STACK_TOP.inner_group_offset = b
3699 - COMPILED_BUFFER_VAR + 2;
3700 BUF_PUSH_3 (start_memory, regnum, 0);
3701 }
3702
3703 compile_stack.avail++;
3704
3705 fixup_alt_jump = 0;
3706 laststart = 0;
3707 begalt = b;
3708 /* If we've reached MAX_REGNUM groups, then this open
3709 won't actually generate any code, so we'll have to
3710 clear pending_exact explicitly. */
3711 pending_exact = 0;
3712 break;
3713
3714
3715 case ')':
3716 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
3717
3718 if (COMPILE_STACK_EMPTY)
3719 {
3720 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3721 goto normal_backslash;
3722 else
3723 FREE_STACK_RETURN (REG_ERPAREN);
3724 }
3725
3726 handle_close:
3727 if (fixup_alt_jump)
3728 { /* Push a dummy failure point at the end of the
3729 alternative for a possible future
3730 `pop_failure_jump' to pop. See comments at
3731 `push_dummy_failure' in `re_match_2'. */
3732 BUF_PUSH (push_dummy_failure);
3733
3734 /* We allocated space for this jump when we assigned
3735 to `fixup_alt_jump', in the `handle_alt' case below. */
3736 STORE_JUMP (jump_past_alt, fixup_alt_jump, b - 1);
3737 }
3738
3739 /* See similar code for backslashed left paren above. */
3740 if (COMPILE_STACK_EMPTY)
3741 {
3742 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3743 goto normal_char;
3744 else
3745 FREE_STACK_RETURN (REG_ERPAREN);
3746 }
3747
3748 /* Since we just checked for an empty stack above, this
3749 ``can't happen''. */
3750 assert (compile_stack.avail != 0);
3751 {
3752 /* We don't just want to restore into `regnum', because
3753 later groups should continue to be numbered higher,
3754 as in `(ab)c(de)' -- the second group is #2. */
3755 regnum_t this_group_regnum;
3756
3757 compile_stack.avail--;
3758 begalt = COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.begalt_offset;
3759 fixup_alt_jump
3760 = COMPILE_STACK_TOP.fixup_alt_jump
3761 ? COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.fixup_alt_jump - 1
3762 : 0;
3763 laststart = COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.laststart_offset;
3764 this_group_regnum = COMPILE_STACK_TOP.regnum;
3765 /* If we've reached MAX_REGNUM groups, then this open
3766 won't actually generate any code, so we'll have to
3767 clear pending_exact explicitly. */
3768 pending_exact = 0;
3769
3770 /* We're at the end of the group, so now we know how many
3771 groups were inside this one. */
3772 if (this_group_regnum <= MAX_REGNUM)
3773 {
3774 UCHAR_T *inner_group_loc
3775 = COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.inner_group_offset;
3776
3777 *inner_group_loc = regnum - this_group_regnum;
3778 BUF_PUSH_3 (stop_memory, this_group_regnum,
3779 regnum - this_group_regnum);
3780 }
3781 }
3782 break;
3783
3784
3785 case '|': /* `\|'. */
3786 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
3787 goto normal_backslash;
3788 handle_alt:
3789 if (syntax & RE_LIMITED_OPS)
3790 goto normal_char;
3791
3792 /* Insert before the previous alternative a jump which
3793 jumps to this alternative if the former fails. */
3794 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
3795 INSERT_JUMP (on_failure_jump, begalt,
3796 b + 2 + 2 * OFFSET_ADDRESS_SIZE);
3797 pending_exact = 0;
3798 b += 1 + OFFSET_ADDRESS_SIZE;
3799
3800 /* The alternative before this one has a jump after it
3801 which gets executed if it gets matched. Adjust that
3802 jump so it will jump to this alternative's analogous
3803 jump (put in below, which in turn will jump to the next
3804 (if any) alternative's such jump, etc.). The last such
3805 jump jumps to the correct final destination. A picture:
3806 _____ _____
3807 | | | |
3808 | v | v
3809 a | b | c
3810
3811 If we are at `b', then fixup_alt_jump right now points to a
3812 three-byte space after `a'. We'll put in the jump, set
3813 fixup_alt_jump to right after `b', and leave behind three
3814 bytes which we'll fill in when we get to after `c'. */
3815
3816 if (fixup_alt_jump)
3817 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
3818
3819 /* Mark and leave space for a jump after this alternative,
3820 to be filled in later either by next alternative or
3821 when know we're at the end of a series of alternatives. */
3822 fixup_alt_jump = b;
3823 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
3824 b += 1 + OFFSET_ADDRESS_SIZE;
3825
3826 laststart = 0;
3827 begalt = b;
3828 break;
3829
3830
3831 case '{':
3832 /* If \{ is a literal. */
3833 if (!(syntax & RE_INTERVALS)
3834 /* If we're at `\{' and it's not the open-interval
3835 operator. */
3836 || (syntax & RE_NO_BK_BRACES))
3837 goto normal_backslash;
3838
3839 handle_interval:
3840 {
3841 /* If got here, then the syntax allows intervals. */
3842
3843 /* At least (most) this many matches must be made. */
3844 int lower_bound = -1, upper_bound = -1;
3845
3846 /* Place in the uncompiled pattern (i.e., just after
3847 the '{') to go back to if the interval is invalid. */
3848 const CHAR_T *beg_interval = p;
3849
3850 if (p == pend)
3851 goto invalid_interval;
3852
3853 GET_UNSIGNED_NUMBER (lower_bound);
3854
3855 if (c == ',')
3856 {
3857 GET_UNSIGNED_NUMBER (upper_bound);
3858 if (upper_bound < 0)
3859 upper_bound = RE_DUP_MAX;
3860 }
3861 else
3862 /* Interval such as `{1}' => match exactly once. */
3863 upper_bound = lower_bound;
3864
3865 if (! (0 <= lower_bound && lower_bound <= upper_bound))
3866 goto invalid_interval;
3867
3868 if (!(syntax & RE_NO_BK_BRACES))
3869 {
3870 if (c != '\\' || p == pend)
3871 goto invalid_interval;
3872 PATFETCH (c);
3873 }
3874
3875 if (c != '}')
3876 goto invalid_interval;
3877
3878 /* If it's invalid to have no preceding re. */
3879 if (!laststart)
3880 {
3881 if (syntax & RE_CONTEXT_INVALID_OPS
3882 && !(syntax & RE_INVALID_INTERVAL_ORD))
3883 FREE_STACK_RETURN (REG_BADRPT);
3884 else if (syntax & RE_CONTEXT_INDEP_OPS)
3885 laststart = b;
3886 else
3887 goto unfetch_interval;
3888 }
3889
3890 /* We just parsed a valid interval. */
3891
3892 if (RE_DUP_MAX < upper_bound)
3893 FREE_STACK_RETURN (REG_BADBR);
3894
3895 /* If the upper bound is zero, don't want to succeed at
3896 all; jump from `laststart' to `b + 3', which will be
3897 the end of the buffer after we insert the jump. */
3898 /* ifdef WCHAR, 'b + 1 + OFFSET_ADDRESS_SIZE'
3899 instead of 'b + 3'. */
3900 if (upper_bound == 0)
3901 {
3902 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
3903 INSERT_JUMP (jump, laststart, b + 1
3904 + OFFSET_ADDRESS_SIZE);
3905 b += 1 + OFFSET_ADDRESS_SIZE;
3906 }
3907
3908 /* Otherwise, we have a nontrivial interval. When
3909 we're all done, the pattern will look like:
3910 set_number_at <jump count> <upper bound>
3911 set_number_at <succeed_n count> <lower bound>
3912 succeed_n <after jump addr> <succeed_n count>
3913 <body of loop>
3914 jump_n <succeed_n addr> <jump count>
3915 (The upper bound and `jump_n' are omitted if
3916 `upper_bound' is 1, though.) */
3917 else
3918 { /* If the upper bound is > 1, we need to insert
3919 more at the end of the loop. */
3920 unsigned nbytes = 2 + 4 * OFFSET_ADDRESS_SIZE +
3921 (upper_bound > 1) * (2 + 4 * OFFSET_ADDRESS_SIZE);
3922
3923 GET_BUFFER_SPACE (nbytes);
3924
3925 /* Initialize lower bound of the `succeed_n', even
3926 though it will be set during matching by its
3927 attendant `set_number_at' (inserted next),
3928 because `re_compile_fastmap' needs to know.
3929 Jump to the `jump_n' we might insert below. */
3930 INSERT_JUMP2 (succeed_n, laststart,
3931 b + 1 + 2 * OFFSET_ADDRESS_SIZE
3932 + (upper_bound > 1) * (1 + 2 * OFFSET_ADDRESS_SIZE)
3933 , lower_bound);
3934 b += 1 + 2 * OFFSET_ADDRESS_SIZE;
3935
3936 /* Code to initialize the lower bound. Insert
3937 before the `succeed_n'. The `5' is the last two
3938 bytes of this `set_number_at', plus 3 bytes of
3939 the following `succeed_n'. */
3940 /* ifdef WCHAR, The '1+2*OFFSET_ADDRESS_SIZE'
3941 is the 'set_number_at', plus '1+OFFSET_ADDRESS_SIZE'
3942 of the following `succeed_n'. */
3943 PREFIX(insert_op2) (set_number_at, laststart, 1
3944 + 2 * OFFSET_ADDRESS_SIZE, lower_bound, b);
3945 b += 1 + 2 * OFFSET_ADDRESS_SIZE;
3946
3947 if (upper_bound > 1)
3948 { /* More than one repetition is allowed, so
3949 append a backward jump to the `succeed_n'
3950 that starts this interval.
3951
3952 When we've reached this during matching,
3953 we'll have matched the interval once, so
3954 jump back only `upper_bound - 1' times. */
3955 STORE_JUMP2 (jump_n, b, laststart
3956 + 2 * OFFSET_ADDRESS_SIZE + 1,
3957 upper_bound - 1);
3958 b += 1 + 2 * OFFSET_ADDRESS_SIZE;
3959
3960 /* The location we want to set is the second
3961 parameter of the `jump_n'; that is `b-2' as
3962 an absolute address. `laststart' will be
3963 the `set_number_at' we're about to insert;
3964 `laststart+3' the number to set, the source
3965 for the relative address. But we are
3966 inserting into the middle of the pattern --
3967 so everything is getting moved up by 5.
3968 Conclusion: (b - 2) - (laststart + 3) + 5,
3969 i.e., b - laststart.
3970
3971 We insert this at the beginning of the loop
3972 so that if we fail during matching, we'll
3973 reinitialize the bounds. */
3974 PREFIX(insert_op2) (set_number_at, laststart,
3975 b - laststart,
3976 upper_bound - 1, b);
3977 b += 1 + 2 * OFFSET_ADDRESS_SIZE;
3978 }
3979 }
3980 pending_exact = 0;
3981 break;
3982
3983 invalid_interval:
3984 if (!(syntax & RE_INVALID_INTERVAL_ORD))
3985 FREE_STACK_RETURN (p == pend ? REG_EBRACE : REG_BADBR);
3986 unfetch_interval:
3987 /* Match the characters as literals. */
3988 p = beg_interval;
3989 c = '{';
3990 if (syntax & RE_NO_BK_BRACES)
3991 goto normal_char;
3992 else
3993 goto normal_backslash;
3994 }
3995
3996 #ifdef emacs
3997 /* There is no way to specify the before_dot and after_dot
3998 operators. rms says this is ok. --karl */
3999 case '=':
4000 BUF_PUSH (at_dot);
4001 break;
4002
4003 case 's':
4004 laststart = b;
4005 PATFETCH (c);
4006 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
4007 break;
4008
4009 case 'S':
4010 laststart = b;
4011 PATFETCH (c);
4012 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
4013 break;
4014 #endif /* emacs */
4015
4016
4017 case 'w':
4018 if (syntax & RE_NO_GNU_OPS)
4019 goto normal_char;
4020 laststart = b;
4021 BUF_PUSH (wordchar);
4022 break;
4023
4024
4025 case 'W':
4026 if (syntax & RE_NO_GNU_OPS)
4027 goto normal_char;
4028 laststart = b;
4029 BUF_PUSH (notwordchar);
4030 break;
4031
4032
4033 case '<':
4034 if (syntax & RE_NO_GNU_OPS)
4035 goto normal_char;
4036 BUF_PUSH (wordbeg);
4037 break;
4038
4039 case '>':
4040 if (syntax & RE_NO_GNU_OPS)
4041 goto normal_char;
4042 BUF_PUSH (wordend);
4043 break;
4044
4045 case 'b':
4046 if (syntax & RE_NO_GNU_OPS)
4047 goto normal_char;
4048 BUF_PUSH (wordbound);
4049 break;
4050
4051 case 'B':
4052 if (syntax & RE_NO_GNU_OPS)
4053 goto normal_char;
4054 BUF_PUSH (notwordbound);
4055 break;
4056
4057 case '`':
4058 if (syntax & RE_NO_GNU_OPS)
4059 goto normal_char;
4060 BUF_PUSH (begbuf);
4061 break;
4062
4063 case '\'':
4064 if (syntax & RE_NO_GNU_OPS)
4065 goto normal_char;
4066 BUF_PUSH (endbuf);
4067 break;
4068
4069 case '1': case '2': case '3': case '4': case '5':
4070 case '6': case '7': case '8': case '9':
4071 if (syntax & RE_NO_BK_REFS)
4072 goto normal_char;
4073
4074 c1 = c - '0';
4075
4076 if (c1 > regnum)
4077 FREE_STACK_RETURN (REG_ESUBREG);
4078
4079 /* Can't back reference to a subexpression if inside of it. */
4080 if (group_in_compile_stack (compile_stack, (regnum_t) c1))
4081 goto normal_char;
4082
4083 laststart = b;
4084 BUF_PUSH_2 (duplicate, c1);
4085 break;
4086
4087
4088 case '+':
4089 case '?':
4090 if (syntax & RE_BK_PLUS_QM)
4091 goto handle_plus;
4092 else
4093 goto normal_backslash;
4094
4095 default:
4096 normal_backslash:
4097 /* You might think it would be useful for \ to mean
4098 not to translate; but if we don't translate it
4099 it will never match anything. */
4100 c = TRANSLATE (c);
4101 goto normal_char;
4102 }
4103 break;
4104
4105
4106 default:
4107 /* Expects the character in `c'. */
4108 normal_char:
4109 /* If no exactn currently being built. */
4110 if (!pending_exact
4111 #ifdef WCHAR
4112 /* If last exactn handle binary(or character) and
4113 new exactn handle character(or binary). */
4114 || is_exactn_bin != is_binary[p - 1 - pattern]
4115 #endif /* WCHAR */
4116
4117 /* If last exactn not at current position. */
4118 || pending_exact + *pending_exact + 1 != b
4119
4120 /* We have only one byte following the exactn for the count. */
4121 || *pending_exact == (1 << BYTEWIDTH) - 1
4122
4123 /* If followed by a repetition operator. */
4124 || *p == '*' || *p == '^'
4125 || ((syntax & RE_BK_PLUS_QM)
4126 ? *p == '\\' && (p[1] == '+' || p[1] == '?')
4127 : (*p == '+' || *p == '?'))
4128 || ((syntax & RE_INTERVALS)
4129 && ((syntax & RE_NO_BK_BRACES)
4130 ? *p == '{'
4131 : (p[0] == '\\' && p[1] == '{'))))
4132 {
4133 /* Start building a new exactn. */
4134
4135 laststart = b;
4136
4137 #ifdef WCHAR
4138 /* Is this exactn binary data or character? */
4139 is_exactn_bin = is_binary[p - 1 - pattern];
4140 if (is_exactn_bin)
4141 BUF_PUSH_2 (exactn_bin, 0);
4142 else
4143 BUF_PUSH_2 (exactn, 0);
4144 #else
4145 BUF_PUSH_2 (exactn, 0);
4146 #endif /* WCHAR */
4147 pending_exact = b - 1;
4148 }
4149
4150 BUF_PUSH (c);
4151 (*pending_exact)++;
4152 break;
4153 } /* switch (c) */
4154 } /* while p != pend */
4155
4156
4157 /* Through the pattern now. */
4158
4159 if (fixup_alt_jump)
4160 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
4161
4162 if (!COMPILE_STACK_EMPTY)
4163 FREE_STACK_RETURN (REG_EPAREN);
4164
4165 /* If we don't want backtracking, force success
4166 the first time we reach the end of the compiled pattern. */
4167 if (syntax & RE_NO_POSIX_BACKTRACKING)
4168 BUF_PUSH (succeed);
4169
4170 #ifdef WCHAR
4171 free (pattern);
4172 free (mbs_offset);
4173 free (is_binary);
4174 #endif
4175 free (compile_stack.stack);
4176
4177 /* We have succeeded; set the length of the buffer. */
4178 #ifdef WCHAR
4179 bufp->used = (uintptr_t) b - (uintptr_t) COMPILED_BUFFER_VAR;
4180 #else
4181 bufp->used = b - bufp->buffer;
4182 #endif
4183
4184 #ifdef DEBUG
4185 if (debug)
4186 {
4187 DEBUG_PRINT1 ("\nCompiled pattern: \n");
4188 PREFIX(print_compiled_pattern) (bufp);
4189 }
4190 #endif /* DEBUG */
4191
4192 #ifndef MATCH_MAY_ALLOCATE
4193 /* Initialize the failure stack to the largest possible stack. This
4194 isn't necessary unless we're trying to avoid calling alloca in
4195 the search and match routines. */
4196 {
4197 int num_regs = bufp->re_nsub + 1;
4198
4199 /* Since DOUBLE_FAIL_STACK refuses to double only if the current size
4200 is strictly greater than re_max_failures, the largest possible stack
4201 is 2 * re_max_failures failure points. */
4202 if (fail_stack.size < (2 * re_max_failures * MAX_FAILURE_ITEMS))
4203 {
4204 fail_stack.size = (2 * re_max_failures * MAX_FAILURE_ITEMS);
4205
4206 # ifdef emacs
4207 if (! fail_stack.stack)
4208 fail_stack.stack
4209 = (PREFIX(fail_stack_elt_t) *) xmalloc (fail_stack.size
4210 * sizeof (PREFIX(fail_stack_elt_t)));
4211 else
4212 fail_stack.stack
4213 = (PREFIX(fail_stack_elt_t) *) xrealloc (fail_stack.stack,
4214 (fail_stack.size
4215 * sizeof (PREFIX(fail_stack_elt_t))));
4216 # else /* not emacs */
4217 if (! fail_stack.stack)
4218 fail_stack.stack
4219 = (PREFIX(fail_stack_elt_t) *) malloc (fail_stack.size
4220 * sizeof (PREFIX(fail_stack_elt_t)));
4221 else
4222 fail_stack.stack
4223 = (PREFIX(fail_stack_elt_t) *) realloc (fail_stack.stack,
4224 (fail_stack.size
4225 * sizeof (PREFIX(fail_stack_elt_t))));
4226 # endif /* not emacs */
4227 }
4228
4229 PREFIX(regex_grow_registers) (num_regs);
4230 }
4231 #endif /* not MATCH_MAY_ALLOCATE */
4232
4233 return REG_NOERROR;
4234 } /* regex_compile */
4235
4236 /* Subroutines for `regex_compile'. */
4237
4238 /* Store OP at LOC followed by two-byte integer parameter ARG. */
4239 /* ifdef WCHAR, integer parameter is 1 wchar_t. */
4240
4241 static void
4242 PREFIX(store_op1) (re_opcode_t op, UCHAR_T *loc, int arg)
4243 {
4244 *loc = (UCHAR_T) op;
4245 STORE_NUMBER (loc + 1, arg);
4246 }
4247
4248
4249 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
4250 /* ifdef WCHAR, integer parameter is 1 wchar_t. */
4251
4252 static void
4253 PREFIX(store_op2) (re_opcode_t op, UCHAR_T *loc, int arg1, int arg2)
4254 {
4255 *loc = (UCHAR_T) op;
4256 STORE_NUMBER (loc + 1, arg1);
4257 STORE_NUMBER (loc + 1 + OFFSET_ADDRESS_SIZE, arg2);
4258 }
4259
4260
4261 /* Copy the bytes from LOC to END to open up three bytes of space at LOC
4262 for OP followed by two-byte integer parameter ARG. */
4263 /* ifdef WCHAR, integer parameter is 1 wchar_t. */
4264
4265 static void
4266 PREFIX(insert_op1) (re_opcode_t op, UCHAR_T *loc, int arg, UCHAR_T *end)
4267 {
4268 register UCHAR_T *pfrom = end;
4269 register UCHAR_T *pto = end + 1 + OFFSET_ADDRESS_SIZE;
4270
4271 while (pfrom != loc)
4272 *--pto = *--pfrom;
4273
4274 PREFIX(store_op1) (op, loc, arg);
4275 }
4276
4277
4278 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
4279 /* ifdef WCHAR, integer parameter is 1 wchar_t. */
4280
4281 static void
4282 PREFIX(insert_op2) (re_opcode_t op, UCHAR_T *loc, int arg1,
4283 int arg2, UCHAR_T *end)
4284 {
4285 register UCHAR_T *pfrom = end;
4286 register UCHAR_T *pto = end + 1 + 2 * OFFSET_ADDRESS_SIZE;
4287
4288 while (pfrom != loc)
4289 *--pto = *--pfrom;
4290
4291 PREFIX(store_op2) (op, loc, arg1, arg2);
4292 }
4293
4294
4295 /* P points to just after a ^ in PATTERN. Return true if that ^ comes
4296 after an alternative or a begin-subexpression. We assume there is at
4297 least one character before the ^. */
4298
4299 static boolean
4300 PREFIX(at_begline_loc_p) (const CHAR_T *pattern, const CHAR_T *p,
4301 reg_syntax_t syntax)
4302 {
4303 const CHAR_T *prev = p - 2;
4304 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
4305
4306 return
4307 /* After a subexpression? */
4308 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
4309 /* After an alternative? */
4310 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash));
4311 }
4312
4313
4314 /* The dual of at_begline_loc_p. This one is for $. We assume there is
4315 at least one character after the $, i.e., `P < PEND'. */
4316
4317 static boolean
4318 PREFIX(at_endline_loc_p) (const CHAR_T *p, const CHAR_T *pend,
4319 reg_syntax_t syntax)
4320 {
4321 const CHAR_T *next = p;
4322 boolean next_backslash = *next == '\\';
4323 const CHAR_T *next_next = p + 1 < pend ? p + 1 : 0;
4324
4325 return
4326 /* Before a subexpression? */
4327 (syntax & RE_NO_BK_PARENS ? *next == ')'
4328 : next_backslash && next_next && *next_next == ')')
4329 /* Before an alternative? */
4330 || (syntax & RE_NO_BK_VBAR ? *next == '|'
4331 : next_backslash && next_next && *next_next == '|');
4332 }
4333
4334 #else /* not INSIDE_RECURSION */
4335
4336 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and
4337 false if it's not. */
4338
4339 static boolean
4340 group_in_compile_stack (compile_stack_type compile_stack, regnum_t regnum)
4341 {
4342 int this_element;
4343
4344 for (this_element = compile_stack.avail - 1;
4345 this_element >= 0;
4346 this_element--)
4347 if (compile_stack.stack[this_element].regnum == regnum)
4348 return true;
4349
4350 return false;
4351 }
4352 #endif /* not INSIDE_RECURSION */
4353
4354 #ifdef INSIDE_RECURSION
4355
#ifdef WCHAR
/* Open a gap of NUM elements at LOC: every element in [LOC, END - NUM]
   is moved up by NUM positions, so that the last one lands on END.
   END must therefore be the last element that may be written.  */
static void
insert_space (int num, CHAR_T *loc, CHAR_T *end)
{
  CHAR_T *src = end - num;
  CHAR_T *dst = end;

  /* The regions overlap, so copy from the top downwards.  */
  for (; src >= loc; --src, --dst)
    *dst = *src;
}
#endif /* WCHAR */
4369
4370 #ifdef WCHAR
4371 static reg_errcode_t
4372 wcs_compile_range (CHAR_T range_start_char, const CHAR_T **p_ptr,
4373 const CHAR_T *pend, RE_TRANSLATE_TYPE translate,
4374 reg_syntax_t syntax, CHAR_T *b, CHAR_T *char_set)
4375 {
4376 const CHAR_T *p = *p_ptr;
4377 CHAR_T range_start, range_end;
4378 reg_errcode_t ret;
4379 # ifdef _LIBC
4380 uint32_t nrules;
4381 uint32_t start_val, end_val;
4382 # endif
4383 if (p == pend)
4384 return REG_ERANGE;
4385
4386 # ifdef _LIBC
4387 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
4388 if (nrules != 0)
4389 {
4390 const char *collseq = (const char *) _NL_CURRENT(LC_COLLATE,
4391 _NL_COLLATE_COLLSEQWC);
4392 const unsigned char *extra = (const unsigned char *)
4393 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
4394
4395 if (range_start_char < -1)
4396 {
4397 /* range_start is a collating symbol. */
4398 int32_t *wextra;
4399 /* Retreive the index and get collation sequence value. */
4400 wextra = (int32_t*)(extra + char_set[-range_start_char]);
4401 start_val = wextra[1 + *wextra];
4402 }
4403 else
4404 start_val = collseq_table_lookup(collseq, TRANSLATE(range_start_char));
4405
4406 end_val = collseq_table_lookup (collseq, TRANSLATE (p[0]));
4407
4408 /* Report an error if the range is empty and the syntax prohibits
4409 this. */
4410 ret = ((syntax & RE_NO_EMPTY_RANGES)
4411 && (start_val > end_val))? REG_ERANGE : REG_NOERROR;
4412
4413 /* Insert space to the end of the char_ranges. */
4414 insert_space(2, b - char_set[5] - 2, b - 1);
4415 *(b - char_set[5] - 2) = (wchar_t)start_val;
4416 *(b - char_set[5] - 1) = (wchar_t)end_val;
4417 char_set[4]++; /* ranges_index */
4418 }
4419 else
4420 # endif
4421 {
4422 range_start = (range_start_char >= 0)? TRANSLATE (range_start_char):
4423 range_start_char;
4424 range_end = TRANSLATE (p[0]);
4425 /* Report an error if the range is empty and the syntax prohibits
4426 this. */
4427 ret = ((syntax & RE_NO_EMPTY_RANGES)
4428 && (range_start > range_end))? REG_ERANGE : REG_NOERROR;
4429
4430 /* Insert space to the end of the char_ranges. */
4431 insert_space(2, b - char_set[5] - 2, b - 1);
4432 *(b - char_set[5] - 2) = range_start;
4433 *(b - char_set[5] - 1) = range_end;
4434 char_set[4]++; /* ranges_index */
4435 }
4436 /* Have to increment the pointer into the pattern string, so the
4437 caller isn't still at the ending character. */
4438 (*p_ptr)++;
4439
4440 return ret;
4441 }
4442 #else /* BYTE */
4443 /* Read the ending character of a range (in a bracket expression) from the
4444 uncompiled pattern *P_PTR (which ends at PEND). We assume the
4445 starting character is in `P[-2]'. (`P[-1]' is the character `-'.)
4446 Then we set the translation of all bits between the starting and
4447 ending characters (inclusive) in the compiled pattern B.
4448
4449 Return an error code.
4450
4451 We use these short variable names so we can use the same macros as
4452 `regex_compile' itself. */
4453
4454 static reg_errcode_t
4455 byte_compile_range (unsigned int range_start_char, const char **p_ptr,
4456 const char *pend, RE_TRANSLATE_TYPE translate,
4457 reg_syntax_t syntax, unsigned char *b)
4458 {
4459 unsigned this_char;
4460 const char *p = *p_ptr;
4461 reg_errcode_t ret;
4462 # if _LIBC
4463 const unsigned char *collseq;
4464 unsigned int start_colseq;
4465 unsigned int end_colseq;
4466 # else
4467 unsigned end_char;
4468 # endif
4469
4470 if (p == pend)
4471 return REG_ERANGE;
4472
4473 /* Have to increment the pointer into the pattern string, so the
4474 caller isn't still at the ending character. */
4475 (*p_ptr)++;
4476
4477 /* Report an error if the range is empty and the syntax prohibits this. */
4478 ret = syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
4479
4480 # if _LIBC
4481 collseq = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
4482 _NL_COLLATE_COLLSEQMB);
4483
4484 start_colseq = collseq[(unsigned char) TRANSLATE (range_start_char)];
4485 end_colseq = collseq[(unsigned char) TRANSLATE (p[0])];
4486 for (this_char = 0; this_char <= (unsigned char) -1; ++this_char)
4487 {
4488 unsigned int this_colseq = collseq[(unsigned char) TRANSLATE (this_char)];
4489
4490 if (start_colseq <= this_colseq && this_colseq <= end_colseq)
4491 {
4492 SET_LIST_BIT (TRANSLATE (this_char));
4493 ret = REG_NOERROR;
4494 }
4495 }
4496 # else
4497 /* Here we see why `this_char' has to be larger than an `unsigned
4498 char' -- we would otherwise go into an infinite loop, since all
4499 characters <= 0xff. */
4500 range_start_char = TRANSLATE (range_start_char);
4501 /* TRANSLATE(p[0]) is casted to char (not unsigned char) in TRANSLATE,
4502 and some compilers cast it to int implicitly, so following for_loop
4503 may fall to (almost) infinite loop.
4504 e.g. If translate[p[0]] = 0xff, end_char may equals to 0xffffffff.
4505 To avoid this, we cast p[0] to unsigned int and truncate it. */
4506 end_char = ((unsigned)TRANSLATE(p[0]) & ((1 << BYTEWIDTH) - 1));
4507
4508 for (this_char = range_start_char; this_char <= end_char; ++this_char)
4509 {
4510 SET_LIST_BIT (TRANSLATE (this_char));
4511 ret = REG_NOERROR;
4512 }
4513 # endif
4514
4515 return ret;
4516 }
4517 #endif /* WCHAR */
4518
4519 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
4521 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
4522 characters can start a string that matches the pattern. This fastmap
4523 is used by re_search to skip quickly over impossible starting points.
4524
4525 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
4526 area as BUFP->fastmap.
4527
4528 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
4529 the pattern buffer.
4530
4531 Returns 0 if we succeed, -2 if an internal error. */
4532
#ifdef WCHAR
/* Local helper for re_compile_fastmap: reduce the wide character C to a
   single byte that can be used as a fastmap index.  C is converted to
   its multibyte form starting from the initial shift state and the
   first byte of that form is returned; if the conversion fails, C
   itself truncated to unsigned char is returned instead.  */
static unsigned char truncate_wchar (CHAR_T c);

static unsigned char
truncate_wchar (CHAR_T c)
{
  /* wcrtomb writes through a `char *', so the buffer is plain char and
     the byte is converted to unsigned char only when it is returned.  */
  char buf[MB_CUR_MAX];
  mbstate_t state;
  int retval;

  memset (&state, '\0', sizeof (state));
# ifdef _LIBC
  retval = __wcrtomb (buf, c, &state);
# else
  retval = wcrtomb (buf, c, &state);
# endif
  return retval > 0 ? (unsigned char) buf[0] : (unsigned char) c;
}
#endif /* WCHAR */
4553
/* Worker behind re_compile_fastmap (one instance per PREFIX).

   Clears BUFP->fastmap, sets BUFP->fastmap_accurate to 1 and
   BUFP->can_be_null to 0, then walks the compiled pattern from
   BUFP->buffer up to BUFP->used.  Opcodes that consume a character mark
   the corresponding fastmap entries and end the current path; opcodes
   that match the empty string are stepped over.  Alternative paths are
   remembered on a local failure stack by the on_failure_jump family and
   are resumed when the current path is finished.

   Returns 0 on completion, or -2 if pushing onto the failure stack
   fails.  */
4554 static int
4555 PREFIX(re_compile_fastmap) (struct re_pattern_buffer *bufp)
4556 {
4557 int j, k;
4558 #ifdef MATCH_MAY_ALLOCATE
4559 PREFIX(fail_stack_type) fail_stack;
4560 #endif
4561 #ifndef REGEX_MALLOC
4562 char *destination;
4563 #endif
4564
4565 register char *fastmap = bufp->fastmap;
4566
4567 #ifdef WCHAR
4568 /* We need to cast pattern to (wchar_t*), because we casted this compiled
4569 pattern to (char*) in regex_compile. */
4570 UCHAR_T *pattern = (UCHAR_T*)bufp->buffer;
4571 register UCHAR_T *pend = (UCHAR_T*) (bufp->buffer + bufp->used);
4572 #else /* BYTE */
4573 UCHAR_T *pattern = bufp->buffer;
4574 register UCHAR_T *pend = pattern + bufp->used;
4575 #endif /* WCHAR */
4576 UCHAR_T *p = pattern;
4577
4578 #ifdef REL_ALLOC
4579 /* This holds the pointer to the failure stack, when
4580 it is allocated relocatably. */
4581 fail_stack_elt_t *failure_stack_ptr;
4582 #endif
4583
4584 /* Assume that each path through the pattern can be null until
4585 proven otherwise. We set this false at the bottom of switch
4586 statement, to which we get only if a particular path doesn't
4587 match the empty string. */
4588 boolean path_can_be_null = true;
4589
4590 /* We aren't doing a `succeed_n' to begin with. */
4591 boolean succeed_n_p = false;
4592
4593 assert (fastmap != NULL && p != NULL);
4594
4595 INIT_FAIL_STACK ();
4596 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */
4597 bufp->fastmap_accurate = 1; /* It will be when we're done. */
4598 bufp->can_be_null = 0;
4599
/* Each iteration handles one opcode.  `continue' keeps following the
   current path; `break' out of the switch ends the path (see the code
   after the switch).  */
4600 while (1)
4601 {
4602 if (p == pend || *p == (UCHAR_T) succeed)
4603 {
4604 /* We have reached the (effective) end of pattern. */
4605 if (!FAIL_STACK_EMPTY ())
4606 {
4607 bufp->can_be_null |= path_can_be_null;
4608
4609 /* Reset for next path. */
4610 path_can_be_null = true;
4611
4612 p = fail_stack.stack[--fail_stack.avail].pointer;
4613
4614 continue;
4615 }
4616 else
4617 break;
4618 }
4619
4620 /* We should never be about to go beyond the end of the pattern. */
4621 assert (p < pend);
4622
4623 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
4624 {
4625
4626 /* I guess the idea here is to simply not bother with a fastmap
4627 if a backreference is used, since it's too hard to figure out
4628 the fastmap for the corresponding group. Setting
4629 `can_be_null' stops `re_search_2' from using the fastmap, so
4630 that is all we do. */
4631 case duplicate:
4632 bufp->can_be_null = 1;
4633 goto done;
4634
4635
4636 /* Following are the cases which match a character. These end
4637 with `break'. */
4638
4639 #ifdef WCHAR
4640 case exactn:
4641 fastmap[truncate_wchar(p[1])] = 1;
4642 break;
4643 #else /* BYTE */
4644 case exactn:
4645 fastmap[p[1]] = 1;
4646 break;
4647 #endif /* WCHAR */
4648 #ifdef MBS_SUPPORT
4649 case exactn_bin:
4650 fastmap[p[1]] = 1;
4651 break;
4652 #endif
4653
4654 #ifdef WCHAR
4655 /* It is hard to distinguish fastmap from (multi byte) characters
4656 which depends on current locale. */
4657 case charset:
4658 case charset_not:
4659 case wordchar:
4660 case notwordchar:
4661 bufp->can_be_null = 1;
4662 goto done;
4663 #else /* BYTE */
4664 case charset:
/* *p is the length of the bitmap in bytes; every set bit marks a
   character that can start a match.  */
4665 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
4666 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
4667 fastmap[j] = 1;
4668 break;
4669
4670
4671 case charset_not:
4672 /* Chars beyond end of map must be allowed. */
4673 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
4674 fastmap[j] = 1;
4675
4676 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
4677 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
4678 fastmap[j] = 1;
4679 break;
4680
4681
4682 case wordchar:
4683 for (j = 0; j < (1 << BYTEWIDTH); j++)
4684 if (SYNTAX (j) == Sword)
4685 fastmap[j] = 1;
4686 break;
4687
4688
4689 case notwordchar:
4690 for (j = 0; j < (1 << BYTEWIDTH); j++)
4691 if (SYNTAX (j) != Sword)
4692 fastmap[j] = 1;
4693 break;
4694 #endif /* WCHAR */
4695
4696 case anychar:
4697 {
4698 int fastmap_newline = fastmap['\n'];
4699
4700 /* `.' matches anything ... */
4701 for (j = 0; j < (1 << BYTEWIDTH); j++)
4702 fastmap[j] = 1;
4703
4704 /* ... except perhaps newline. */
4705 if (!(bufp->syntax & RE_DOT_NEWLINE))
4706 fastmap['\n'] = fastmap_newline;
4707
4708 /* Return if we have already set `can_be_null'; if we have,
4709 then the fastmap is irrelevant. Something's wrong here. */
4710 else if (bufp->can_be_null)
4711 goto done;
4712
4713 /* Otherwise, have to check alternative paths. */
4714 break;
4715 }
4716
4717 #ifdef emacs
4718 case syntaxspec:
4719 k = *p++;
4720 for (j = 0; j < (1 << BYTEWIDTH); j++)
4721 if (SYNTAX (j) == (enum syntaxcode) k)
4722 fastmap[j] = 1;
4723 break;
4724
4725
4726 case notsyntaxspec:
4727 k = *p++;
4728 for (j = 0; j < (1 << BYTEWIDTH); j++)
4729 if (SYNTAX (j) != (enum syntaxcode) k)
4730 fastmap[j] = 1;
4731 break;
4732
4733
4734 /* All cases after this match the empty string. These end with
4735 `continue'. */
4736
4737
4738 case before_dot:
4739 case at_dot:
4740 case after_dot:
4741 continue;
4742 #endif /* emacs */
4743
4744
4745 case no_op:
4746 case begline:
4747 case endline:
4748 case begbuf:
4749 case endbuf:
4750 case wordbound:
4751 case notwordbound:
4752 case wordbeg:
4753 case wordend:
4754 case push_dummy_failure:
4755 continue;
4756
4757
4758 case jump_n:
4759 case pop_failure_jump:
4760 case maybe_pop_jump:
4761 case jump:
4762 case jump_past_alt:
4763 case dummy_failure_jump:
4764 EXTRACT_NUMBER_AND_INCR (j, p);
4765 p += j;
4766 if (j > 0)
4767 continue;
4768
4769 /* Jump backward implies we just went through the body of a
4770 loop and matched nothing. Opcode jumped to should be
4771 `on_failure_jump' or `succeed_n'. Just treat it like an
4772 ordinary jump. For a * loop, it has pushed its failure
4773 point already; if so, discard that as redundant. */
4774 if ((re_opcode_t) *p != on_failure_jump
4775 && (re_opcode_t) *p != succeed_n)
4776 continue;
4777
4778 p++;
4779 EXTRACT_NUMBER_AND_INCR (j, p);
4780 p += j;
4781
4782 /* If what's on the stack is where we are now, pop it. */
4783 if (!FAIL_STACK_EMPTY ()
4784 && fail_stack.stack[fail_stack.avail - 1].pointer == p)
4785 fail_stack.avail--;
4786
4787 continue;
4788
4789
4790 case on_failure_jump:
4791 case on_failure_keep_string_jump:
4792 handle_on_failure_jump:
4793 EXTRACT_NUMBER_AND_INCR (j, p);
4794
4795 /* For some patterns, e.g., `(a?)?', `p+j' here points to the
4796 end of the pattern. We don't want to push such a point,
4797 since when we restore it above, entering the switch will
4798 increment `p' past the end of the pattern. We don't need
4799 to push such a point since we obviously won't find any more
4800 fastmap entries beyond `pend'. Such a pattern can match
4801 the null string, though. */
4802 if (p + j < pend)
4803 {
4804 if (!PUSH_PATTERN_OP (p + j, fail_stack))
4805 {
4806 RESET_FAIL_STACK ();
4807 return -2;
4808 }
4809 }
4810 else
4811 bufp->can_be_null = 1;
4812
4813 if (succeed_n_p)
4814 {
4815 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */
4816 succeed_n_p = false;
4817 }
4818
4819 continue;
4820
4821
4822 case succeed_n:
4823 /* Get to the number of times to succeed. */
4824 p += OFFSET_ADDRESS_SIZE;
4825
4826 /* Increment p past the n for when k != 0. */
4827 EXTRACT_NUMBER_AND_INCR (k, p);
4828 if (k == 0)
4829 {
/* With a count of zero, rewind to the jump offset and handle the
   opcode as an on_failure_jump; succeed_n_p makes that code step over
   the count afterwards.  */
4830 p -= 2 * OFFSET_ADDRESS_SIZE;
4831 succeed_n_p = true; /* Spaghetti code alert. */
4832 goto handle_on_failure_jump;
4833 }
4834 continue;
4835
4836
4837 case set_number_at:
4838 p += 2 * OFFSET_ADDRESS_SIZE;
4839 continue;
4840
4841
4842 case start_memory:
4843 case stop_memory:
/* Step over the two operand elements of these opcodes.  */
4844 p += 2;
4845 continue;
4846
4847
4848 default:
4849 abort (); /* We have listed all the cases. */
4850 } /* switch *p++ */
4851
4852 /* Getting here means we have found the possible starting
4853 characters for one path of the pattern -- and that the empty
4854 string does not match. We need not follow this path further.
4855 Instead, look at the next alternative (remembered on the
4856 stack), or quit if no more. The test at the top of the loop
4857 does these things. */
4858 path_can_be_null = false;
4859 p = pend;
4860 } /* while p */
4861
4862 /* Set `can_be_null' for the last path (also the first path, if the
4863 pattern is empty). */
4864 bufp->can_be_null |= path_can_be_null;
4865
4866 done:
4867 RESET_FAIL_STACK ();
4868 return 0;
4869 }
4870
4871 #else /* not INSIDE_RECURSION */
4872
/* Public entry point: compute the fastmap of BUFP with the worker that
   matches the current locale, and return the worker's result.  */
int
re_compile_fastmap (struct re_pattern_buffer *bufp)
{
# ifdef MBS_SUPPORT
  /* Use the wide-character worker unless the locale is single-byte.  */
  if (MB_CUR_MAX != 1)
    return wcs_re_compile_fastmap(bufp);
# endif
  return byte_re_compile_fastmap(bufp);
} /* re_compile_fastmap */
#ifdef _LIBC
weak_alias (__re_compile_fastmap, re_compile_fastmap)
#endif
4886
4887
4889 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
4890 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
4891 this memory for recording register information. STARTS and ENDS
4892 must be allocated using the malloc library routine, and must each
4893 be at least NUM_REGS * sizeof (regoff_t) bytes long.
4894
4895 If NUM_REGS == 0, then subsequent matches should allocate their own
4896 register data.
4897
4898 Unless this function is called, the first search or match using
4899 PATTERN_BUFFER will allocate its own register data, without
4900 freeing the old data. */
4901
4902 void
4903 re_set_registers (struct re_pattern_buffer *bufp,
4904 struct re_registers *regs, unsigned num_regs,
4905 regoff_t *starts, regoff_t *ends)
4906 {
4907 if (num_regs)
4908 {
4909 bufp->regs_allocated = REGS_REALLOCATE;
4910 regs->num_regs = num_regs;
4911 regs->start = starts;
4912 regs->end = ends;
4913 }
4914 else
4915 {
4916 bufp->regs_allocated = REGS_UNALLOCATED;
4917 regs->num_regs = 0;
4918 regs->start = regs->end = (regoff_t *) 0;
4919 }
4920 }
4921 #ifdef _LIBC
4922 weak_alias (__re_set_registers, re_set_registers)
4923 #endif
4924
4925 /* Searching routines. */
4927
/* Single-string form of re_search_2 (below): the text to search is
   STRING, of length SIZE, and there is no way to say where matching
   must stop.  */

int
re_search (struct re_pattern_buffer *bufp, const char *string, int size,
	   int startpos, int range, struct re_registers *regs)
{
  /* The first string is left empty, the whole text goes in as the
     second string, and SIZE also serves as the stop index.  */
  int result = re_search_2 (bufp, NULL, 0, string, size, startpos, range,
			    regs, size);

  return result;
}
#ifdef _LIBC
weak_alias (__re_search, re_search)
#endif
4941
4942
/* Search the virtual concatenation of STRING1 (SIZE1 bytes) and STRING2
   (SIZE2 bytes) for a match of the compiled pattern in BUFP->buffer.
   The first attempt is made at index STARTPOS, the next at
   STARTPOS + 1, and so on.

   RANGE says how far to scan: 0 means try only at STARTPOS, and in
   general the last start tried is STARTPOS + RANGE.

   On a match, REGS receives the indices, within the virtual
   concatenation, of the text matched by the whole pattern and by its
   subexpressions.

   Matching is never considered past index STOP of the virtual
   concatenation.

   The return value is the position at which the match was found, -1
   if there is no match, or -2 on error (such as failure stack
   overflow).  */

int
re_search_2 (struct re_pattern_buffer *bufp, const char *string1, int size1,
	     const char *string2, int size2, int startpos, int range,
	     struct re_registers *regs, int stop)
{
# ifdef MBS_SUPPORT
  /* Use the wide-character worker unless the locale is single-byte.  */
  if (MB_CUR_MAX != 1)
    return wcs_re_search_2 (bufp, string1, size1, string2, size2, startpos,
			    range, regs, stop);
# endif
  return byte_re_search_2 (bufp, string1, size1, string2, size2, startpos,
			   range, regs, stop);
} /* re_search_2 */
#ifdef _LIBC
weak_alias (__re_search_2, re_search_2)
#endif
4981
4982 #endif /* not INSIDE_RECURSION */
4983
4984 #ifdef INSIDE_RECURSION
4985
/* Release VAR if it is non-null and then reset it to NULL.  The body is
   wrapped in `do { ... } while (0)' so that the macro expands to a
   single statement: without the wrapper, using it as the body of an
   unbraced `if' or `else' would attach only the test to that branch
   and run the `var = NULL' assignment unconditionally.  */
#ifdef MATCH_MAY_ALLOCATE
# define FREE_VAR(var) \
  do { if (var) REGEX_FREE (var); var = NULL; } while (0)
#else
# define FREE_VAR(var) \
  do { if (var) free (var); var = NULL; } while (0)
#endif
4991
4992 #ifdef WCHAR
/* Threshold used by the wide-character search code: buffers for an
   input whose size exceeds this value are obtained with TALLOC and
   released with free; smaller ones are obtained with REGEX_TALLOC and
   released with FREE_VAR.  */
4993 # define MAX_ALLOCA_SIZE 2000
4994
/* Release the wide-character copies and offset tables of both input
   strings.  This expands in place and refers to the enclosing
   function's `size1', `size2', `wcs_string1', `wcs_string2',
   `mbs_offset1' and `mbs_offset2'.  For each string, the way the
   buffers are released is selected by comparing its size with
   MAX_ALLOCA_SIZE, mirroring the way they were allocated.  */
4995 # define FREE_WCS_BUFFERS() \
4996 do { \
4997 if (size1 > MAX_ALLOCA_SIZE) \
4998 { \
4999 free (wcs_string1); \
5000 free (mbs_offset1); \
5001 } \
5002 else \
5003 { \
5004 FREE_VAR (wcs_string1); \
5005 FREE_VAR (mbs_offset1); \
5006 } \
5007 if (size2 > MAX_ALLOCA_SIZE) \
5008 { \
5009 free (wcs_string2); \
5010 free (mbs_offset2); \
5011 } \
5012 else \
5013 { \
5014 FREE_VAR (wcs_string2); \
5015 FREE_VAR (mbs_offset2); \
5016 } \
5017 } while (0)
5018
5019 #endif
5020
5021
/* Worker behind re_search_2 (one instance per PREFIX).

   Searches the virtual concatenation of STRING1 (SIZE1 bytes) and
   STRING2 (SIZE2 bytes) for a match of the compiled pattern in BUFP.
   Start positions are tried from STARTPOS onwards, moving forward when
   RANGE is positive and backward when it is negative; RANGE is first
   clamped so that no start position falls outside 0 .. SIZE1 + SIZE2.
   REGS and STOP are handed to the match routine unchanged.

   Returns the start position of the first successful match, -1 if
   STARTPOS is out of range or nothing matches, and -2 if the fastmap
   cannot be computed, a conversion buffer cannot be allocated (WCHAR
   only), or the match routine itself reports -2.

   NOTE(review): `total_size' and `endpos' are plain int sums of the
   arguments; presumably callers keep them small enough not to
   overflow -- confirm.  */
5022 static int
5023 PREFIX(re_search_2) (struct re_pattern_buffer *bufp, const char *string1,
5024 int size1, const char *string2, int size2,
5025 int startpos, int range,
5026 struct re_registers *regs, int stop)
5027 {
5028 int val;
5029 register char *fastmap = bufp->fastmap;
5030 register RE_TRANSLATE_TYPE translate = bufp->translate;
5031 int total_size = size1 + size2;
5032 int endpos = startpos + range;
5033 #ifdef WCHAR
5034 /* We need wchar_t* buffers correspond to cstring1, cstring2. */
5035 wchar_t *wcs_string1 = NULL, *wcs_string2 = NULL;
5036 /* We need the size of wchar_t buffers correspond to csize1, csize2. */
5037 int wcs_size1 = 0, wcs_size2 = 0;
5038 /* offset buffer for optimization. See convert_mbs_to_wc. */
5039 int *mbs_offset1 = NULL, *mbs_offset2 = NULL;
5040 /* They hold whether each wchar_t is binary data or not. */
5041 char *is_binary = NULL;
5042 #endif /* WCHAR */
5043
5044 /* Check for out-of-range STARTPOS. */
5045 if (startpos < 0 || startpos > total_size)
5046 return -1;
5047
5048 /* Fix up RANGE if it might eventually take us outside
5049 the virtual concatenation of STRING1 and STRING2.
5050 Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */
5051 if (endpos < 0)
5052 range = 0 - startpos;
5053 else if (endpos > total_size)
5054 range = total_size - startpos;
5055
5056 /* If the search isn't to be a backwards one, don't waste time in a
5057 search for a pattern that must be anchored. */
5058 if (bufp->used > 0 && range > 0
5059 && ((re_opcode_t) bufp->buffer[0] == begbuf
5060 /* `begline' is like `begbuf' if it cannot match at newlines. */
5061 || ((re_opcode_t) bufp->buffer[0] == begline
5062 && !bufp->newline_anchor)))
5063 {
5064 if (startpos > 0)
5065 return -1;
5066 else
5067 range = 1;
5068 }
5069
5070 #ifdef emacs
5071 /* In a forward search for something that starts with \=.
5072 don't keep searching past point. */
5073 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
5074 {
5075 range = PT - startpos;
5076 if (range <= 0)
5077 return -1;
5078 }
5079 #endif /* emacs */
5080
5081 /* Update the fastmap now if not correct already. */
5082 if (fastmap && !bufp->fastmap_accurate)
5083 if (re_compile_fastmap (bufp) == -2)
5084 return -2;
5085
5086 #ifdef WCHAR
5087 /* Allocate wchar_t array for wcs_string1 and wcs_string2 and
5088 fill them with converted string. */
5089 if (size1 != 0)
5090 {
/* Inputs larger than MAX_ALLOCA_SIZE get their buffers from TALLOC,
   smaller ones from REGEX_TALLOC; the release code below makes the
   same size test.  */
5091 if (size1 > MAX_ALLOCA_SIZE)
5092 {
5093 wcs_string1 = TALLOC (size1 + 1, CHAR_T);
5094 mbs_offset1 = TALLOC (size1 + 1, int);
5095 is_binary = TALLOC (size1 + 1, char);
5096 }
5097 else
5098 {
5099 wcs_string1 = REGEX_TALLOC (size1 + 1, CHAR_T);
5100 mbs_offset1 = REGEX_TALLOC (size1 + 1, int);
5101 is_binary = REGEX_TALLOC (size1 + 1, char);
5102 }
5103 if (!wcs_string1 || !mbs_offset1 || !is_binary)
5104 {
5105 if (size1 > MAX_ALLOCA_SIZE)
5106 {
5107 free (wcs_string1);
5108 free (mbs_offset1);
5109 free (is_binary);
5110 }
5111 else
5112 {
5113 FREE_VAR (wcs_string1);
5114 FREE_VAR (mbs_offset1);
5115 FREE_VAR (is_binary);
5116 }
5117 return -2;
5118 }
5119 wcs_size1 = convert_mbs_to_wcs(wcs_string1, string1, size1,
5120 mbs_offset1, is_binary);
5121 wcs_string1[wcs_size1] = L'\0'; /* for a sentinel */
/* is_binary is not used again for this string once the conversion is
   done, so it is released right away.  */
5122 if (size1 > MAX_ALLOCA_SIZE)
5123 free (is_binary);
5124 else
5125 FREE_VAR (is_binary);
5126 }
5127 if (size2 != 0)
5128 {
5129 if (size2 > MAX_ALLOCA_SIZE)
5130 {
5131 wcs_string2 = TALLOC (size2 + 1, CHAR_T);
5132 mbs_offset2 = TALLOC (size2 + 1, int);
5133 is_binary = TALLOC (size2 + 1, char);
5134 }
5135 else
5136 {
5137 wcs_string2 = REGEX_TALLOC (size2 + 1, CHAR_T);
5138 mbs_offset2 = REGEX_TALLOC (size2 + 1, int);
5139 is_binary = REGEX_TALLOC (size2 + 1, char);
5140 }
5141 if (!wcs_string2 || !mbs_offset2 || !is_binary)
5142 {
5143 FREE_WCS_BUFFERS ();
5144 if (size2 > MAX_ALLOCA_SIZE)
5145 free (is_binary);
5146 else
5147 FREE_VAR (is_binary);
5148 return -2;
5149 }
5150 wcs_size2 = convert_mbs_to_wcs(wcs_string2, string2, size2,
5151 mbs_offset2, is_binary);
5152 wcs_string2[wcs_size2] = L'\0'; /* for a sentinel */
5153 if (size2 > MAX_ALLOCA_SIZE)
5154 free (is_binary);
5155 else
5156 FREE_VAR (is_binary);
5157 }
5158 #endif /* WCHAR */
5159
5160
5161 /* Loop through the string, looking for a place to start matching. */
5162 for (;;)
5163 {
5164 /* If a fastmap is supplied, skip quickly over characters that
5165 cannot be the start of a match. If the pattern can match the
5166 null string, however, we don't need to skip characters; we want
5167 the first null string. */
5168 if (fastmap && startpos < total_size && !bufp->can_be_null)
5169 {
5170 if (range > 0) /* Searching forwards. */
5171 {
5172 register const char *d;
5173 register int lim = 0;
5174 int irange = range;
5175
/* When the scan starts in string1 and would run into string2, LIM
   stops the skipping loops below once string1 is exhausted.  */
5176 if (startpos < size1 && startpos + range >= size1)
5177 lim = range - (size1 - startpos);
5178
5179 d = (startpos >= size1 ? string2 - size1 : string1) + startpos;
5180
5181 /* Written out as an if-else to avoid testing `translate'
5182 inside the loop. */
5183 if (translate)
5184 while (range > lim
5185 && !fastmap[(unsigned char)
5186 translate[(unsigned char) *d++]])
5187 range--;
5188 else
5189 while (range > lim && !fastmap[(unsigned char) *d++])
5190 range--;
5191
5192 startpos += irange - range;
5193 }
5194 else /* Searching backwards. */
5195 {
5196 register CHAR_T c = (size1 == 0 || startpos >= size1
5197 ? string2[startpos - size1]
5198 : string1[startpos]);
5199
5200 if (!fastmap[(unsigned char) TRANSLATE (c)])
5201 goto advance;
5202 }
5203 }
5204
5205 /* If can't match the null string, and that's all we have left, fail. */
5206 if (range >= 0 && startpos == total_size && fastmap
5207 && !bufp->can_be_null)
5208 {
5209 #ifdef WCHAR
5210 FREE_WCS_BUFFERS ();
5211 #endif
5212 return -1;
5213 }
5214
5215 #ifdef WCHAR
5216 val = wcs_re_match_2_internal (bufp, string1, size1, string2,
5217 size2, startpos, regs, stop,
5218 wcs_string1, wcs_size1,
5219 wcs_string2, wcs_size2,
5220 mbs_offset1, mbs_offset2);
5221 #else /* BYTE */
5222 val = byte_re_match_2_internal (bufp, string1, size1, string2,
5223 size2, startpos, regs, stop);
5224 #endif /* BYTE */
5225
5226 #ifndef REGEX_MALLOC
5227 # ifdef C_ALLOCA
5228 alloca (0);
5229 # endif
5230 #endif
5231
/* A non-negative result from the matcher means a match at STARTPOS;
   the start position, not the matcher's value, is what gets
   returned.  */
5232 if (val >= 0)
5233 {
5234 #ifdef WCHAR
5235 FREE_WCS_BUFFERS ();
5236 #endif
5237 return startpos;
5238 }
5239
5240 if (val == -2)
5241 {
5242 #ifdef WCHAR
5243 FREE_WCS_BUFFERS ();
5244 #endif
5245 return -2;
5246 }
5247
/* Move one position in the search direction, or stop once RANGE has
   been used up.  */
5248 advance:
5249 if (!range)
5250 break;
5251 else if (range > 0)
5252 {
5253 range--;
5254 startpos++;
5255 }
5256 else
5257 {
5258 range++;
5259 startpos--;
5260 }
5261 }
5262 #ifdef WCHAR
5263 FREE_WCS_BUFFERS ();
5264 #endif
5265 return -1;
5266 }
5267
5268 #ifdef WCHAR
5269 /* POINTER_TO_OFFSET, wide-character variant.  Converts PTR, a pointer
   into one of the wchar_t search strings string1 or string2, into an
   offset in the multibyte input.  The offset is read from the
   mbs_offset1 or mbs_offset2 table (see convert_mbs_to_wcs); a NULL
   table yields 0.  For a pointer into string2, csize1 is added, so the
   result counts from the start of the string1/string2 concatenation
   rather than from the start of string2.  PTR is expanded more than
   once, so it must be free of side effects.  */
5273 # define POINTER_TO_OFFSET(ptr) \
5274 (FIRST_STRING_P (ptr) \
5275 ? ((regoff_t)(mbs_offset1 != NULL? mbs_offset1[(ptr)-string1] : 0)) \
5276 : ((regoff_t)((mbs_offset2 != NULL? mbs_offset2[(ptr)-string2] : 0) \
5277 + csize1)))
5278 #else /* BYTE */
5279 /* POINTER_TO_OFFSET, byte variant.  Converts PTR, a pointer into one of
   the search strings string1 or string2, into an offset.  A pointer
   into string1 is measured from string1; a pointer into string2 is
   measured from string2 and then shifted by size1, so the result is an
   offset into the concatenation of the two strings.  */
5281 # define POINTER_TO_OFFSET(ptr) \
5282 (FIRST_STRING_P (ptr) \
5283 ? ((regoff_t) ((ptr) - string1)) \
5284 : ((regoff_t) ((ptr) - string2 + size1)))
5285 #endif /* WCHAR */
5286
5287 /* Macros for dealing with the split strings in re_match_2. */
5288
/* Nonzero when dend, the end of the string segment currently being
   scanned, equals end_match_1, that is, when the scan position is
   still inside string1.  Relies on the local variables dend and
   end_match_1 of the function it is expanded in.  */
5289 #define MATCHING_IN_FIRST_STRING (dend == end_match_1)
5290
5291 /* Call before fetching a character with *d.  When d has reached dend
   this switches d over to the start of string2 and sets dend to
   end_match_2.  If dend already was end_match_2 there is no more input
   and control jumps to the label fail in the enclosing function.

   The expansion is a while loop rather than an if: after switching to
   string2 the condition is tested again, so an empty match region in
   string2 (string2 == end_match_2) also ends up at fail.  The macro
   expands to a statement, not an expression, and uses the locals d,
   dend, string2 and end_match_2.  */
5293 #define PREFETCH() \
5294 while (d == dend) \
5295 { \
5296 /* End of string2 => fail. */ \
5297 if (dend == end_match_2) \
5298 goto fail; \
5299 /* End of string1 => advance to string2. */ \
5300 d = string2; \
5301 dend = end_match_2; \
5302 }
5303
5304 /* Test if D is at the very beginning or at the very end of the virtual
   concatenation of string1 and string2.  If there is only one string,
   it is string2.

   AT_STRINGS_BEG compares D with string1 when size1 is nonzero and
   with string2 otherwise; it is also true whenever size2 is zero.
   AT_STRINGS_END compares D with end2, the end of string2, and not
   with the end_match_2 stop position.  */
5306 #define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
5307 #define AT_STRINGS_END(d) ((d) == end2)
5308
5309
5310 /* Test if D points to a character which is word-constituent.  There are
   two special cases: if D is end1 (just past the end of string1), the
   first character of string2 is examined; and if D is string2 - 1
   (just before the beginning of string2), the last character of
   string1 is examined.  Otherwise *D itself is examined.

   D is expanded several times, so the argument must not have side
   effects.  */
5314 #ifdef WCHAR
5315 /* Use internationalized API instead of SYNTAX: a wide character is
   word-constituent when iswalnum accepts it or when it is L'_'.  */
5316 # define WORDCHAR_P(d) \
5317 (iswalnum ((wint_t)((d) == end1 ? *string2 \
5318 : (d) == string2 - 1 ? *(end1 - 1) : *(d))) != 0 \
5319 || ((d) == end1 ? *string2 \
5320 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) == L'_')
5321 #else /* BYTE */
/* Byte variant: a character is word-constituent when SYNTAX maps it to
   Sword.  */
5322 # define WORDCHAR_P(d) \
5323 (SYNTAX ((d) == end1 ? *string2 \
5324 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
5325 == Sword)
5326 #endif /* WCHAR */
5327
5328 /* Disabled due to a compiler bug -- see comment at case wordbound.
   The definition below is kept for reference only; the surrounding
   "if 0" keeps it from ever being compiled.  */
5329 #if 0
5330 /* Test if the character before D and the one at D differ with respect
   to being word-constituent.  Also true when D is at the beginning or
   at the end of the strings (AT_STRINGS_BEG / AT_STRINGS_END).
   NOTE(review): D appears unparenthesized in the "d - 1" operand, so
   the argument would have to be a simple expression.  */
5332 #define AT_WORD_BOUNDARY(d) \
5333 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
5334 || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
5335 #endif
5336
5337 /* Free everything we malloc.

   FREE_VARIABLES () is expanded inside the matching function and
   refers to that function's local variables by name.  One of four
   definitions is chosen at compile time:

     MATCH_MAY_ALLOCATE and WCHAR: release the failure stack and the
       register vectors, plus the wide-character buffers.
     MATCH_MAY_ALLOCATE, BYTE: release the failure stack and the
       register vectors only.
     no MATCH_MAY_ALLOCATE, WCHAR: release the wide-character buffers
       only.
     no MATCH_MAY_ALLOCATE, BYTE: nothing to release.

   In the WCHAR variants string1, string2, mbs_offset1 and mbs_offset2
   are handed to FREE_VAR only when cant_free_wcs_buf is zero.  Every
   multi-statement variant is wrapped in do ... while (0) so that the
   macro can be used as a single statement.  */
5338 #ifdef MATCH_MAY_ALLOCATE
5339 # ifdef WCHAR
5340 # define FREE_VARIABLES() \
5341 do { \
5342 REGEX_FREE_STACK (fail_stack.stack); \
5343 FREE_VAR (regstart); \
5344 FREE_VAR (regend); \
5345 FREE_VAR (old_regstart); \
5346 FREE_VAR (old_regend); \
5347 FREE_VAR (best_regstart); \
5348 FREE_VAR (best_regend); \
5349 FREE_VAR (reg_info); \
5350 FREE_VAR (reg_dummy); \
5351 FREE_VAR (reg_info_dummy); \
5352 if (!cant_free_wcs_buf) \
5353 { \
5354 FREE_VAR (string1); \
5355 FREE_VAR (string2); \
5356 FREE_VAR (mbs_offset1); \
5357 FREE_VAR (mbs_offset2); \
5358 } \
5359 } while (0)
5360 # else /* BYTE */
5361 # define FREE_VARIABLES() \
5362 do { \
5363 REGEX_FREE_STACK (fail_stack.stack); \
5364 FREE_VAR (regstart); \
5365 FREE_VAR (regend); \
5366 FREE_VAR (old_regstart); \
5367 FREE_VAR (old_regend); \
5368 FREE_VAR (best_regstart); \
5369 FREE_VAR (best_regend); \
5370 FREE_VAR (reg_info); \
5371 FREE_VAR (reg_dummy); \
5372 FREE_VAR (reg_info_dummy); \
5373 } while (0)
5374 # endif /* WCHAR */
5375 #else
5376 # ifdef WCHAR
5377 # define FREE_VARIABLES() \
5378 do { \
5379 if (!cant_free_wcs_buf) \
5380 { \
5381 FREE_VAR (string1); \
5382 FREE_VAR (string2); \
5383 FREE_VAR (mbs_offset1); \
5384 FREE_VAR (mbs_offset2); \
5385 } \
5386 } while (0)
5387 # else /* BYTE */
5388 # define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
5389 # endif /* WCHAR */
5390 #endif /* not MATCH_MAY_ALLOCATE */
5391
5392 /* Sentinel values meaning that no register is currently active.  They
   must meet several constraints.  They must not be valid register
   values; since we have a limit of 255 registers (because we use only
   one byte in the pattern for the register number), we can use numbers
   larger than 255.  They must differ by 1, because of
   NUM_FAILURE_ITEMS above.  And the value for the lowest register must
   be larger than the value for the highest register, so we do not try
   to actually save any registers when none are active.

   NO_HIGHEST_ACTIVE_REG is 1 << BYTEWIDTH (presumably 256; BYTEWIDTH
   is defined outside this part of the file) and NO_LOWEST_ACTIVE_REG
   is one more than that.  */
5399 #define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH)
5400 #define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1)
5401
5402 #else /* not INSIDE_RECURSION */
5404 /* Matching routines. */
5405
5406 #ifndef emacs /* Emacs never uses this. */
/* Single-string front end to the matcher.  Tries to match the compiled
   pattern BUFP against STRING (SIZE bytes long) starting at offset POS.

   STRING is handed to the internal matcher as the second string, with
   an empty (NULL, 0) first string, and SIZE is used as the stop
   position.  When MBS_SUPPORT is compiled in and the locale is
   multibyte (MB_CUR_MAX != 1) the wide-character matcher is used, with
   no pre-converted buffers supplied; otherwise the byte matcher runs.

   The value of the internal matcher is returned unchanged.  */

int
re_match (struct re_pattern_buffer *bufp, const char *string,
          int size, int pos, struct re_registers *regs)
{
  int match_len;

# ifdef MBS_SUPPORT
  if (MB_CUR_MAX == 1)
# endif
    match_len = byte_re_match_2_internal (bufp, NULL, 0, string, size,
                                          pos, regs, size);
# ifdef MBS_SUPPORT
  else
    match_len = wcs_re_match_2_internal (bufp, NULL, 0, string, size,
                                         pos, regs, size,
                                         NULL, 0, NULL, 0, NULL, NULL);
# endif

  /* The matcher lives in its own function so that an alloca cleanup
     can be forced here once it has returned.  */
# if !defined REGEX_MALLOC && defined C_ALLOCA
  alloca (0);
# endif

  return match_len;
}
5430 # ifdef _LIBC
5431 weak_alias (__re_match, re_match)
5432 # endif
5433 #endif /* not emacs */
5434
5435 #endif /* not INSIDE_RECURSION */
5436
5437 #ifdef INSIDE_RECURSION
5438 static boolean PREFIX(group_match_null_string_p) (UCHAR_T **p,
5439 UCHAR_T *end,
5440 PREFIX(register_info_type) *reg_info);
5441 static boolean PREFIX(alt_match_null_string_p) (UCHAR_T *p,
5442 UCHAR_T *end,
5443 PREFIX(register_info_type) *reg_info);
5444 static boolean PREFIX(common_op_match_null_string_p) (UCHAR_T **p,
5445 UCHAR_T *end,
5446 PREFIX(register_info_type) *reg_info);
5447 static int PREFIX(bcmp_translate) (const CHAR_T *s1, const CHAR_T *s2,
5448 int len, char *translate);
5449 #else /* not INSIDE_RECURSION */
5450
/* re_match_2 matches the compiled pattern in BUFP against the (virtual)
   concatenation of STRING1 and STRING2, whose lengths are SIZE1 and
   SIZE2.  Matching begins at offset POS and does not go beyond offset
   STOP.

   Substring offsets for the groups are stored in REGS only if REGS is
   non-null and the `no_sub' field of BUFP is zero.  See the
   documentation for exactly how many groups we fill.

   The result is the length of the matched substring, -1 if there is no
   match, or -2 on an internal error (for example a failed
   allocation).

   With MBS_SUPPORT and a multibyte locale (MB_CUR_MAX != 1) the
   wide-character matcher does the work and is given no pre-converted
   buffers; otherwise the byte matcher is used.  */

int
re_match_2 (struct re_pattern_buffer *bufp, const char *string1, int size1,
            const char *string2, int size2, int pos,
            struct re_registers *regs, int stop)
{
  int match_len;

# ifdef MBS_SUPPORT
  if (MB_CUR_MAX == 1)
# endif
    match_len = byte_re_match_2_internal (bufp, string1, size1,
                                          string2, size2,
                                          pos, regs, stop);
# ifdef MBS_SUPPORT
  else
    match_len = wcs_re_match_2_internal (bufp, string1, size1,
                                         string2, size2,
                                         pos, regs, stop,
                                         NULL, 0, NULL, 0, NULL, NULL);
# endif

  /* The matcher lives in its own function so that an alloca cleanup
     can be forced here once it has returned.  */
#if !defined REGEX_MALLOC && defined C_ALLOCA
  alloca (0);
#endif

  return match_len;
}
5487 #ifdef _LIBC
5488 weak_alias (__re_match_2, re_match_2)
5489 #endif
5490
5491 #endif /* not INSIDE_RECURSION */
5492
5493 #ifdef INSIDE_RECURSION
5494
5495 #ifdef WCHAR
static int count_mbs_length (int *, int);

/* Translate a length measured in the multibyte string into a count of
   wchar_t elements.

   OFFSET_BUFFER is the offset table that belongs to the multibyte
   string (see convert_mbs_to_wcs); entry I is compared against LENGTH,
   the length of the substring starting at offset 0.  The function
   looks for the index I with OFFSET_BUFFER[I] == LENGTH and returns
   it.

   Returns:
     -1  if LENGTH is negative, or if no entry of the table equals
         LENGTH (presumably LENGTH then falls inside a multibyte
         character);
      0  if OFFSET_BUFFER is NULL;
      I  otherwise, the matching index.

   NOTE(review): entry OFFSET_BUFFER[LENGTH] is read unconditionally, so
   the table is assumed to hold at least LENGTH + 1 valid entries --
   confirm against the callers.  */

static int
count_mbs_length (int *offset_buffer, int length)
{
  int lower, upper;

  /* Check whether the size is valid.  */
  if (length < 0)
    return -1;

  if (offset_buffer == NULL)
    return 0;

  /* If there are no multibyte characters, offset_buffer[i] == i.
     Optimize for this case.  */
  if (offset_buffer[length] == length)
    return length;

  /* Binary search in [0, length].  LENGTH is a valid upper bound
     because offset_buffer[i] >= i for all i.  */
  lower = 0;
  upper = length;

  for (;;)
    {
      /* Written as lower + (upper - lower) / 2 so that the midpoint
         cannot overflow int for large LENGTH; for 0 <= lower <= upper
         it equals (lower + upper) / 2.  */
      int middle = lower + (upper - lower) / 2;

      /* The interval cannot shrink any further: no exact hit.  */
      if (middle == lower || middle == upper)
        break;

      if (offset_buffer[middle] > length)
        upper = middle;
      else if (offset_buffer[middle] < length)
        lower = middle;
      else
        return middle;
    }

  return -1;
}
5539 #endif /* WCHAR */
5540
5541 /* This is a separate function so that we can force an alloca cleanup
5542 afterwards. */
5543 #ifdef WCHAR
5544 static int
5545 wcs_re_match_2_internal (struct re_pattern_buffer *bufp,
5546 const char *cstring1, int csize1,
5547 const char *cstring2, int csize2,
5548 int pos,
5549 struct re_registers *regs,
5550 int stop,
5551 /* string1 == string2 == NULL means string1/2, size1/2 and
5552 mbs_offset1/2 need setting up in this function. */
5553 /* We need wchar_t* buffers corresponding to cstring1, cstring2. */
5554 wchar_t *string1, int size1,
5555 wchar_t *string2, int size2,
5556 /* offset buffer for optimization. See convert_mbs_to_wcs. */
5557 int *mbs_offset1, int *mbs_offset2)
5558 #else /* BYTE */
5559 static int
5560 byte_re_match_2_internal (struct re_pattern_buffer *bufp,
5561 const char *string1, int size1,
5562 const char *string2, int size2,
5563 int pos,
5564 struct re_registers *regs, int stop)
5565 #endif /* BYTE */
5566 {
5567 /* General temporaries. */
5568 int mcnt;
5569 UCHAR_T *p1;
5570 #ifdef WCHAR
5571 /* They hold whether each wchar_t is binary data or not. */
5572 char *is_binary = NULL;
5573 /* If true, we can't free string1/2, mbs_offset1/2. */
5574 int cant_free_wcs_buf = 1;
5575 #endif /* WCHAR */
5576
5577 /* Just past the end of the corresponding string. */
5578 const CHAR_T *end1, *end2;
5579
5580 /* Pointers into string1 and string2, just past the last characters in
5581 each to consider matching. */
5582 const CHAR_T *end_match_1, *end_match_2;
5583
5584 /* Where we are in the data, and the end of the current string. */
5585 const CHAR_T *d, *dend;
5586
5587 /* Where we are in the pattern, and the end of the pattern. */
5588 #ifdef WCHAR
5589 UCHAR_T *pattern, *p;
5590 register UCHAR_T *pend;
5591 #else /* BYTE */
5592 UCHAR_T *p = bufp->buffer;
5593 register UCHAR_T *pend = p + bufp->used;
5594 #endif /* WCHAR */
5595
5596 /* Mark the opcode just after a start_memory, so we can test for an
5597 empty subpattern when we get to the stop_memory. */
5598 UCHAR_T *just_past_start_mem = 0;
5599
5600 /* We use this to map every character in the string. */
5601 RE_TRANSLATE_TYPE translate = bufp->translate;
5602
5603 /* Failure point stack. Each place that can handle a failure further
5604 down the line pushes a failure point on this stack. It consists of
5605 restart, regend, and reg_info for all registers corresponding to
5606 the subexpressions we're currently inside, plus the number of such
5607 registers, and, finally, two char *'s. The first char * is where
5608 to resume scanning the pattern; the second one is where to resume
5609 scanning the strings. If the latter is zero, the failure point is
5610 a ``dummy''; if a failure happens and the failure point is a dummy,
5611 it gets discarded and the next one is tried. */
5612 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
5613 PREFIX(fail_stack_type) fail_stack;
5614 #endif
5615 #ifdef DEBUG
5616 static unsigned failure_id;
5617 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
5618 #endif
5619
5620 #ifdef REL_ALLOC
5621 /* This holds the pointer to the failure stack, when
5622 it is allocated relocatably. */
5623 fail_stack_elt_t *failure_stack_ptr;
5624 #endif
5625
5626 /* We fill all the registers internally, independent of what we
5627 return, for use in backreferences. The number here includes
5628 an element for register zero. */
5629 size_t num_regs = bufp->re_nsub + 1;
5630
5631 /* The currently active registers. */
5632 active_reg_t lowest_active_reg = NO_LOWEST_ACTIVE_REG;
5633 active_reg_t highest_active_reg = NO_HIGHEST_ACTIVE_REG;
5634
5635 /* Information on the contents of registers. These are pointers into
5636 the input strings; they record just what was matched (on this
5637 attempt) by a subexpression part of the pattern, that is, the
5638 regnum-th regstart pointer points to where in the pattern we began
5639 matching and the regnum-th regend points to right after where we
5640 stopped matching the regnum-th subexpression. (The zeroth register
5641 keeps track of what the whole pattern matches.) */
5642 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
5643 const CHAR_T **regstart, **regend;
5644 #endif
5645
5646 /* If a group that's operated upon by a repetition operator fails to
5647 match anything, then the register for its start will need to be
5648 restored because it will have been set to wherever in the string we
5649 are when we last see its open-group operator. Similarly for a
5650 register's end. */
5651 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
5652 const CHAR_T **old_regstart, **old_regend;
5653 #endif
5654
5655 /* The is_active field of reg_info helps us keep track of which (possibly
5656 nested) subexpressions we are currently in. The matched_something
5657 field of reg_info[reg_num] helps us tell whether or not we have
5658 matched any of the pattern so far this time through the reg_num-th
5659 subexpression. These two fields get reset each time through any
5660 loop their register is in. */
5661 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
5662 PREFIX(register_info_type) *reg_info;
5663 #endif
5664
5665 /* The following record the register info as found in the above
5666 variables when we find a match better than any we've seen before.
5667 This happens as we backtrack through the failure points, which in
5668 turn happens only if we have not yet matched the entire string. */
5669 unsigned best_regs_set = false;
5670 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
5671 const CHAR_T **best_regstart, **best_regend;
5672 #endif
5673
5674 /* Logically, this is `best_regend[0]'. But we don't want to have to
5675 allocate space for that if we're not allocating space for anything
5676 else (see below). Also, we never need info about register 0 for
5677 any of the other register vectors, and it seems rather a kludge to
5678 treat `best_regend' differently than the rest. So we keep track of
5679 the end of the best match so far in a separate variable. We
5680 initialize this to NULL so that when we backtrack the first time
5681 and need to test it, it's not garbage. */
5682 const CHAR_T *match_end = NULL;
5683
5684 /* This helps SET_REGS_MATCHED avoid doing redundant work. */
5685 int set_regs_matched_done = 0;
5686
5687 /* Used when we pop values we don't care about. */
5688 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
5689 const CHAR_T **reg_dummy;
5690 PREFIX(register_info_type) *reg_info_dummy;
5691 #endif
5692
5693 #ifdef DEBUG
5694 /* Counts the total number of registers pushed. */
5695 unsigned num_regs_pushed = 0;
5696 #endif
5697
5698 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
5699
5700 INIT_FAIL_STACK ();
5701
5702 #ifdef MATCH_MAY_ALLOCATE
5703 /* Do not bother to initialize all the register variables if there are
5704 no groups in the pattern, as it takes a fair amount of time. If
5705 there are groups, we include space for register 0 (the whole
5706 pattern), even though we never use it, since it simplifies the
5707 array indexing. We should fix this. */
5708 if (bufp->re_nsub)
5709 {
5710 regstart = REGEX_TALLOC (num_regs, const CHAR_T *);
5711 regend = REGEX_TALLOC (num_regs, const CHAR_T *);
5712 old_regstart = REGEX_TALLOC (num_regs, const CHAR_T *);
5713 old_regend = REGEX_TALLOC (num_regs, const CHAR_T *);
5714 best_regstart = REGEX_TALLOC (num_regs, const CHAR_T *);
5715 best_regend = REGEX_TALLOC (num_regs, const CHAR_T *);
5716 reg_info = REGEX_TALLOC (num_regs, PREFIX(register_info_type));
5717 reg_dummy = REGEX_TALLOC (num_regs, const CHAR_T *);
5718 reg_info_dummy = REGEX_TALLOC (num_regs, PREFIX(register_info_type));
5719
5720 if (!(regstart && regend && old_regstart && old_regend && reg_info
5721 && best_regstart && best_regend && reg_dummy && reg_info_dummy))
5722 {
5723 FREE_VARIABLES ();
5724 return -2;
5725 }
5726 }
5727 else
5728 {
5729 /* We must initialize all our variables to NULL, so that
5730 `FREE_VARIABLES' doesn't try to free them. */
5731 regstart = regend = old_regstart = old_regend = best_regstart
5732 = best_regend = reg_dummy = NULL;
5733 reg_info = reg_info_dummy = (PREFIX(register_info_type) *) NULL;
5734 }
5735 #endif /* MATCH_MAY_ALLOCATE */
5736
5737 /* The starting position is bogus. */
5738 #ifdef WCHAR
5739 if (pos < 0 || pos > csize1 + csize2)
5740 #else /* BYTE */
5741 if (pos < 0 || pos > size1 + size2)
5742 #endif
5743 {
5744 FREE_VARIABLES ();
5745 return -1;
5746 }
5747
5748 #ifdef WCHAR
5749 /* Allocate wchar_t array for string1 and string2 and
5750 fill them with converted string. */
5751 if (string1 == NULL && string2 == NULL)
5752 {
5753 /* We need to set up the buffers here. */
5754
5755 /* We must free wcs buffers in this function. */
5756 cant_free_wcs_buf = 0;
5757
5758 if (csize1 != 0)
5759 {
5760 string1 = REGEX_TALLOC (csize1 + 1, CHAR_T);
5761 mbs_offset1 = REGEX_TALLOC (csize1 + 1, int);
5762 is_binary = REGEX_TALLOC (csize1 + 1, char);
5763 if (!string1 || !mbs_offset1 || !is_binary)
5764 {
5765 FREE_VAR (string1);
5766 FREE_VAR (mbs_offset1);
5767 FREE_VAR (is_binary);
5768 return -2;
5769 }
5770 }
5771 if (csize2 != 0)
5772 {
5773 string2 = REGEX_TALLOC (csize2 + 1, CHAR_T);
5774 mbs_offset2 = REGEX_TALLOC (csize2 + 1, int);
5775 is_binary = REGEX_TALLOC (csize2 + 1, char);
5776 if (!string2 || !mbs_offset2 || !is_binary)
5777 {
5778 FREE_VAR (string1);
5779 FREE_VAR (mbs_offset1);
5780 FREE_VAR (string2);
5781 FREE_VAR (mbs_offset2);
5782 FREE_VAR (is_binary);
5783 return -2;
5784 }
5785 size2 = convert_mbs_to_wcs(string2, cstring2, csize2,
5786 mbs_offset2, is_binary);
5787 string2[size2] = L'\0'; /* for a sentinel */
5788 FREE_VAR (is_binary);
5789 }
5790 }
5791
5792 /* We need to cast pattern to (wchar_t*), because we cast this compiled
5793 pattern to (char*) in regex_compile. */
5794 p = pattern = (CHAR_T*)bufp->buffer;
5795 pend = (CHAR_T*)(bufp->buffer + bufp->used);
5796
5797 #endif /* WCHAR */
5798
5799 /* Initialize subexpression text positions to -1 to mark ones that no
5800 start_memory/stop_memory has been seen for. Also initialize the
5801 register information struct. */
5802 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++)
5803 {
5804 regstart[mcnt] = regend[mcnt]
5805 = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE;
5806
5807 REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE;
5808 IS_ACTIVE (reg_info[mcnt]) = 0;
5809 MATCHED_SOMETHING (reg_info[mcnt]) = 0;
5810 EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0;
5811 }
5812
5813 /* We move `string1' into `string2' if the latter's empty -- but not if
5814 `string1' is null. */
5815 if (size2 == 0 && string1 != NULL)
5816 {
5817 string2 = string1;
5818 size2 = size1;
5819 string1 = 0;
5820 size1 = 0;
5821 #ifdef WCHAR
5822 mbs_offset2 = mbs_offset1;
5823 csize2 = csize1;
5824 mbs_offset1 = NULL;
5825 csize1 = 0;
5826 #endif
5827 }
5828 end1 = string1 + size1;
5829 end2 = string2 + size2;
5830
5831 /* Compute where to stop matching, within the two strings. */
5832 #ifdef WCHAR
5833 if (stop <= csize1)
5834 {
5835 mcnt = count_mbs_length(mbs_offset1, stop);
5836 end_match_1 = string1 + mcnt;
5837 end_match_2 = string2;
5838 }
5839 else
5840 {
5841 if (stop > csize1 + csize2)
5842 stop = csize1 + csize2;
5843 end_match_1 = end1;
5844 mcnt = count_mbs_length(mbs_offset2, stop-csize1);
5845 end_match_2 = string2 + mcnt;
5846 }
5847 if (mcnt < 0)
5848 { /* count_mbs_length return error. */
5849 FREE_VARIABLES ();
5850 return -1;
5851 }
5852 #else
5853 if (stop <= size1)
5854 {
5855 end_match_1 = string1 + stop;
5856 end_match_2 = string2;
5857 }
5858 else
5859 {
5860 end_match_1 = end1;
5861 end_match_2 = string2 + stop - size1;
5862 }
5863 #endif /* WCHAR */
5864
5865 /* `p' scans through the pattern as `d' scans through the data.
5866 `dend' is the end of the input string that `d' points within. `d'
5867 is advanced into the following input string whenever necessary, but
5868 this happens before fetching; therefore, at the beginning of the
5869 loop, `d' can be pointing at the end of a string, but it cannot
5870 equal `string2'. */
5871 #ifdef WCHAR
5872 if (size1 > 0 && pos <= csize1)
5873 {
5874 mcnt = count_mbs_length(mbs_offset1, pos);
5875 d = string1 + mcnt;
5876 dend = end_match_1;
5877 }
5878 else
5879 {
5880 mcnt = count_mbs_length(mbs_offset2, pos-csize1);
5881 d = string2 + mcnt;
5882 dend = end_match_2;
5883 }
5884
5885 if (mcnt < 0)
5886 { /* count_mbs_length return error. */
5887 FREE_VARIABLES ();
5888 return -1;
5889 }
5890 #else
5891 if (size1 > 0 && pos <= size1)
5892 {
5893 d = string1 + pos;
5894 dend = end_match_1;
5895 }
5896 else
5897 {
5898 d = string2 + pos - size1;
5899 dend = end_match_2;
5900 }
5901 #endif /* WCHAR */
5902
5903 DEBUG_PRINT1 ("The compiled pattern is:\n");
5904 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
5905 DEBUG_PRINT1 ("The string to match is: `");
5906 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
5907 DEBUG_PRINT1 ("'\n");
5908
5909 /* This loops over pattern commands. It exits by returning from the
5910 function if the match is complete, or it drops through if the match
5911 fails at this starting point in the input data. */
5912 for (;;)
5913 {
5914 #ifdef _LIBC
5915 DEBUG_PRINT2 ("\n%p: ", p);
5916 #else
5917 DEBUG_PRINT2 ("\n0x%x: ", p);
5918 #endif
5919
5920 if (p == pend)
5921 { /* End of pattern means we might have succeeded. */
5922 DEBUG_PRINT1 ("end of pattern ... ");
5923
5924 /* If we haven't matched the entire string, and we want the
5925 longest match, try backtracking. */
5926 if (d != end_match_2)
5927 {
5928 /* 1 if this match ends in the same string (string1 or string2)
5929 as the best previous match. */
5930 boolean same_str_p;
5931
5932 /* 1 if this match is the best seen so far. */
5933 boolean best_match_p;
5934
5935 same_str_p = (FIRST_STRING_P (match_end)
5936 == MATCHING_IN_FIRST_STRING);
5937
5938 /* AIX compiler got confused when this was combined
5939 with the previous declaration. */
5940 if (same_str_p)
5941 best_match_p = d > match_end;
5942 else
5943 best_match_p = !MATCHING_IN_FIRST_STRING;
5944
5945 DEBUG_PRINT1 ("backtracking.\n");
5946
5947 if (!FAIL_STACK_EMPTY ())
5948 { /* More failure points to try. */
5949
5950 /* If exceeds best match so far, save it. */
5951 if (!best_regs_set || best_match_p)
5952 {
5953 best_regs_set = true;
5954 match_end = d;
5955
5956 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
5957
5958 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++)
5959 {
5960 best_regstart[mcnt] = regstart[mcnt];
5961 best_regend[mcnt] = regend[mcnt];
5962 }
5963 }
5964 goto fail;
5965 }
5966
5967 /* If no failure points, don't restore garbage. And if
5968 last match is real best match, don't restore second
5969 best one. */
5970 else if (best_regs_set && !best_match_p)
5971 {
5972 restore_best_regs:
5973 /* Restore best match. It may happen that `dend ==
5974 end_match_1' while the restored d is in string2.
5975 For example, the pattern `x.*y.*z' against the
5976 strings `x-' and `y-z-', if the two strings are
5977 not consecutive in memory. */
5978 DEBUG_PRINT1 ("Restoring best registers.\n");
5979
5980 d = match_end;
5981 dend = ((d >= string1 && d <= end1)
5982 ? end_match_1 : end_match_2);
5983
5984 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++)
5985 {
5986 regstart[mcnt] = best_regstart[mcnt];
5987 regend[mcnt] = best_regend[mcnt];
5988 }
5989 }
5990 } /* d != end_match_2 */
5991
5992 succeed_label:
5993 DEBUG_PRINT1 ("Accepting match.\n");
5994 /* If caller wants register contents data back, do it. */
5995 if (regs && !bufp->no_sub)
5996 {
5997 /* Have the register data arrays been allocated? */
5998 if (bufp->regs_allocated == REGS_UNALLOCATED)
5999 { /* No. So allocate them with malloc. We need one
6000 extra element beyond `num_regs' for the `-1' marker
6001 GNU code uses. */
6002 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
6003 regs->start = TALLOC (regs->num_regs, regoff_t);
6004 regs->end = TALLOC (regs->num_regs, regoff_t);
6005 if (regs->start == NULL || regs->end == NULL)
6006 {
6007 FREE_VARIABLES ();
6008 return -2;
6009 }
6010 bufp->regs_allocated = REGS_REALLOCATE;
6011 }
6012 else if (bufp->regs_allocated == REGS_REALLOCATE)
6013 { /* Yes. If we need more elements than were already
6014 allocated, reallocate them. If we need fewer, just
6015 leave it alone. */
6016 if (regs->num_regs < num_regs + 1)
6017 {
6018 regs->num_regs = num_regs + 1;
6019 RETALLOC (regs->start, regs->num_regs, regoff_t);
6020 RETALLOC (regs->end, regs->num_regs, regoff_t);
6021 if (regs->start == NULL || regs->end == NULL)
6022 {
6023 FREE_VARIABLES ();
6024 return -2;
6025 }
6026 }
6027 }
6028 else
6029 {
6030 /* These braces fend off a "empty body in an else-statement"
6031 warning under GCC when assert expands to nothing. */
6032 assert (bufp->regs_allocated == REGS_FIXED);
6033 }
6034
6035 /* Convert the pointer data in `regstart' and `regend' to
6036 indices. Register zero has to be set differently,
6037 since we haven't kept track of any info for it. */
6038 if (regs->num_regs > 0)
6039 {
6040 regs->start[0] = pos;
6041 #ifdef WCHAR
6042 if (MATCHING_IN_FIRST_STRING)
6043 regs->end[0] = mbs_offset1 != NULL ?
6044 mbs_offset1[d-string1] : 0;
6045 else
6046 regs->end[0] = csize1 + (mbs_offset2 != NULL ?
6047 mbs_offset2[d-string2] : 0);
6048 #else
6049 regs->end[0] = (MATCHING_IN_FIRST_STRING
6050 ? ((regoff_t) (d - string1))
6051 : ((regoff_t) (d - string2 + size1)));
6052 #endif /* WCHAR */
6053 }
6054
6055 /* Go through the first `min (num_regs, regs->num_regs)'
6056 registers, since that is all we initialized. */
6057 for (mcnt = 1; (unsigned) mcnt < MIN (num_regs, regs->num_regs);
6058 mcnt++)
6059 {
6060 if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt]))
6061 regs->start[mcnt] = regs->end[mcnt] = -1;
6062 else
6063 {
6064 regs->start[mcnt]
6065 = (regoff_t) POINTER_TO_OFFSET (regstart[mcnt]);
6066 regs->end[mcnt]
6067 = (regoff_t) POINTER_TO_OFFSET (regend[mcnt]);
6068 }
6069 }
6070
6071 /* If the regs structure we return has more elements than
6072 were in the pattern, set the extra elements to -1. If
6073 we (re)allocated the registers, this is the case,
6074 because we always allocate enough to have at least one
6075 -1 at the end. */
6076 for (mcnt = num_regs; (unsigned) mcnt < regs->num_regs; mcnt++)
6077 regs->start[mcnt] = regs->end[mcnt] = -1;
6078 } /* regs && !bufp->no_sub */
6079
6080 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
6081 nfailure_points_pushed, nfailure_points_popped,
6082 nfailure_points_pushed - nfailure_points_popped);
6083 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
6084
6085 #ifdef WCHAR
6086 if (MATCHING_IN_FIRST_STRING)
6087 mcnt = mbs_offset1 != NULL ? mbs_offset1[d-string1] : 0;
6088 else
6089 mcnt = (mbs_offset2 != NULL ? mbs_offset2[d-string2] : 0) +
6090 csize1;
6091 mcnt -= pos;
6092 #else
6093 mcnt = d - pos - (MATCHING_IN_FIRST_STRING
6094 ? string1
6095 : string2 - size1);
6096 #endif /* WCHAR */
6097
6098 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
6099
6100 FREE_VARIABLES ();
6101 return mcnt;
6102 }
6103
6104 /* Otherwise match next pattern command. */
6105 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
6106 {
6107 /* Ignore these. Used to ignore the n of succeed_n's which
6108 currently have n == 0. */
6109 case no_op:
6110 DEBUG_PRINT1 ("EXECUTING no_op.\n");
6111 break;
6112
6113 case succeed:
6114 DEBUG_PRINT1 ("EXECUTING succeed.\n");
6115 goto succeed_label;
6116
6117 /* Match the next n pattern characters exactly. The following
6118 byte in the pattern defines n, and the n bytes after that
6119 are the characters to match. */
6120 case exactn:
6121 #ifdef MBS_SUPPORT
6122 case exactn_bin:
6123 #endif
6124 mcnt = *p++;
6125 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
6126
6127 /* This is written out as an if-else so we don't waste time
6128 testing `translate' inside the loop. */
6129 if (translate)
6130 {
6131 do
6132 {
6133 PREFETCH ();
6134 #ifdef WCHAR
6135 if (*d <= 0xff)
6136 {
6137 if ((UCHAR_T) translate[(unsigned char) *d++]
6138 != (UCHAR_T) *p++)
6139 goto fail;
6140 }
6141 else
6142 {
6143 if (*d++ != (CHAR_T) *p++)
6144 goto fail;
6145 }
6146 #else
6147 if ((UCHAR_T) translate[(unsigned char) *d++]
6148 != (UCHAR_T) *p++)
6149 goto fail;
6150 #endif /* WCHAR */
6151 }
6152 while (--mcnt);
6153 }
6154 else
6155 {
6156 do
6157 {
6158 PREFETCH ();
6159 if (*d++ != (CHAR_T) *p++) goto fail;
6160 }
6161 while (--mcnt);
6162 }
6163 SET_REGS_MATCHED ();
6164 break;
6165
6166
6167 /* Match any character except possibly a newline or a null. */
6168 case anychar:
6169 DEBUG_PRINT1 ("EXECUTING anychar.\n");
6170
6171 PREFETCH ();
6172
6173 if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n')
6174 || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000'))
6175 goto fail;
6176
6177 SET_REGS_MATCHED ();
6178 DEBUG_PRINT2 (" Matched `%ld'.\n", (long int) *d);
6179 d++;
6180 break;
6181
6182
6183 case charset:
6184 case charset_not:
6185 {
6186 register UCHAR_T c;
6187 #ifdef WCHAR
6188 unsigned int i, char_class_length, coll_symbol_length,
6189 equiv_class_length, ranges_length, chars_length, length;
6190 CHAR_T *workp, *workp2, *charset_top;
6191 #define WORK_BUFFER_SIZE 128
6192 CHAR_T str_buf[WORK_BUFFER_SIZE];
6193 # ifdef _LIBC
6194 uint32_t nrules;
6195 # endif /* _LIBC */
6196 #endif /* WCHAR */
6197 boolean negate = (re_opcode_t) *(p - 1) == charset_not;
6198
6199 DEBUG_PRINT2 ("EXECUTING charset%s.\n", negate ? "_not" : "");
6200 PREFETCH ();
6201 c = TRANSLATE (*d); /* The character to match. */
6202 #ifdef WCHAR
6203 # ifdef _LIBC
6204 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
6205 # endif /* _LIBC */
6206 charset_top = p - 1;
6207 char_class_length = *p++;
6208 coll_symbol_length = *p++;
6209 equiv_class_length = *p++;
6210 ranges_length = *p++;
6211 chars_length = *p++;
6212 /* p points charset[6], so the address of the next instruction
6213 (charset[l+m+n+2o+k+p']) equals p[l+m+n+2*o+p'],
6214 where l=length of char_classes, m=length of collating_symbol,
6215 n=equivalence_class, o=length of char_range,
6216 p'=length of character. */
6217 workp = p;
6218 /* Update p to indicate the next instruction. */
6219 p += char_class_length + coll_symbol_length+ equiv_class_length +
6220 2*ranges_length + chars_length;
6221
6222 /* match with char_class? */
6223 for (i = 0; i < char_class_length ; i += CHAR_CLASS_SIZE)
6224 {
6225 wctype_t wctype;
6226 uintptr_t alignedp = ((uintptr_t)workp
6227 + __alignof__(wctype_t) - 1)
6228 & ~(uintptr_t)(__alignof__(wctype_t) - 1);
6229 wctype = *((wctype_t*)alignedp);
6230 workp += CHAR_CLASS_SIZE;
6231 # ifdef _LIBC
6232 if (__iswctype((wint_t)c, wctype))
6233 goto char_set_matched;
6234 # else
6235 if (iswctype((wint_t)c, wctype))
6236 goto char_set_matched;
6237 # endif
6238 }
6239
6240 /* match with collating_symbol? */
6241 # ifdef _LIBC
6242 if (nrules != 0)
6243 {
6244 const unsigned char *extra = (const unsigned char *)
6245 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
6246
6247 for (workp2 = workp + coll_symbol_length ; workp < workp2 ;
6248 workp++)
6249 {
6250 int32_t *wextra;
6251 wextra = (int32_t*)(extra + *workp++);
6252 for (i = 0; i < *wextra; ++i)
6253 if (TRANSLATE(d[i]) != wextra[1 + i])
6254 break;
6255
6256 if (i == *wextra)
6257 {
6258 /* Update d; since d will be incremented at
6259 char_set_matched:, we subtract one here. */
6260 d += i - 1;
6261 goto char_set_matched;
6262 }
6263 }
6264 }
6265 else /* (nrules == 0) */
6266 # endif
6267 /* If we can't look up collation data, we use wcscoll
6268 instead. */
6269 {
6270 for (workp2 = workp + coll_symbol_length ; workp < workp2 ;)
6271 {
6272 const CHAR_T *backup_d = d, *backup_dend = dend;
6273 # ifdef _LIBC
6274 length = __wcslen (workp);
6275 # else
6276 length = wcslen (workp);
6277 # endif
6278
6279 /* If wcscoll(the collating symbol, whole string) > 0,
6280 no substring of the string can ever match the
6281 collating symbol. */
6282 # ifdef _LIBC
6283 if (__wcscoll (workp, d) > 0)
6284 # else
6285 if (wcscoll (workp, d) > 0)
6286 # endif
6287 {
6288 workp += length + 1;
6289 continue;
6290 }
6291
6292 /* First, we compare the collating symbol with
6293 the first character of the string.
6294 If it doesn't match, we add the next character to
6295 the compare buffer in turn. */
6296 for (i = 0 ; i < WORK_BUFFER_SIZE-1 ; i++, d++)
6297 {
6298 int match;
6299 if (d == dend)
6300 {
6301 if (dend == end_match_2)
6302 break;
6303 d = string2;
6304 dend = end_match_2;
6305 }
6306
6307 /* add next character to the compare buffer. */
6308 str_buf[i] = TRANSLATE(*d);
6309 str_buf[i+1] = '\0';
6310
6311 # ifdef _LIBC
6312 match = __wcscoll (workp, str_buf);
6313 # else
6314 match = wcscoll (workp, str_buf);
6315 # endif
6316 if (match == 0)
6317 goto char_set_matched;
6318
6319 if (match < 0)
6320 /* (str_buf > workp) implies (str_buf + X > workp),
6321 because for all X (str_buf + X > str_buf).
6322 So we don't need to continue this loop. */
6323 break;
6324
6325 /* Otherwise (str_buf < workp),
6326 (str_buf+next_character) may equal (workp).
6327 So we continue this loop. */
6328 }
6329 /* not matched */
6330 d = backup_d;
6331 dend = backup_dend;
6332 workp += length + 1;
6333 }
6334 }
6335 /* match with equivalence_class? */
6336 # ifdef _LIBC
6337 if (nrules != 0)
6338 {
6339 const CHAR_T *backup_d = d, *backup_dend = dend;
6340 /* Try to match the equivalence class against
6341 those known to the collate implementation. */
6342 const int32_t *table;
6343 const int32_t *weights;
6344 const int32_t *extra;
6345 const int32_t *indirect;
6346 int32_t idx, idx2;
6347 wint_t *cp;
6348 size_t len;
6349
6350 /* This #include defines a local function! */
6351 # include <locale/weightwc.h>
6352
6353 table = (const int32_t *)
6354 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEWC);
6355 weights = (const wint_t *)
6356 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTWC);
6357 extra = (const wint_t *)
6358 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAWC);
6359 indirect = (const int32_t *)
6360 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTWC);
6361
6362 /* Write 1 collating element to str_buf, and
6363 get its index. */
6364 idx2 = 0;
6365
6366 for (i = 0 ; idx2 == 0 && i < WORK_BUFFER_SIZE - 1; i++)
6367 {
6368 cp = (wint_t*)str_buf;
6369 if (d == dend)
6370 {
6371 if (dend == end_match_2)
6372 break;
6373 d = string2;
6374 dend = end_match_2;
6375 }
6376 str_buf[i] = TRANSLATE(*(d+i));
6377 str_buf[i+1] = '\0'; /* sentinel */
6378 idx2 = findidx ((const wint_t**)&cp);
6379 }
6380
6381 /* Update d; since d will be incremented at
6382 char_set_matched:, we subtract one here. */
6383 d = backup_d + ((wchar_t*)cp - (wchar_t*)str_buf - 1);
6384 if (d >= dend)
6385 {
6386 if (dend == end_match_2)
6387 d = dend;
6388 else
6389 {
6390 d = string2;
6391 dend = end_match_2;
6392 }
6393 }
6394
6395 len = weights[idx2];
6396
6397 for (workp2 = workp + equiv_class_length ; workp < workp2 ;
6398 workp++)
6399 {
6400 idx = (int32_t)*workp;
6401 /* We already checked idx != 0 in regex_compile. */
6402
6403 if (idx2 != 0 && len == weights[idx])
6404 {
6405 int cnt = 0;
6406 while (cnt < len && (weights[idx + 1 + cnt]
6407 == weights[idx2 + 1 + cnt]))
6408 ++cnt;
6409
6410 if (cnt == len)
6411 goto char_set_matched;
6412 }
6413 }
6414 /* not matched */
6415 d = backup_d;
6416 dend = backup_dend;
6417 }
6418 else /* (nrules == 0) */
6419 # endif
6420 /* If we can't look up collation data, we use wcscoll
6421 instead. */
6422 {
6423 for (workp2 = workp + equiv_class_length ; workp < workp2 ;)
6424 {
6425 const CHAR_T *backup_d = d, *backup_dend = dend;
6426 # ifdef _LIBC
6427 length = __wcslen (workp);
6428 # else
6429 length = wcslen (workp);
6430 # endif
6431
6432 /* If wcscoll(the equivalence class, whole string) > 0,
6433 no substring of the string can ever match the
6434 equivalence class. */
6435 # ifdef _LIBC
6436 if (__wcscoll (workp, d) > 0)
6437 # else
6438 if (wcscoll (workp, d) > 0)
6439 # endif
6440 {
6441 workp += length + 1;
6442 break;
6443 }
6444
6445 /* First, we compare the equivalence class with
6446 the first character of the string.
6447 If it doesn't match, we add the next character to
6448 the compare buffer in turn. */
6449 for (i = 0 ; i < WORK_BUFFER_SIZE - 1 ; i++, d++)
6450 {
6451 int match;
6452 if (d == dend)
6453 {
6454 if (dend == end_match_2)
6455 break;
6456 d = string2;
6457 dend = end_match_2;
6458 }
6459
6460 /* add next character to the compare buffer. */
6461 str_buf[i] = TRANSLATE(*d);
6462 str_buf[i+1] = '\0';
6463
6464 # ifdef _LIBC
6465 match = __wcscoll (workp, str_buf);
6466 # else
6467 match = wcscoll (workp, str_buf);
6468 # endif
6469
6470 if (match == 0)
6471 goto char_set_matched;
6472
6473 if (match < 0)
6474 /* (str_buf > workp) implies (str_buf + X > workp),
6475 because for all X (str_buf + X > str_buf).
6476 So we don't need to continue this loop. */
6477 break;
6478
6479 /* Otherwise (str_buf < workp),
6480 (str_buf+next_character) may equal (workp).
6481 So we continue this loop. */
6482 }
6483 /* not matched */
6484 d = backup_d;
6485 dend = backup_dend;
6486 workp += length + 1;
6487 }
6488 }
6489
6490 /* match with char_range? */
6491 # ifdef _LIBC
6492 if (nrules != 0)
6493 {
6494 uint32_t collseqval;
6495 const char *collseq = (const char *)
6496 _NL_CURRENT(LC_COLLATE, _NL_COLLATE_COLLSEQWC);
6497
6498 collseqval = collseq_table_lookup (collseq, c);
6499
6500 for (; workp < p - chars_length ;)
6501 {
6502 uint32_t start_val, end_val;
6503
6504 /* We already compute the collation sequence value
6505 of the characters (or collating symbols). */
6506 start_val = (uint32_t) *workp++; /* range_start */
6507 end_val = (uint32_t) *workp++; /* range_end */
6508
6509 if (start_val <= collseqval && collseqval <= end_val)
6510 goto char_set_matched;
6511 }
6512 }
6513 else
6514 # endif
6515 {
6516 /* We set range_start_char at str_buf[0], range_end_char
6517 at str_buf[4], and compared char at str_buf[2]. */
6518 str_buf[1] = 0;
6519 str_buf[2] = c;
6520 str_buf[3] = 0;
6521 str_buf[5] = 0;
6522 for (; workp < p - chars_length ;)
6523 {
6524 wchar_t *range_start_char, *range_end_char;
6525
6526 /* match if (range_start_char <= c <= range_end_char). */
6527
6528 /* If range_start(or end) < 0, we assume -range_start(end)
6529 is the offset of the collating symbol which is specified
6530 as the character of the range start(end). */
6531
6532 /* range_start */
6533 if (*workp < 0)
6534 range_start_char = charset_top - (*workp++);
6535 else
6536 {
6537 str_buf[0] = *workp++;
6538 range_start_char = str_buf;
6539 }
6540
6541 /* range_end */
6542 if (*workp < 0)
6543 range_end_char = charset_top - (*workp++);
6544 else
6545 {
6546 str_buf[4] = *workp++;
6547 range_end_char = str_buf + 4;
6548 }
6549
6550 # ifdef _LIBC
6551 if (__wcscoll (range_start_char, str_buf+2) <= 0
6552 && __wcscoll (str_buf+2, range_end_char) <= 0)
6553 # else
6554 if (wcscoll (range_start_char, str_buf+2) <= 0
6555 && wcscoll (str_buf+2, range_end_char) <= 0)
6556 # endif
6557 goto char_set_matched;
6558 }
6559 }
6560
6561 /* match with char? */
6562 for (; workp < p ; workp++)
6563 if (c == *workp)
6564 goto char_set_matched;
6565
6566 negate = !negate;
6567
6568 char_set_matched:
6569 if (negate) goto fail;
6570 #else
6571 /* Cast to `unsigned' instead of `unsigned char' in case the
6572 bit list is a full 32 bytes long. */
6573 if (c < (unsigned) (*p * BYTEWIDTH)
6574 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
6575 negate = !negate;
6576
6577 p += 1 + *p;
6578
6579 if (!negate) goto fail;
6580 #undef WORK_BUFFER_SIZE
6581 #endif /* WCHAR */
6582 SET_REGS_MATCHED ();
6583 d++;
6584 break;
6585 }
6586
6587
6588 /* The beginning of a group is represented by start_memory.
6589 The arguments are the register number in the next byte, and the
6590 number of groups inner to this one in the next. The text
6591 matched within the group is recorded (in the internal
6592 registers data structure) under the register number. */
6593 case start_memory:
6594 DEBUG_PRINT3 ("EXECUTING start_memory %ld (%ld):\n",
6595 (long int) *p, (long int) p[1]);
6596
6597 /* Find out if this group can match the empty string. */
6598 p1 = p; /* To send to group_match_null_string_p. */
6599
6600 if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE)
6601 REG_MATCH_NULL_STRING_P (reg_info[*p])
6602 = PREFIX(group_match_null_string_p) (&p1, pend, reg_info);
6603
6604 /* Save the position in the string where we were the last time
6605 we were at this open-group operator in case the group is
6606 operated upon by a repetition operator, e.g., with `(a*)*b'
6607 against `ab'; then we want to ignore where we are now in
6608 the string in case this attempt to match fails. */
6609 old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
6610 ? REG_UNSET (regstart[*p]) ? d : regstart[*p]
6611 : regstart[*p];
6612 DEBUG_PRINT2 (" old_regstart: %d\n",
6613 POINTER_TO_OFFSET (old_regstart[*p]));
6614
6615 regstart[*p] = d;
6616 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
6617
6618 IS_ACTIVE (reg_info[*p]) = 1;
6619 MATCHED_SOMETHING (reg_info[*p]) = 0;
6620
6621 /* Clear this whenever we change the register activity status. */
6622 set_regs_matched_done = 0;
6623
6624 /* This is the new highest active register. */
6625 highest_active_reg = *p;
6626
6627 /* If nothing was active before, this is the new lowest active
6628 register. */
6629 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
6630 lowest_active_reg = *p;
6631
6632 /* Move past the register number and inner group count. */
6633 p += 2;
6634 just_past_start_mem = p;
6635
6636 break;
6637
6638
6639 /* The stop_memory opcode represents the end of a group. Its
6640 arguments are the same as start_memory's: the register
6641 number, and the number of inner groups. */
6642 case stop_memory:
6643 DEBUG_PRINT3 ("EXECUTING stop_memory %ld (%ld):\n",
6644 (long int) *p, (long int) p[1]);
6645
6646 /* We need to save the string position the last time we were at
6647 this close-group operator in case the group is operated
6648 upon by a repetition operator, e.g., with `((a*)*(b*)*)*'
6649 against `aba'; then we want to ignore where we are now in
6650 the string in case this attempt to match fails. */
6651 old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
6652 ? REG_UNSET (regend[*p]) ? d : regend[*p]
6653 : regend[*p];
6654 DEBUG_PRINT2 (" old_regend: %d\n",
6655 POINTER_TO_OFFSET (old_regend[*p]));
6656
6657 regend[*p] = d;
6658 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
6659
6660 /* This register isn't active anymore. */
6661 IS_ACTIVE (reg_info[*p]) = 0;
6662
6663 /* Clear this whenever we change the register activity status. */
6664 set_regs_matched_done = 0;
6665
6666 /* If this was the only register active, nothing is active
6667 anymore. */
6668 if (lowest_active_reg == highest_active_reg)
6669 {
6670 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
6671 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
6672 }
6673 else
6674 { /* We must scan for the new highest active register, since
6675 it isn't necessarily one less than now: consider
6676 (a(b)c(d(e)f)g). When group 3 ends, after the f), the
6677 new highest active register is 1. */
6678 UCHAR_T r = *p - 1;
6679 while (r > 0 && !IS_ACTIVE (reg_info[r]))
6680 r--;
6681
6682 /* If we end up at register zero, that means that we saved
6683 the registers as the result of an `on_failure_jump', not
6684 a `start_memory', and we jumped past the innermost
6685 `stop_memory'. For example, in ((.)*) we save
6686 registers 1 and 2 as a result of the *, but when we pop
6687 back to the second ), we are at the stop_memory 1.
6688 Thus, nothing is active. */
6689 if (r == 0)
6690 {
6691 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
6692 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
6693 }
6694 else
6695 highest_active_reg = r;
6696 }
6697
6698 /* If just failed to match something this time around with a
6699 group that's operated on by a repetition operator, try to
6700 force exit from the ``loop'', and restore the register
6701 information for this group that we had before trying this
6702 last match. */
6703 if ((!MATCHED_SOMETHING (reg_info[*p])
6704 || just_past_start_mem == p - 1)
6705 && (p + 2) < pend)
6706 {
6707 boolean is_a_jump_n = false;
6708
6709 p1 = p + 2;
6710 mcnt = 0;
6711 switch ((re_opcode_t) *p1++)
6712 {
6713 case jump_n:
6714 is_a_jump_n = true;
6715 case pop_failure_jump:
6716 case maybe_pop_jump:
6717 case jump:
6718 case dummy_failure_jump:
6719 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6720 if (is_a_jump_n)
6721 p1 += OFFSET_ADDRESS_SIZE;
6722 break;
6723
6724 default:
6725 /* do nothing */ ;
6726 }
6727 p1 += mcnt;
6728
6729 /* If the next operation is a jump backwards in the pattern
6730 to an on_failure_jump right before the start_memory
6731 corresponding to this stop_memory, exit from the loop
6732 by forcing a failure after pushing on the stack the
6733 on_failure_jump's jump in the pattern, and d. */
6734 if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump
6735 && (re_opcode_t) p1[1+OFFSET_ADDRESS_SIZE] == start_memory
6736 && p1[2+OFFSET_ADDRESS_SIZE] == *p)
6737 {
6738 /* If this group ever matched anything, then restore
6739 what its registers were before trying this last
6740 failed match, e.g., with `(a*)*b' against `ab' for
6741 regstart[1], and, e.g., with `((a*)*(b*)*)*'
6742 against `aba' for regend[3].
6743
6744 Also restore the registers for inner groups for,
6745 e.g., `((a*)(b*))*' against `aba' (register 3 would
6746 otherwise get trashed). */
6747
6748 if (EVER_MATCHED_SOMETHING (reg_info[*p]))
6749 {
6750 unsigned r;
6751
6752 EVER_MATCHED_SOMETHING (reg_info[*p]) = 0;
6753
6754 /* Restore this and inner groups' (if any) registers. */
6755 for (r = *p; r < (unsigned) *p + (unsigned) *(p + 1);
6756 r++)
6757 {
6758 regstart[r] = old_regstart[r];
6759
6760 /* xx why this test? */
6761 if (old_regend[r] >= regstart[r])
6762 regend[r] = old_regend[r];
6763 }
6764 }
6765 p1++;
6766 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6767 PUSH_FAILURE_POINT (p1 + mcnt, d, -2);
6768
6769 goto fail;
6770 }
6771 }
6772
6773 /* Move past the register number and the inner group count. */
6774 p += 2;
6775 break;
6776
6777
6778 /* \<digit> has been turned into a `duplicate' command which is
6779 followed by the numeric value of <digit> as the register number. */
6780 case duplicate:
6781 {
6782 register const CHAR_T *d2, *dend2;
6783 int regno = *p++; /* Get which register to match against. */
6784 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
6785
6786 /* Can't back reference a group which we've never matched. */
6787 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
6788 goto fail;
6789
6790 /* Where in input to try to start matching. */
6791 d2 = regstart[regno];
6792
6793 /* Where to stop matching; if both the place to start and
6794 the place to stop matching are in the same string, then
6795 set to the place to stop, otherwise, for now have to use
6796 the end of the first string. */
6797
6798 dend2 = ((FIRST_STRING_P (regstart[regno])
6799 == FIRST_STRING_P (regend[regno]))
6800 ? regend[regno] : end_match_1);
6801 for (;;)
6802 {
6803 /* If necessary, advance to next segment in register
6804 contents. */
6805 while (d2 == dend2)
6806 {
6807 if (dend2 == end_match_2) break;
6808 if (dend2 == regend[regno]) break;
6809
6810 /* End of string1 => advance to string2. */
6811 d2 = string2;
6812 dend2 = regend[regno];
6813 }
6814 /* At end of register contents => success */
6815 if (d2 == dend2) break;
6816
6817 /* If necessary, advance to next segment in data. */
6818 PREFETCH ();
6819
6820 /* How many characters left in this segment to match. */
6821 mcnt = dend - d;
6822
6823 /* Want how many consecutive characters we can match in
6824 one shot, so, if necessary, adjust the count. */
6825 if (mcnt > dend2 - d2)
6826 mcnt = dend2 - d2;
6827
6828 /* Compare that many; failure if mismatch, else move
6829 past them. */
6830 if (translate
6831 ? PREFIX(bcmp_translate) (d, d2, mcnt, translate)
6832 : memcmp (d, d2, mcnt*sizeof(UCHAR_T)))
6833 goto fail;
6834 d += mcnt, d2 += mcnt;
6835
6836 /* Do this because we've matched some characters. */
6837 SET_REGS_MATCHED ();
6838 }
6839 }
6840 break;
6841
6842
6843 /* begline matches the empty string at the beginning of the string
6844 (unless `not_bol' is set in `bufp'), and, if
6845 `newline_anchor' is set, after newlines. */
6846 case begline:
6847 DEBUG_PRINT1 ("EXECUTING begline.\n");
6848
6849 if (AT_STRINGS_BEG (d))
6850 {
6851 if (!bufp->not_bol) break;
6852 }
6853 else if (d[-1] == '\n' && bufp->newline_anchor)
6854 {
6855 break;
6856 }
6857 /* In all other cases, we fail. */
6858 goto fail;
6859
6860
6861 /* endline is the dual of begline. */
6862 case endline:
6863 DEBUG_PRINT1 ("EXECUTING endline.\n");
6864
6865 if (AT_STRINGS_END (d))
6866 {
6867 if (!bufp->not_eol) break;
6868 }
6869
6870 /* We have to ``prefetch'' the next character. */
6871 else if ((d == end1 ? *string2 : *d) == '\n'
6872 && bufp->newline_anchor)
6873 {
6874 break;
6875 }
6876 goto fail;
6877
6878
6879 /* Match at the very beginning of the data. */
6880 case begbuf:
6881 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
6882 if (AT_STRINGS_BEG (d))
6883 break;
6884 goto fail;
6885
6886
6887 /* Match at the very end of the data. */
6888 case endbuf:
6889 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
6890 if (AT_STRINGS_END (d))
6891 break;
6892 goto fail;
6893
6894
6895 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
6896 pushes NULL as the value for the string on the stack. Then
6897 `pop_failure_point' will keep the current value for the
6898 string, instead of restoring it. To see why, consider
6899 matching `foo\nbar' against `.*\n'. The .* matches the foo;
6900 then the . fails against the \n. But the next thing we want
6901 to do is match the \n against the \n; if we restored the
6902 string value, we would be back at the foo.
6903
6904 Because this is used only in specific cases, we don't need to
6905 check all the things that `on_failure_jump' does, to make
6906 sure the right things get saved on the stack. Hence we don't
6907 share its code. The only reason to push anything on the
6908 stack at all is that otherwise we would have to change
6909 `anychar's code to do something besides goto fail in this
6910 case; that seems worse than this. */
6911 case on_failure_keep_string_jump:
6912 DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump");
6913
6914 EXTRACT_NUMBER_AND_INCR (mcnt, p);
6915 #ifdef _LIBC
6916 DEBUG_PRINT3 (" %d (to %p):\n", mcnt, p + mcnt);
6917 #else
6918 DEBUG_PRINT3 (" %d (to 0x%x):\n", mcnt, p + mcnt);
6919 #endif
6920
6921 PUSH_FAILURE_POINT (p + mcnt, NULL, -2);
6922 break;
6923
6924
6925 /* Uses of on_failure_jump:
6926
6927 Each alternative starts with an on_failure_jump that points
6928 to the beginning of the next alternative. Each alternative
6929 except the last ends with a jump that in effect jumps past
6930 the rest of the alternatives. (They really jump to the
6931 ending jump of the following alternative, because tensioning
6932 these jumps is a hassle.)
6933
6934 Repeats start with an on_failure_jump that points past both
6935 the repetition text and either the following jump or
6936 pop_failure_jump back to this on_failure_jump. */
6937 case on_failure_jump:
6938 on_failure:
6939 DEBUG_PRINT1 ("EXECUTING on_failure_jump");
6940
6941 EXTRACT_NUMBER_AND_INCR (mcnt, p);
6942 #ifdef _LIBC
6943 DEBUG_PRINT3 (" %d (to %p)", mcnt, p + mcnt);
6944 #else
6945 DEBUG_PRINT3 (" %d (to 0x%x)", mcnt, p + mcnt);
6946 #endif
6947
6948 /* If this on_failure_jump comes right before a group (i.e.,
6949 the original * applied to a group), save the information
6950 for that group and all inner ones, so that if we fail back
6951 to this point, the group's information will be correct.
6952 For example, in \(a*\)*\1, we need the preceding group,
6953 and in \(zz\(a*\)b*\)\2, we need the inner group. */
6954
6955 /* We can't use `p' to check ahead because we push
6956 a failure point to `p + mcnt' after we do this. */
6957 p1 = p;
6958
6959 /* We need to skip no_op's before we look for the
6960 start_memory in case this on_failure_jump is happening as
6961 the result of a completed succeed_n, as in \(a\)\{1,3\}b\1
6962 against aba. */
6963 while (p1 < pend && (re_opcode_t) *p1 == no_op)
6964 p1++;
6965
6966 if (p1 < pend && (re_opcode_t) *p1 == start_memory)
6967 {
6968 /* We have a new highest active register now. This will
6969 get reset at the start_memory we are about to get to,
6970 but we will have saved all the registers relevant to
6971 this repetition op, as described above. */
6972 highest_active_reg = *(p1 + 1) + *(p1 + 2);
6973 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
6974 lowest_active_reg = *(p1 + 1);
6975 }
6976
6977 DEBUG_PRINT1 (":\n");
6978 PUSH_FAILURE_POINT (p + mcnt, d, -2);
6979 break;
6980
6981
6982 /* A smart repeat ends with `maybe_pop_jump'.
6983 We change it to either `pop_failure_jump' or `jump'. */
6984 case maybe_pop_jump:
6985 EXTRACT_NUMBER_AND_INCR (mcnt, p);
6986 DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt);
6987 {
6988 register UCHAR_T *p2 = p;
6989
6990 /* Compare the beginning of the repeat with what follows
6991 its end in the pattern. If we can establish that there
6992 is nothing that they would both match, i.e., that we
6993 would never have to backtrack (unlike, e.g., `a*a'),
6994 then we can change to pop_failure_jump, because we'll
6995 never have to backtrack.
6996
6997 This is not true in the case of alternatives: in
6998 `(a|ab)*' we do need to backtrack to the `ab' alternative
6999 (e.g., if the string was `ab'). But instead of trying to
7000 detect that here, the alternative has put on a dummy
7001 failure point which is what we will end up popping. */
7002
7003 /* Skip over open/close-group commands.
7004 If what follows this loop is a ...+ construct,
7005 look at what begins its body, since we will have to
7006 match at least one of that. */
7007 while (1)
7008 {
7009 if (p2 + 2 < pend
7010 && ((re_opcode_t) *p2 == stop_memory
7011 || (re_opcode_t) *p2 == start_memory))
7012 p2 += 3;
7013 else if (p2 + 2 + 2 * OFFSET_ADDRESS_SIZE < pend
7014 && (re_opcode_t) *p2 == dummy_failure_jump)
7015 p2 += 2 + 2 * OFFSET_ADDRESS_SIZE;
7016 else
7017 break;
7018 }
7019
7020 p1 = p + mcnt;
7021 /* p1[0] ... p1[2] are the `on_failure_jump' corresponding
7022 to the `maybe_pop_jump' of this case. Examine what
7023 follows. */
7024
7025 /* If we're at the end of the pattern, we can change. */
7026 if (p2 == pend)
7027 {
7028 /* Consider what happens when matching ":\(.*\)"
7029 against ":/". I don't really understand this code
7030 yet. */
7031 p[-(1+OFFSET_ADDRESS_SIZE)] = (UCHAR_T)
7032 pop_failure_jump;
7033 DEBUG_PRINT1
7034 (" End of pattern: change to `pop_failure_jump'.\n");
7035 }
7036
7037 else if ((re_opcode_t) *p2 == exactn
7038 #ifdef MBS_SUPPORT
7039 || (re_opcode_t) *p2 == exactn_bin
7040 #endif
7041 || (bufp->newline_anchor && (re_opcode_t) *p2 == endline))
7042 {
7043 register UCHAR_T c
7044 = *p2 == (UCHAR_T) endline ? '\n' : p2[2];
7045
7046 if (((re_opcode_t) p1[1+OFFSET_ADDRESS_SIZE] == exactn
7047 #ifdef MBS_SUPPORT
7048 || (re_opcode_t) p1[1+OFFSET_ADDRESS_SIZE] == exactn_bin
7049 #endif
7050 ) && p1[3+OFFSET_ADDRESS_SIZE] != c)
7051 {
7052 p[-(1+OFFSET_ADDRESS_SIZE)] = (UCHAR_T)
7053 pop_failure_jump;
7054 #ifdef WCHAR
7055 DEBUG_PRINT3 (" %C != %C => pop_failure_jump.\n",
7056 (wint_t) c,
7057 (wint_t) p1[3+OFFSET_ADDRESS_SIZE]);
7058 #else
7059 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
7060 (char) c,
7061 (char) p1[3+OFFSET_ADDRESS_SIZE]);
7062 #endif
7063 }
7064
7065 #ifndef WCHAR
7066 else if ((re_opcode_t) p1[3] == charset
7067 || (re_opcode_t) p1[3] == charset_not)
7068 {
7069 int negate = (re_opcode_t) p1[3] == charset_not;
7070
7071 if (c < (unsigned) (p1[4] * BYTEWIDTH)
7072 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
7073 negate = !negate;
7074
7075 /* `negate' is equal to 1 if c would match, which means
7076 that we can't change to pop_failure_jump. */
7077 if (!negate)
7078 {
7079 p[-3] = (unsigned char) pop_failure_jump;
7080 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
7081 }
7082 }
7083 #endif /* not WCHAR */
7084 }
7085 #ifndef WCHAR
7086 else if ((re_opcode_t) *p2 == charset)
7087 {
7088 /* We win if the first character of the loop is not part
7089 of the charset. */
7090 if ((re_opcode_t) p1[3] == exactn
7091 && ! ((int) p2[1] * BYTEWIDTH > (int) p1[5]
7092 && (p2[2 + p1[5] / BYTEWIDTH]
7093 & (1 << (p1[5] % BYTEWIDTH)))))
7094 {
7095 p[-3] = (unsigned char) pop_failure_jump;
7096 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
7097 }
7098
7099 else if ((re_opcode_t) p1[3] == charset_not)
7100 {
7101 int idx;
7102 /* We win if the charset_not inside the loop
7103 lists every character listed in the charset after. */
7104 for (idx = 0; idx < (int) p2[1]; idx++)
7105 if (! (p2[2 + idx] == 0
7106 || (idx < (int) p1[4]
7107 && ((p2[2 + idx] & ~ p1[5 + idx]) == 0))))
7108 break;
7109
7110 if (idx == p2[1])
7111 {
7112 p[-3] = (unsigned char) pop_failure_jump;
7113 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
7114 }
7115 }
7116 else if ((re_opcode_t) p1[3] == charset)
7117 {
7118 int idx;
7119 /* We win if the charset inside the loop
7120 has no overlap with the one after the loop. */
7121 for (idx = 0;
7122 idx < (int) p2[1] && idx < (int) p1[4];
7123 idx++)
7124 if ((p2[2 + idx] & p1[5 + idx]) != 0)
7125 break;
7126
7127 if (idx == p2[1] || idx == p1[4])
7128 {
7129 p[-3] = (unsigned char) pop_failure_jump;
7130 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
7131 }
7132 }
7133 }
7134 #endif /* not WCHAR */
7135 }
7136 p -= OFFSET_ADDRESS_SIZE; /* Point at relative address again. */
7137 if ((re_opcode_t) p[-1] != pop_failure_jump)
7138 {
7139 p[-1] = (UCHAR_T) jump;
7140 DEBUG_PRINT1 (" Match => jump.\n");
7141 goto unconditional_jump;
7142 }
7143 /* Note fall through. */
7144
7145
7146 /* The end of a simple repeat has a pop_failure_jump back to
7147 its matching on_failure_jump, where the latter will push a
7148 failure point. The pop_failure_jump takes off failure
7149 points put on by this pop_failure_jump's matching
7150 on_failure_jump; we got through the pattern to here from the
7151 matching on_failure_jump, so didn't fail. */
7152 case pop_failure_jump:
7153 {
7154 /* We need to pass separate storage for the lowest and
7155 highest registers, even though we don't care about the
7156 actual values. Otherwise, we will restore only one
7157 register from the stack, since lowest will == highest in
7158 `pop_failure_point'. */
7159 active_reg_t dummy_low_reg, dummy_high_reg;
7160 UCHAR_T *pdummy ATTRIBUTE_UNUSED = NULL;
7161 const CHAR_T *sdummy ATTRIBUTE_UNUSED = NULL;
7162
7163 DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n");
7164 POP_FAILURE_POINT (sdummy, pdummy,
7165 dummy_low_reg, dummy_high_reg,
7166 reg_dummy, reg_dummy, reg_info_dummy);
7167 }
7168 /* Note fall through. */
7169
7170 unconditional_jump:
7171 #ifdef _LIBC
7172 DEBUG_PRINT2 ("\n%p: ", p);
7173 #else
7174 DEBUG_PRINT2 ("\n0x%x: ", p);
7175 #endif
7176 /* Note fall through. */
7177
7178 /* Unconditionally jump (without popping any failure points). */
7179 case jump:
7180 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
7181 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
7182 p += mcnt; /* Do the jump. */
7183 #ifdef _LIBC
7184 DEBUG_PRINT2 ("(to %p).\n", p);
7185 #else
7186 DEBUG_PRINT2 ("(to 0x%x).\n", p);
7187 #endif
7188 break;
7189
7190
7191 /* We need this opcode so we can detect where alternatives end
7192 in `group_match_null_string_p' et al. */
7193 case jump_past_alt:
7194 DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n");
7195 goto unconditional_jump;
7196
7197
7198 /* Normally, the on_failure_jump pushes a failure point, which
7199 then gets popped at pop_failure_jump. We will end up at
7200 pop_failure_jump, also, and with a pattern of, say, `a+', we
7201 are skipping over the on_failure_jump, so we have to push
7202 something meaningless for pop_failure_jump to pop. */
7203 case dummy_failure_jump:
7204 DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n");
7205 /* It doesn't matter what we push for the string here. What
7206 the code at `fail' tests is the value for the pattern. */
7207 PUSH_FAILURE_POINT (NULL, NULL, -2);
7208 goto unconditional_jump;
7209
7210
7211 /* At the end of an alternative, we need to push a dummy failure
7212 point in case we are followed by a `pop_failure_jump', because
7213 we don't want the failure point for the alternative to be
7214 popped. For example, matching `(a|ab)*' against `aab'
7215 requires that we match the `ab' alternative. */
7216 case push_dummy_failure:
7217 DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n");
7218 /* See comments just above at `dummy_failure_jump' about the
7219 two NULLs. */
7220 PUSH_FAILURE_POINT (NULL, NULL, -2);
7221 break;
7222
7223 /* Have to succeed matching what follows at least n times.
7224 After that, handle like `on_failure_jump'. */
7225 case succeed_n:
7226 EXTRACT_NUMBER (mcnt, p + OFFSET_ADDRESS_SIZE);
7227 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
7228
7229 assert (mcnt >= 0);
7230 /* Originally, this is how many times we HAVE to succeed. */
7231 if (mcnt > 0)
7232 {
7233 mcnt--;
7234 p += OFFSET_ADDRESS_SIZE;
7235 STORE_NUMBER_AND_INCR (p, mcnt);
7236 #ifdef _LIBC
7237 DEBUG_PRINT3 (" Setting %p to %d.\n", p - OFFSET_ADDRESS_SIZE
7238 , mcnt);
7239 #else
7240 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p - OFFSET_ADDRESS_SIZE
7241 , mcnt);
7242 #endif
7243 }
7244 else if (mcnt == 0)
7245 {
7246 #ifdef _LIBC
7247 DEBUG_PRINT2 (" Setting two bytes from %p to no_op.\n",
7248 p + OFFSET_ADDRESS_SIZE);
7249 #else
7250 DEBUG_PRINT2 (" Setting two bytes from 0x%x to no_op.\n",
7251 p + OFFSET_ADDRESS_SIZE);
7252 #endif /* _LIBC */
7253
7254 #ifdef WCHAR
7255 p[1] = (UCHAR_T) no_op;
7256 #else
7257 p[2] = (UCHAR_T) no_op;
7258 p[3] = (UCHAR_T) no_op;
7259 #endif /* WCHAR */
7260 goto on_failure;
7261 }
7262 break;
7263
7264 case jump_n:
7265 EXTRACT_NUMBER (mcnt, p + OFFSET_ADDRESS_SIZE);
7266 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
7267
7268 /* Originally, this is how many times we CAN jump. */
7269 if (mcnt)
7270 {
7271 mcnt--;
7272 STORE_NUMBER (p + OFFSET_ADDRESS_SIZE, mcnt);
7273
7274 #ifdef _LIBC
7275 DEBUG_PRINT3 (" Setting %p to %d.\n", p + OFFSET_ADDRESS_SIZE,
7276 mcnt);
7277 #else
7278 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p + OFFSET_ADDRESS_SIZE,
7279 mcnt);
7280 #endif /* _LIBC */
7281 goto unconditional_jump;
7282 }
7283 /* If we don't have to jump any more, skip over the rest of the command. */
7284 else
7285 p += 2 * OFFSET_ADDRESS_SIZE;
7286 break;
7287
7288 case set_number_at:
7289 {
7290 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
7291
7292 EXTRACT_NUMBER_AND_INCR (mcnt, p);
7293 p1 = p + mcnt;
7294 EXTRACT_NUMBER_AND_INCR (mcnt, p);
7295 #ifdef _LIBC
7296 DEBUG_PRINT3 (" Setting %p to %d.\n", p1, mcnt);
7297 #else
7298 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p1, mcnt);
7299 #endif
7300 STORE_NUMBER (p1, mcnt);
7301 break;
7302 }
7303
7304 #if 0
7305 /* The DEC Alpha C compiler 3.x generates incorrect code for the
7306 test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of
7307 AT_WORD_BOUNDARY, so this code is disabled. Expanding the
7308 macro and introducing temporary variables works around the bug. */
7309
7310 case wordbound:
7311 DEBUG_PRINT1 ("EXECUTING wordbound.\n");
7312 if (AT_WORD_BOUNDARY (d))
7313 break;
7314 goto fail;
7315
7316 case notwordbound:
7317 DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
7318 if (AT_WORD_BOUNDARY (d))
7319 goto fail;
7320 break;
7321 #else
7322 case wordbound:
7323 {
7324 boolean prevchar, thischar;
7325
7326 DEBUG_PRINT1 ("EXECUTING wordbound.\n");
7327 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
7328 break;
7329
7330 prevchar = WORDCHAR_P (d - 1);
7331 thischar = WORDCHAR_P (d);
7332 if (prevchar != thischar)
7333 break;
7334 goto fail;
7335 }
7336
7337 case notwordbound:
7338 {
7339 boolean prevchar, thischar;
7340
7341 DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
7342 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
7343 goto fail;
7344
7345 prevchar = WORDCHAR_P (d - 1);
7346 thischar = WORDCHAR_P (d);
7347 if (prevchar != thischar)
7348 goto fail;
7349 break;
7350 }
7351 #endif
7352
7353 case wordbeg:
7354 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
7355 if (!AT_STRINGS_END (d) && WORDCHAR_P (d)
7356 && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1)))
7357 break;
7358 goto fail;
7359
7360 case wordend:
7361 DEBUG_PRINT1 ("EXECUTING wordend.\n");
7362 if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1)
7363 && (AT_STRINGS_END (d) || !WORDCHAR_P (d)))
7364 break;
7365 goto fail;
7366
7367 #ifdef emacs
7368 case before_dot:
7369 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
7370 if (PTR_CHAR_POS ((unsigned char *) d) >= point)
7371 goto fail;
7372 break;
7373
7374 case at_dot:
7375 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
7376 if (PTR_CHAR_POS ((unsigned char *) d) != point)
7377 goto fail;
7378 break;
7379
7380 case after_dot:
7381 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
7382 if (PTR_CHAR_POS ((unsigned char *) d) <= point)
7383 goto fail;
7384 break;
7385
7386 case syntaxspec:
7387 DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt);
7388 mcnt = *p++;
7389 goto matchsyntax;
7390
7391 case wordchar:
7392 DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n");
7393 mcnt = (int) Sword;
7394 matchsyntax:
7395 PREFETCH ();
7396 /* Can't use *d++ here; SYNTAX may be an unsafe macro. */
7397 d++;
7398 if (SYNTAX (d[-1]) != (enum syntaxcode) mcnt)
7399 goto fail;
7400 SET_REGS_MATCHED ();
7401 break;
7402
7403 case notsyntaxspec:
7404 DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt);
7405 mcnt = *p++;
7406 goto matchnotsyntax;
7407
7408 case notwordchar:
7409 DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n");
7410 mcnt = (int) Sword;
7411 matchnotsyntax:
7412 PREFETCH ();
7413 /* Can't use *d++ here; SYNTAX may be an unsafe macro. */
7414 d++;
7415 if (SYNTAX (d[-1]) == (enum syntaxcode) mcnt)
7416 goto fail;
7417 SET_REGS_MATCHED ();
7418 break;
7419
7420 #else /* not emacs */
7421 case wordchar:
7422 DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n");
7423 PREFETCH ();
7424 if (!WORDCHAR_P (d))
7425 goto fail;
7426 SET_REGS_MATCHED ();
7427 d++;
7428 break;
7429
7430 case notwordchar:
7431 DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n");
7432 PREFETCH ();
7433 if (WORDCHAR_P (d))
7434 goto fail;
7435 SET_REGS_MATCHED ();
7436 d++;
7437 break;
7438 #endif /* not emacs */
7439
7440 default:
7441 abort ();
7442 }
7443 continue; /* Successfully executed one pattern command; keep going. */
7444
7445
7446 /* We goto here if a matching operation fails. */
7447 fail:
7448 if (!FAIL_STACK_EMPTY ())
7449 { /* A restart point is known. Restore to that state. */
7450 DEBUG_PRINT1 ("\nFAIL:\n");
7451 POP_FAILURE_POINT (d, p,
7452 lowest_active_reg, highest_active_reg,
7453 regstart, regend, reg_info);
7454
7455 /* If this failure point is a dummy, try the next one. */
7456 if (!p)
7457 goto fail;
7458
7459 /* If we failed to the end of the pattern, don't examine *p. */
7460 assert (p <= pend);
7461 if (p < pend)
7462 {
7463 boolean is_a_jump_n = false;
7464
7465 /* If failed to a backwards jump that's part of a repetition
7466 loop, need to pop this failure point and use the next one. */
7467 switch ((re_opcode_t) *p)
7468 {
7469 case jump_n:
7470 is_a_jump_n = true;
7471 case maybe_pop_jump:
7472 case pop_failure_jump:
7473 case jump:
7474 p1 = p + 1;
7475 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7476 p1 += mcnt;
7477
7478 if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n)
7479 || (!is_a_jump_n
7480 && (re_opcode_t) *p1 == on_failure_jump))
7481 goto fail;
7482 break;
7483 default:
7484 /* do nothing */ ;
7485 }
7486 }
7487
7488 if (d >= string1 && d <= end1)
7489 dend = end_match_1;
7490 }
7491 else
7492 break; /* Matching at this starting point really fails. */
7493 } /* for (;;) */
7494
7495 if (best_regs_set)
7496 goto restore_best_regs;
7497
7498 FREE_VARIABLES ();
7499
7500 return -1; /* Failure to match. */
7501 } /* re_match_2 */
7502
7503 /* Subroutine definitions for re_match_2. */
7505
7506
7507 /* We are passed P pointing to a register number after a start_memory.
7508
7509 Return true if the pattern up to the corresponding stop_memory can
7510 match the empty string, and false otherwise.
7511
7512 If we find the matching stop_memory, sets P to point to one past its number.
7513 Otherwise, sets P to an undefined byte less than or equal to END.
7514
7515 We don't handle duplicates properly (yet). */
7516
7517 static boolean
7518 PREFIX(group_match_null_string_p) (UCHAR_T **p, UCHAR_T *end,
7519 PREFIX(register_info_type) *reg_info)
7520 {
7521 int mcnt;
7522 /* Point to after the args to the start_memory. */
7523 UCHAR_T *p1 = *p + 2;
7524
7525 while (p1 < end)
7526 {
7527 /* Skip over opcodes that can match nothing, and return true or
7528 false, as appropriate, when we get to one that can't, or to the
7529 matching stop_memory. */
7530
7531 switch ((re_opcode_t) *p1)
7532 {
7533 /* Could be either a loop or a series of alternatives. */
7534 case on_failure_jump:
7535 p1++;
7536 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7537
7538 /* If the next operation is not a jump backwards in the
7539 pattern. */
7540
7541 if (mcnt >= 0)
7542 {
7543 /* Go through the on_failure_jumps of the alternatives,
7544 seeing if any of the alternatives cannot match nothing.
7545 The last alternative starts with only a jump,
7546 whereas the rest start with on_failure_jump and end
7547 with a jump, e.g., here is the pattern for `a|b|c':
7548
7549 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6
7550 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3
7551 /exactn/1/c
7552
7553 So, we have to first go through the first (n-1)
7554 alternatives and then deal with the last one separately. */
7555
7556
7557 /* Deal with the first (n-1) alternatives, which start
7558 with an on_failure_jump (see above) that jumps to right
7559 past a jump_past_alt. */
7560
7561 while ((re_opcode_t) p1[mcnt-(1+OFFSET_ADDRESS_SIZE)] ==
7562 jump_past_alt)
7563 {
7564 /* `mcnt' holds how many bytes long the alternative
7565 is, including the ending `jump_past_alt' and
7566 its number. */
7567
7568 if (!PREFIX(alt_match_null_string_p) (p1, p1 + mcnt -
7569 (1 + OFFSET_ADDRESS_SIZE),
7570 reg_info))
7571 return false;
7572
7573 /* Move to right after this alternative, including the
7574 jump_past_alt. */
7575 p1 += mcnt;
7576
7577 /* Break if it's the beginning of an n-th alternative
7578 that doesn't begin with an on_failure_jump. */
7579 if ((re_opcode_t) *p1 != on_failure_jump)
7580 break;
7581
7582 /* Still have to check that it's not an n-th
7583 alternative that starts with an on_failure_jump. */
7584 p1++;
7585 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7586 if ((re_opcode_t) p1[mcnt-(1+OFFSET_ADDRESS_SIZE)] !=
7587 jump_past_alt)
7588 {
7589 /* Get to the beginning of the n-th alternative. */
7590 p1 -= 1 + OFFSET_ADDRESS_SIZE;
7591 break;
7592 }
7593 }
7594
7595 /* Deal with the last alternative: go back and get number
7596 of the `jump_past_alt' just before it. `mcnt' contains
7597 the length of the alternative. */
7598 EXTRACT_NUMBER (mcnt, p1 - OFFSET_ADDRESS_SIZE);
7599
7600 if (!PREFIX(alt_match_null_string_p) (p1, p1 + mcnt, reg_info))
7601 return false;
7602
7603 p1 += mcnt; /* Get past the n-th alternative. */
7604 } /* if mcnt > 0 */
7605 break;
7606
7607
7608 case stop_memory:
7609 assert (p1[1] == **p);
7610 *p = p1 + 2;
7611 return true;
7612
7613
7614 default:
7615 if (!PREFIX(common_op_match_null_string_p) (&p1, end, reg_info))
7616 return false;
7617 }
7618 } /* while p1 < end */
7619
7620 return false;
7621 } /* group_match_null_string_p */
7622
7623
7624 /* Similar to group_match_null_string_p, but doesn't deal with alternatives:
7625 It expects P to be the first byte of a single alternative and END one
7626 byte past the last. The alternative can contain groups. */
7627
7628 static boolean
7629 PREFIX(alt_match_null_string_p) (UCHAR_T *p, UCHAR_T *end,
7630 PREFIX(register_info_type) *reg_info)
7631 {
7632 int mcnt;
7633 UCHAR_T *p1 = p;
7634
7635 while (p1 < end)
7636 {
7637 /* Skip over opcodes that can match nothing, and break when we get
7638 to one that can't. */
7639
7640 switch ((re_opcode_t) *p1)
7641 {
7642 /* It's a loop. */
7643 case on_failure_jump:
7644 p1++;
7645 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7646 p1 += mcnt;
7647 break;
7648
7649 default:
7650 if (!PREFIX(common_op_match_null_string_p) (&p1, end, reg_info))
7651 return false;
7652 }
7653 } /* while p1 < end */
7654
7655 return true;
7656 } /* alt_match_null_string_p */
7657
7658
7659 /* Deals with the ops common to group_match_null_string_p and
7660 alt_match_null_string_p.
7661
7662 Sets P to one after the op and its arguments, if any. */
7663
7664 static boolean
7665 PREFIX(common_op_match_null_string_p) (UCHAR_T **p, UCHAR_T *end,
7666 PREFIX(register_info_type) *reg_info)
7667 {
7668 int mcnt;
7669 boolean ret;
7670 int reg_no;
7671 UCHAR_T *p1 = *p;
7672
7673 switch ((re_opcode_t) *p1++)
7674 {
7675 case no_op:
7676 case begline:
7677 case endline:
7678 case begbuf:
7679 case endbuf:
7680 case wordbeg:
7681 case wordend:
7682 case wordbound:
7683 case notwordbound:
7684 #ifdef emacs
7685 case before_dot:
7686 case at_dot:
7687 case after_dot:
7688 #endif
7689 break;
7690
7691 case start_memory:
7692 reg_no = *p1;
7693 assert (reg_no > 0 && reg_no <= MAX_REGNUM);
7694 ret = PREFIX(group_match_null_string_p) (&p1, end, reg_info);
7695
7696 /* Have to set this here in case we're checking a group which
7697 contains a group and a back reference to it. */
7698
7699 if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE)
7700 REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret;
7701
7702 if (!ret)
7703 return false;
7704 break;
7705
7706 /* If this is an optimized succeed_n for zero times, make the jump. */
7707 case jump:
7708 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7709 if (mcnt >= 0)
7710 p1 += mcnt;
7711 else
7712 return false;
7713 break;
7714
7715 case succeed_n:
7716 /* Get to the number of times to succeed. */
7717 p1 += OFFSET_ADDRESS_SIZE;
7718 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7719
7720 if (mcnt == 0)
7721 {
7722 p1 -= 2 * OFFSET_ADDRESS_SIZE;
7723 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7724 p1 += mcnt;
7725 }
7726 else
7727 return false;
7728 break;
7729
7730 case duplicate:
7731 if (!REG_MATCH_NULL_STRING_P (reg_info[*p1]))
7732 return false;
7733 break;
7734
7735 case set_number_at:
7736 p1 += 2 * OFFSET_ADDRESS_SIZE;
7737
7738 default:
7739 /* All other opcodes mean we cannot match the empty string. */
7740 return false;
7741 }
7742
7743 *p = p1;
7744 return true;
7745 } /* common_op_match_null_string_p */
7746
7747
7748 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
7749 bytes; nonzero otherwise. */
7750
7751 static int
7752 PREFIX(bcmp_translate) (const CHAR_T *s1, const CHAR_T *s2, register int len,
7753 RE_TRANSLATE_TYPE translate)
7754 {
7755 register const UCHAR_T *p1 = (const UCHAR_T *) s1;
7756 register const UCHAR_T *p2 = (const UCHAR_T *) s2;
7757 while (len)
7758 {
7759 #ifdef WCHAR
7760 if (((*p1<=0xff)?translate[*p1++]:*p1++)
7761 != ((*p2<=0xff)?translate[*p2++]:*p2++))
7762 return 1;
7763 #else /* BYTE */
7764 if (translate[*p1++] != translate[*p2++]) return 1;
7765 #endif /* WCHAR */
7766 len--;
7767 }
7768 return 0;
7769 }
7770
7771
7773 #else /* not INSIDE_RECURSION */
7774
7775 /* Entry points for GNU code. */
7776
7777 /* re_compile_pattern is the GNU regular expression compiler: it
7778 compiles PATTERN (of length SIZE) and puts the result in BUFP.
7779 Returns 0 if the pattern was valid, otherwise an error string.
7780
7781 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
7782 are set in BUFP on entry.
7783
7784 We call regex_compile to do the actual compilation. */
7785
7786 const char *
7787 re_compile_pattern (const char *pattern, size_t length,
7788 struct re_pattern_buffer *bufp)
7789 {
7790 reg_errcode_t ret;
7791
7792 /* GNU code is written to assume at least RE_NREGS registers will be set
7793 (and at least one extra will be -1). */
7794 bufp->regs_allocated = REGS_UNALLOCATED;
7795
7796 /* And GNU code determines whether or not to get register information
7797 by passing null for the REGS argument to re_match, etc., not by
7798 setting no_sub. */
7799 bufp->no_sub = 0;
7800
7801 /* Match anchors at newline. */
7802 bufp->newline_anchor = 1;
7803
7804 # ifdef MBS_SUPPORT
7805 if (MB_CUR_MAX != 1)
7806 ret = wcs_regex_compile (pattern, length, re_syntax_options, bufp);
7807 else
7808 # endif
7809 ret = byte_regex_compile (pattern, length, re_syntax_options, bufp);
7810
7811 if (!ret)
7812 return NULL;
7813 return gettext (re_error_msgid[(int) ret]);
7814 }
7815 #ifdef _LIBC
7816 weak_alias (__re_compile_pattern, re_compile_pattern)
7817 #endif
7818
7819 /* Entry points compatible with 4.2 BSD regex library. We don't define
7821 them unless specifically requested. */
7822
7823 #if defined _REGEX_RE_COMP || defined _LIBC
7824
7825 /* BSD has one and only one pattern buffer. */
7826 static struct re_pattern_buffer re_comp_buf;
7827
7828 char *
7829 #ifdef _LIBC
7830 /* Make these definitions weak in libc, so POSIX programs can redefine
7831 these names if they don't use our functions, and still use
7832 regcomp/regexec below without link errors. */
7833 weak_function
7834 #endif
7835 re_comp (const char *s)
7836 {
7837 reg_errcode_t ret;
7838
7839 if (!s)
7840 {
7841 if (!re_comp_buf.buffer)
7842 return (char *) gettext ("No previous regular expression");
7843 return 0;
7844 }
7845
7846 if (!re_comp_buf.buffer)
7847 {
7848 re_comp_buf.buffer = (unsigned char *) malloc (200);
7849 if (re_comp_buf.buffer == NULL)
7850 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
7851 re_comp_buf.allocated = 200;
7852
7853 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
7854 if (re_comp_buf.fastmap == NULL)
7855 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
7856 }
7857
7858 /* Since `re_exec' always passes NULL for the `regs' argument, we
7859 don't need to initialize the pattern buffer fields which affect it. */
7860
7861 /* Match anchors at newlines. */
7862 re_comp_buf.newline_anchor = 1;
7863
7864 # ifdef MBS_SUPPORT
7865 if (MB_CUR_MAX != 1)
7866 ret = wcs_regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
7867 else
7868 # endif
7869 ret = byte_regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
7870
7871 if (!ret)
7872 return NULL;
7873
7874 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
7875 return (char *) gettext (re_error_msgid[(int) ret]);
7876 }
7877
7878
7879 int
7880 #ifdef _LIBC
7881 weak_function
7882 #endif
7883 re_exec (const char *s)
7884 {
7885 const int len = strlen (s);
7886 return
7887 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
7888 }
7889
7890 #endif /* _REGEX_RE_COMP */
7891
7892 /* POSIX.2 functions. Don't define these for Emacs. */
7894
7895 #ifndef emacs
7896
7897 /* regcomp takes a regular expression as a string and compiles it.
7898
7899 PREG is a regex_t *. We do not expect any fields to be initialized,
7900 since POSIX says we shouldn't. Thus, we set
7901
7902 `buffer' to the compiled pattern;
7903 `used' to the length of the compiled pattern;
7904 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
7905 REG_EXTENDED bit in CFLAGS is set; otherwise, to
7906 RE_SYNTAX_POSIX_BASIC;
7907 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
7908 `fastmap' to an allocated space for the fastmap;
7909 `fastmap_accurate' to zero;
7910 `re_nsub' to the number of subexpressions in PATTERN.
7911
7912 PATTERN is the address of the pattern string.
7913
7914 CFLAGS is a series of bits which affect compilation.
7915
7916 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
7917 use POSIX basic syntax.
7918
7919 If REG_NEWLINE is set, then . and [^...] don't match newline.
7920 Also, regexec will try a match beginning after every newline.
7921
7922 If REG_ICASE is set, then we considers upper- and lowercase
7923 versions of letters to be equivalent when matching.
7924
7925 If REG_NOSUB is set, then when PREG is passed to regexec, that
7926 routine will report only success or failure, and nothing about the
7927 registers.
7928
7929 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
7930 the return codes and their meanings.) */
7931
7932 int
7933 regcomp (regex_t *preg, const char *pattern, int cflags)
7934 {
7935 reg_errcode_t ret;
7936 reg_syntax_t syntax
7937 = (cflags & REG_EXTENDED) ?
7938 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
7939
7940 /* regex_compile will allocate the space for the compiled pattern. */
7941 preg->buffer = 0;
7942 preg->allocated = 0;
7943 preg->used = 0;
7944
7945 /* Try to allocate space for the fastmap. */
7946 preg->fastmap = (char *) malloc (1 << BYTEWIDTH);
7947
7948 if (cflags & REG_ICASE)
7949 {
7950 int i;
7951
7952 preg->translate
7953 = (RE_TRANSLATE_TYPE) malloc (CHAR_SET_SIZE
7954 * sizeof (*(RE_TRANSLATE_TYPE)0));
7955 if (preg->translate == NULL)
7956 return (int) REG_ESPACE;
7957
7958 /* Map uppercase characters to corresponding lowercase ones. */
7959 for (i = 0; i < CHAR_SET_SIZE; i++)
7960 preg->translate[i] = ISUPPER (i) ? TOLOWER (i) : i;
7961 }
7962 else
7963 preg->translate = NULL;
7964
7965 /* If REG_NEWLINE is set, newlines are treated differently. */
7966 if (cflags & REG_NEWLINE)
7967 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
7968 syntax &= ~RE_DOT_NEWLINE;
7969 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
7970 /* It also changes the matching behavior. */
7971 preg->newline_anchor = 1;
7972 }
7973 else
7974 preg->newline_anchor = 0;
7975
7976 preg->no_sub = !!(cflags & REG_NOSUB);
7977
7978 /* POSIX says a null character in the pattern terminates it, so we
7979 can use strlen here in compiling the pattern. */
7980 # ifdef MBS_SUPPORT
7981 if (MB_CUR_MAX != 1)
7982 ret = wcs_regex_compile (pattern, strlen (pattern), syntax, preg);
7983 else
7984 # endif
7985 ret = byte_regex_compile (pattern, strlen (pattern), syntax, preg);
7986
7987 /* POSIX doesn't distinguish between an unmatched open-group and an
7988 unmatched close-group: both are REG_EPAREN. */
7989 if (ret == REG_ERPAREN) ret = REG_EPAREN;
7990
7991 if (ret == REG_NOERROR && preg->fastmap)
7992 {
7993 /* Compute the fastmap now, since regexec cannot modify the pattern
7994 buffer. */
7995 if (re_compile_fastmap (preg) == -2)
7996 {
7997 /* Some error occurred while computing the fastmap, just forget
7998 about it. */
7999 free (preg->fastmap);
8000 preg->fastmap = NULL;
8001 }
8002 }
8003
8004 return (int) ret;
8005 }
8006 #ifdef _LIBC
8007 weak_alias (__regcomp, regcomp)
8008 #endif
8009
8010
8011 /* regexec searches for a given pattern, specified by PREG, in the
8012 string STRING.
8013
8014 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
8015 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
8016 least NMATCH elements, and we set them to the offsets of the
8017 corresponding matched substrings.
8018
8019 EFLAGS specifies `execution flags' which affect matching: if
8020 REG_NOTBOL is set, then ^ does not match at the beginning of the
8021 string; if REG_NOTEOL is set, then $ does not match at the end.
8022
8023 We return 0 if we find a match and REG_NOMATCH if not. */
8024
8025 int
8026 regexec (const regex_t *preg, const char *string, size_t nmatch,
8027 regmatch_t pmatch[], int eflags)
8028 {
8029 int ret;
8030 struct re_registers regs;
8031 regex_t private_preg;
8032 int len = strlen (string);
8033 boolean want_reg_info = !preg->no_sub && nmatch > 0;
8034
8035 private_preg = *preg;
8036
8037 private_preg.not_bol = !!(eflags & REG_NOTBOL);
8038 private_preg.not_eol = !!(eflags & REG_NOTEOL);
8039
8040 /* The user has told us exactly how many registers to return
8041 information about, via `nmatch'. We have to pass that on to the
8042 matching routines. */
8043 private_preg.regs_allocated = REGS_FIXED;
8044
8045 if (want_reg_info)
8046 {
8047 regs.num_regs = nmatch;
8048 regs.start = TALLOC (nmatch * 2, regoff_t);
8049 if (regs.start == NULL)
8050 return (int) REG_NOMATCH;
8051 regs.end = regs.start + nmatch;
8052 }
8053
8054 /* Perform the searching operation. */
8055 ret = re_search (&private_preg, string, len,
8056 /* start: */ 0, /* range: */ len,
8057 want_reg_info ? ®s : (struct re_registers *) 0);
8058
8059 /* Copy the register information to the POSIX structure. */
8060 if (want_reg_info)
8061 {
8062 if (ret >= 0)
8063 {
8064 unsigned r;
8065
8066 for (r = 0; r < nmatch; r++)
8067 {
8068 pmatch[r].rm_so = regs.start[r];
8069 pmatch[r].rm_eo = regs.end[r];
8070 }
8071 }
8072
8073 /* If we needed the temporary register info, free the space now. */
8074 free (regs.start);
8075 }
8076
8077 /* We want zero return to mean success, unlike `re_search'. */
8078 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
8079 }
8080 #ifdef _LIBC
8081 weak_alias (__regexec, regexec)
8082 #endif
8083
8084
8085 /* Returns a message corresponding to an error code, ERRCODE, returned
8086 from either regcomp or regexec. We don't use PREG here. */
8087
8088 size_t
8089 regerror (int errcode, const regex_t *preg ATTRIBUTE_UNUSED,
8090 char *errbuf, size_t errbuf_size)
8091 {
8092 const char *msg;
8093 size_t msg_size;
8094
8095 if (errcode < 0
8096 || errcode >= (int) (sizeof (re_error_msgid)
8097 / sizeof (re_error_msgid[0])))
8098 /* Only error codes returned by the rest of the code should be passed
8099 to this routine. If we are given anything else, or if other regex
8100 code generates an invalid error code, then the program has a bug.
8101 Dump core so we can fix it. */
8102 abort ();
8103
8104 msg = gettext (re_error_msgid[errcode]);
8105
8106 msg_size = strlen (msg) + 1; /* Includes the null. */
8107
8108 if (errbuf_size != 0)
8109 {
8110 if (msg_size > errbuf_size)
8111 {
8112 #if defined HAVE_MEMPCPY || defined _LIBC
8113 *((char *) mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
8114 #else
8115 memcpy (errbuf, msg, errbuf_size - 1);
8116 errbuf[errbuf_size - 1] = 0;
8117 #endif
8118 }
8119 else
8120 memcpy (errbuf, msg, msg_size);
8121 }
8122
8123 return msg_size;
8124 }
8125 #ifdef _LIBC
8126 weak_alias (__regerror, regerror)
8127 #endif
8128
8129
8130 /* Free dynamically allocated space used by PREG. */
8131
8132 void
8133 regfree (regex_t *preg)
8134 {
8135 if (preg->buffer != NULL)
8136 free (preg->buffer);
8137 preg->buffer = NULL;
8138
8139 preg->allocated = 0;
8140 preg->used = 0;
8141
8142 if (preg->fastmap != NULL)
8143 free (preg->fastmap);
8144 preg->fastmap = NULL;
8145 preg->fastmap_accurate = 0;
8146
8147 if (preg->translate != NULL)
8148 free (preg->translate);
8149 preg->translate = NULL;
8150 }
8151 #ifdef _LIBC
8152 weak_alias (__regfree, regfree)
8153 #endif
8154
8155 #endif /* not emacs */
8156
8157 #endif /* not INSIDE_RECURSION */
8158
8159
/* Cleanup: remove the definitions of the helper macros named below.
   Presumably this lets the body of the file be processed again with a
   different BYTE/WCHAR configuration (see the INSIDE_RECURSION guards)
   without redefinition clashes -- TODO confirm against the part of the
   file that performs the re-inclusion.  */

/* Number encoding helpers for the compiled pattern.  */
8160 #undef STORE_NUMBER
8162 #undef STORE_NUMBER_AND_INCR
8163 #undef EXTRACT_NUMBER
8164 #undef EXTRACT_NUMBER_AND_INCR
8165
/* Debug printing helpers.  */
8166 #undef DEBUG_PRINT_COMPILED_PATTERN
8167 #undef DEBUG_PRINT_DOUBLE_STRING
8168
/* Failure stack helpers.  */
8169 #undef INIT_FAIL_STACK
8170 #undef RESET_FAIL_STACK
8171 #undef DOUBLE_FAIL_STACK
8172 #undef PUSH_PATTERN_OP
8173 #undef PUSH_FAILURE_POINTER
8174 #undef PUSH_FAILURE_INT
8175 #undef PUSH_FAILURE_ELT
8176 #undef POP_FAILURE_POINTER
8177 #undef POP_FAILURE_INT
8178 #undef POP_FAILURE_ELT
8179 #undef DEBUG_PUSH
8180 #undef DEBUG_POP
8181 #undef PUSH_FAILURE_POINT
8182 #undef POP_FAILURE_POINT
8183
/* Register bookkeeping.  */
8184 #undef REG_UNSET_VALUE
8185 #undef REG_UNSET
8186
/* Pattern fetching and translation.  */
8187 #undef PATFETCH
8188 #undef PATFETCH_RAW
8189 #undef PATUNFETCH
8190 #undef TRANSLATE
8191
/* Compile-time buffer management and jump emission.  */
8192 #undef INIT_BUF_SIZE
8193 #undef GET_BUFFER_SPACE
8194 #undef BUF_PUSH
8195 #undef BUF_PUSH_2
8196 #undef BUF_PUSH_3
8197 #undef STORE_JUMP
8198 #undef STORE_JUMP2
8199 #undef INSERT_JUMP
8200 #undef INSERT_JUMP2
8201 #undef EXTEND_BUFFER
8202 #undef GET_UNSIGNED_NUMBER
8203 #undef FREE_STACK_RETURN
8204
/* Matcher helpers.
   NOTE(review): `MATCHING_IN_FRST_STRING' below looks like a misspelling
   of MATCHING_IN_FIRST_STRING.  If the macro is defined under the FIRST
   spelling, this #undef has no effect and that macro stays defined --
   confirm against its definition.  */
8205 # undef POINTER_TO_OFFSET
8206 # undef MATCHING_IN_FRST_STRING
8207 # undef PREFETCH
8208 # undef AT_STRINGS_BEG
8209 # undef AT_STRINGS_END
8210 # undef WORDCHAR_P
8211 # undef FREE_VAR
8212 # undef FREE_VARIABLES
8213 # undef NO_HIGHEST_ACTIVE_REG
8214 # undef NO_LOWEST_ACTIVE_REG
8215
/* Character-width configuration macros used throughout the code above
   (CHAR_T, UCHAR_T, PREFIX, OFFSET_ADDRESS_SIZE, ...).  */
8216 # undef CHAR_T
8217 # undef UCHAR_T
8218 # undef COMPILED_BUFFER_VAR
8219 # undef OFFSET_ADDRESS_SIZE
8220 # undef CHAR_CLASS_SIZE
8221 # undef PREFIX
8222 # undef ARG_PREFIX
8223 # undef PUT_CHAR
8224 # undef BYTE
8225 # undef WCHAR
8226
/* Marker macro; its consumers are outside this section.  */
8227 # define DEFINED_ONCE
8228