regex.c revision 1.1 1 /* $NetBSD: regex.c,v 1.1 2016/01/13 03:15:30 christos Exp $ */
2
3 /* Extended regular expression matching and search library,
4 version 0.12.
5 (Implements POSIX draft P1003.2/D11.2, except for some of the
6 internationalization features.)
7 Copyright (C) 1993-1999, 2000, 2001 Free Software Foundation, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation; either version 2, or (at your option)
12 any later version.
13
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software Foundation,
21 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
22
23 /* AIX requires this to be the first thing in the file. */
24 #if defined _AIX && !defined REGEX_MALLOC
25 #pragma alloca
26 #endif
27
28 #undef _GNU_SOURCE
29 #define _GNU_SOURCE
30
31 #ifdef HAVE_CONFIG_H
32 # include <config.h>
33 #endif
34
35 #ifndef PARAMS
36 # if defined __GNUC__ || (defined __STDC__ && __STDC__)
37 # define PARAMS(args) args
38 # else
39 # define PARAMS(args) ()
40 # endif /* GCC. */
41 #endif /* Not PARAMS. */
42
43 #ifndef INSIDE_RECURSION
44
45 # if defined STDC_HEADERS && !defined emacs
46 # include <stddef.h>
47 # else
48 /* We need this for `regex.h', and perhaps for the Emacs include files. */
49 # include <sys/types.h>
50 # endif
51
52 # define WIDE_CHAR_SUPPORT (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC)
53
/* For platforms that support the ISO C Amendment 1 functionality we
   support user-defined character classes.  */
56 # if defined _LIBC || WIDE_CHAR_SUPPORT
57 /* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */
58 # include <wchar.h>
59 # include <wctype.h>
60 # endif
61
62 # ifdef _LIBC
63 /* We have to keep the namespace clean. */
64 # define regfree(preg) __regfree (preg)
65 # define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef)
66 # define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags)
67 # define regerror(errcode, preg, errbuf, errbuf_size) \
68 __regerror(errcode, preg, errbuf, errbuf_size)
69 # define re_set_registers(bu, re, nu, st, en) \
70 __re_set_registers (bu, re, nu, st, en)
71 # define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \
72 __re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
73 # define re_match(bufp, string, size, pos, regs) \
74 __re_match (bufp, string, size, pos, regs)
75 # define re_search(bufp, string, size, startpos, range, regs) \
76 __re_search (bufp, string, size, startpos, range, regs)
77 # define re_compile_pattern(pattern, length, bufp) \
78 __re_compile_pattern (pattern, length, bufp)
79 # define re_set_syntax(syntax) __re_set_syntax (syntax)
80 # define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \
81 __re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
82 # define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)
83
84 # define btowc __btowc
85 # define iswctype __iswctype
86 # define mbrtowc __mbrtowc
87 # define wcslen __wcslen
88 # define wcscoll __wcscoll
89 # define wcrtomb __wcrtomb
90
91 /* We are also using some library internals. */
92 # include <locale/localeinfo.h>
93 # include <locale/elem-hash.h>
94 # include <langinfo.h>
95 # include <locale/coll-lookup.h>
96 # endif
97
98 /* This is for other GNU distributions with internationalized messages. */
99 # if (HAVE_LIBINTL_H && ENABLE_NLS) || defined _LIBC
100 # include <libintl.h>
101 # ifdef _LIBC
102 # undef gettext
103 # define gettext(msgid) __dcgettext ("libc", msgid, LC_MESSAGES)
104 # endif
105 # else
106 # define gettext(msgid) (msgid)
107 # endif
108
109 # ifndef gettext_noop
110 /* This define is so xgettext can find the internationalizable
111 strings. */
112 # define gettext_noop(String) String
113 # endif
114
115 /* Support for bounded pointers. */
116 # if !defined _LIBC && !defined __BOUNDED_POINTERS__
117 # define __bounded /* nothing */
118 # define __unbounded /* nothing */
119 # define __ptrvalue /* nothing */
120 # endif
121
122 /* The `emacs' switch turns on certain matching commands
123 that make sense only in Emacs. */
124 # ifdef emacs
125
126 # include "lisp.h"
127 # include "buffer.h"
128 # include "syntax.h"
129
130 # else /* not emacs */
131
132 /* If we are not linking with Emacs proper,
133 we can't use the relocating allocator
134 even if config.h says that we can. */
135 # undef REL_ALLOC
136
137 # if defined STDC_HEADERS || defined _LIBC
138 # include <stdlib.h>
139 # else
140 char *malloc ();
141 char *realloc ();
142 # endif
143
144 /* When used in Emacs's lib-src, we need to get bzero and bcopy somehow.
145 If nothing else has been done, use the method below. */
146 # ifdef INHIBIT_STRING_HEADER
147 # if !(defined HAVE_BZERO && defined HAVE_BCOPY)
148 # if !defined bzero && !defined bcopy
149 # undef INHIBIT_STRING_HEADER
150 # endif
151 # endif
152 # endif
153
154 /* This is the normal way of making sure we have a bcopy and a bzero.
155 This is used in most programs--a few other programs avoid this
156 by defining INHIBIT_STRING_HEADER. */
157 # ifndef INHIBIT_STRING_HEADER
158 # if defined HAVE_STRING_H || defined STDC_HEADERS || defined _LIBC
159 # include <string.h>
160 # ifndef bzero
161 # ifndef _LIBC
162 # define bzero(s, n) (memset (s, '\0', n), (s))
163 # else
164 # define bzero(s, n) __bzero (s, n)
165 # endif
166 # endif
167 # else
168 # include <strings.h>
169 # ifndef memcmp
170 # define memcmp(s1, s2, n) bcmp (s1, s2, n)
171 # endif
172 # ifndef memcpy
173 # define memcpy(d, s, n) (bcopy (s, d, n), (d))
174 # endif
175 # endif
176 # endif
177
178 /* Define the syntax stuff for \<, \>, etc. */
179
180 /* This must be nonzero for the wordchar and notwordchar pattern
181 commands in re_match_2. */
182 # ifndef Sword
183 # define Sword 1
184 # endif
185
186 # ifdef SWITCH_ENUM_BUG
187 # define SWITCH_ENUM_CAST(x) ((int)(x))
188 # else
189 # define SWITCH_ENUM_CAST(x) (x)
190 # endif
191
192 # endif /* not emacs */
193
194 # if defined _LIBC || HAVE_LIMITS_H
195 # include <limits.h>
196 # endif
197
198 # ifndef MB_LEN_MAX
199 # define MB_LEN_MAX 1
200 # endif
201
202 /* Get the interface, including the syntax bits. */
204 # include <regex.h>
205
206 /* isalpha etc. are used for the character classes. */
207 # include <ctype.h>
208
/* Jim Meyering writes:

   "... Some ctype macros are valid only for character codes that
   isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
   using /bin/cc or gcc but without giving an ansi option).  So, all
   ctype uses should be through macros like ISPRINT...  If
   STDC_HEADERS is defined, then autoconf has verified that the ctype
   macros don't need to be guarded with references to isascii. ...
   Defining isascii to 1 should let any compiler worth its salt
   eliminate the && through constant folding."
   Solaris defines some of these symbols so we must undefine them first.  */

/* IN_CTYPE_DOMAIN (c) is the guard placed in front of every <ctype.h>
   call below.  It is the constant 1 when STDC_HEADERS is defined or when
   no isascii is known; otherwise it defers to isascii.  */
# if defined STDC_HEADERS || (!defined isascii && !defined HAVE_ISASCII)
#  define IN_CTYPE_DOMAIN(c) 1
# else
#  define IN_CTYPE_DOMAIN(c) isascii(c)
# endif

/* isblank and isgraph are used only when they exist as macros; otherwise
   ISBLANK falls back to an explicit space/tab comparison and ISGRAPH to
   "printable and not white space".  */
# ifdef isblank
#  define ISBLANK(c) (IN_CTYPE_DOMAIN (c) && isblank (c))
# else
#  define ISBLANK(c) ((c) == ' ' || (c) == '\t')
# endif
# ifdef isgraph
#  define ISGRAPH(c) (IN_CTYPE_DOMAIN (c) && isgraph (c))
# else
#  define ISGRAPH(c) (IN_CTYPE_DOMAIN (c) && isprint (c) && !isspace (c))
# endif

/* Guarded wrappers around the remaining <ctype.h> predicates.  Any earlier
   definition of ISPRINT is discarded first.  Note that each wrapper
   evaluates its argument more than once.  */
# undef ISPRINT
# define ISPRINT(c) (IN_CTYPE_DOMAIN (c) && isprint (c))
# define ISDIGIT(c) (IN_CTYPE_DOMAIN (c) && isdigit (c))
# define ISALNUM(c) (IN_CTYPE_DOMAIN (c) && isalnum (c))
# define ISALPHA(c) (IN_CTYPE_DOMAIN (c) && isalpha (c))
# define ISCNTRL(c) (IN_CTYPE_DOMAIN (c) && iscntrl (c))
# define ISLOWER(c) (IN_CTYPE_DOMAIN (c) && islower (c))
# define ISPUNCT(c) (IN_CTYPE_DOMAIN (c) && ispunct (c))
# define ISSPACE(c) (IN_CTYPE_DOMAIN (c) && isspace (c))
# define ISUPPER(c) (IN_CTYPE_DOMAIN (c) && isupper (c))
# define ISXDIGIT(c) (IN_CTYPE_DOMAIN (c) && isxdigit (c))

/* Lower-casing helper: uses _tolower when that exists as a macro and
   tolower otherwise.
   NOTE(review): _tolower is presumably only meaningful for upper-case
   input on some systems -- confirm against the callers.  */
# ifdef _tolower
#  define TOLOWER(c) _tolower(c)
# else
#  define TOLOWER(c) tolower(c)
# endif
255
/* Fallback null pointer constant for environments whose headers did not
   supply one.  The replacement list is fully parenthesized so that it acts
   as a single operand wherever it is substituted.  */
# ifndef NULL
#  define NULL ((void *) 0)
# endif

/* We remove any previous definition of `SIGN_EXTEND_CHAR',
   since ours (we hope) works properly with all combinations of
   machines, compilers, `char' and `unsigned char' argument types.
   (Per Bothner suggested the basic approach.)

   SIGN_EXTEND_CHAR (c) yields the value of the low eight bits of C
   interpreted as a signed quantity.  */
# undef SIGN_EXTEND_CHAR
# if __STDC__
#  define SIGN_EXTEND_CHAR(c) ((signed char) (c))
# else /* not __STDC__ */
/* As in Harbison and Steele.  */
#  define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
# endif
272 # ifndef emacs
274 /* How many characters in the character set. */
275 # define CHAR_SET_SIZE 256
276
277 # ifdef SYNTAX_TABLE
278
279 extern char *re_syntax_table;
280
281 # else /* not SYNTAX_TABLE */
282
283 static char re_syntax_table[CHAR_SET_SIZE];
284
285 static void init_syntax_once PARAMS ((void));
286
287 static void
288 init_syntax_once ()
289 {
290 register int c;
291 static int done = 0;
292
293 if (done)
294 return;
295 bzero (re_syntax_table, sizeof re_syntax_table);
296
297 for (c = 0; c < CHAR_SET_SIZE; ++c)
298 if (ISALNUM (c))
299 re_syntax_table[c] = Sword;
300
301 re_syntax_table['_'] = Sword;
302
303 done = 1;
304 }
305
306 # endif /* not SYNTAX_TABLE */
307
308 # define SYNTAX(c) re_syntax_table[(unsigned char) (c)]
309
310 # endif /* emacs */
311
/* Should we use malloc or alloca?  If REGEX_MALLOC is not defined, we
   use `alloca' instead of `malloc'.  This is because using malloc in
   re_search* or re_match* could cause memory leaks when C-g is used in
   Emacs; also, malloc is slower and causes storage fragmentation.  On
   the other hand, malloc is more portable, and easier to debug.

   Because we sometimes use alloca, some routines have to be macros,
   not functions -- `alloca'-allocated space disappears at the end of the
   function it is called in.

   The three macros defined here are:
     REGEX_ALLOCATE (size)                    obtain SIZE bytes;
     REGEX_REALLOCATE (source, osize, nsize)  obtain NSIZE bytes holding
                                              the contents of SOURCE;
     REGEX_FREE (ptr)                         release PTR.  */

# ifdef REGEX_MALLOC

/* Heap variant: straight mappings onto malloc/realloc/free.  OSIZE is
   ignored because realloc does not need it.  */
#  define REGEX_ALLOCATE malloc
#  define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
#  define REGEX_FREE free

# else /* not REGEX_MALLOC */

/* Emacs already defines alloca, sometimes.  */
#  ifndef alloca

/* Make alloca work the best possible way.  */
#   ifdef __GNUC__
#    define alloca __builtin_alloca
#   else /* not __GNUC__ */
#    if HAVE_ALLOCA_H
#     include <alloca.h>
#    endif /* HAVE_ALLOCA_H */
#   endif /* not __GNUC__ */

#  endif /* not alloca */

#  define REGEX_ALLOCATE alloca

/* Assumes a `char *destination' variable.  A fresh NSIZE-byte area is
   taken with alloca, OSIZE bytes are copied into it from SOURCE, and the
   value of the expression is the result of memcpy (i.e. `destination').
   The old area is not released.  */
#  define REGEX_REALLOCATE(source, osize, nsize) \
  (destination = (char *) alloca (nsize), \
   memcpy (destination, source, osize))

/* No need to do anything to free, after alloca.  */
#  define REGEX_FREE(arg) ((void)0) /* Do nothing!  But inhibit gcc warning.  */

# endif /* not REGEX_MALLOC */
356
/* Define how to allocate the failure stack.

   Three configurations are possible:
     - REL_ALLOC together with REGEX_MALLOC: the relocating allocator
       (r_alloc family), always operating on `failure_stack_ptr';
     - REGEX_MALLOC alone: malloc/realloc/free;
     - neither: alloca, where growing goes through REGEX_REALLOCATE and
       freeing is a no-op.

   In every configuration REGEX_FREE_STACK (p) is usable as an ordinary
   expression statement.  */

# if defined REL_ALLOC && defined REGEX_MALLOC

#  define REGEX_ALLOCATE_STACK(size) \
  r_alloc (&failure_stack_ptr, (size))
#  define REGEX_REALLOCATE_STACK(source, osize, nsize) \
  r_re_alloc (&failure_stack_ptr, (nsize))
#  define REGEX_FREE_STACK(ptr) \
  r_alloc_free (&failure_stack_ptr)

# else /* not using relocating allocator */

#  ifdef REGEX_MALLOC

#   define REGEX_ALLOCATE_STACK malloc
#   define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
#   define REGEX_FREE_STACK free

#  else /* not REGEX_MALLOC */

#   define REGEX_ALLOCATE_STACK alloca

#   define REGEX_REALLOCATE_STACK(source, osize, nsize) \
  REGEX_REALLOCATE (source, osize, nsize)
/* No need to explicitly free anything.  Expand to a void expression, as
   REGEX_FREE does, instead of to nothing: the call then remains a real
   statement and can also appear inside a larger expression.  */
#   define REGEX_FREE_STACK(arg) ((void)0)

#  endif /* not REGEX_MALLOC */
# endif /* not using relocating allocator */
387
388
/* True if `size1' is nonzero and PTR is pointing anywhere inside
   `string1' or just past its end.  This works if PTR is NULL, which is
   a good thing.  Relies on variables named `size1' and `string1' being
   in scope at the point of use.  */
# define FIRST_STRING_P(ptr) \
  (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)

/* (Re)Allocate N items of type T using malloc, or fail.
   TALLOC yields a `T *' from malloc.
   RETALLOC assigns the result of realloc straight back to ADDR, so ADDR
   is overwritten even when realloc returns a null pointer.
   RETALLOC_IF is a statement, not an expression: it uses RETALLOC when
   ADDR is non-null and TALLOC otherwise.
   REGEX_TALLOC is the TALLOC analogue on top of REGEX_ALLOCATE.
   The N * sizeof (T) product is not checked for overflow.  */
# define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
# define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
# define RETALLOC_IF(addr, n, t) \
  if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
# define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))

# define BYTEWIDTH 8 /* In bits.  */

/* Nonzero when the two C strings compare equal with strcmp.  */
# define STREQ(s1, s2) ((strcmp (s1, s2) == 0))

/* Local MAX/MIN; any earlier definitions are discarded.  Each argument
   may be evaluated twice, so avoid arguments with side effects.  */
# undef MAX
# undef MIN
# define MAX(a, b) ((a) > (b) ? (a) : (b))
# define MIN(a, b) ((a) < (b) ? (a) : (b))

/* Home-grown boolean type and constants used throughout this file.  */
typedef char boolean;
# define false 0
# define true 1
414
/* Forward declarations for the two instantiations of the engine.  The
   `byte_' names are the ones PREFIX produces when BYTE is defined, the
   `wcs_' names the ones it produces when WCHAR is defined; the latter
   are declared only under MBS_SUPPORT.  */

/* Byte instantiation: pattern compiler.  */
static reg_errcode_t byte_regex_compile _RE_ARGS ((const char *pattern, size_t size,
                                                   reg_syntax_t syntax,
                                                   struct re_pattern_buffer *bufp));

/* Byte instantiation: matcher over the two-part subject
   (string1/size1, string2/size2).  */
static int byte_re_match_2_internal PARAMS ((struct re_pattern_buffer *bufp,
                                             const char *string1, int size1,
                                             const char *string2, int size2,
                                             int pos,
                                             struct re_registers *regs,
                                             int stop));
/* Byte instantiation: searcher over the two-part subject.  */
static int byte_re_search_2 PARAMS ((struct re_pattern_buffer *bufp,
                                     const char *string1, int size1,
                                     const char *string2, int size2,
                                     int startpos, int range,
                                     struct re_registers *regs, int stop));
/* Byte instantiation: fastmap builder.  */
static int byte_re_compile_fastmap PARAMS ((struct re_pattern_buffer *bufp));

#ifdef MBS_SUPPORT
/* Wide-character instantiation: pattern compiler.  */
static reg_errcode_t wcs_regex_compile _RE_ARGS ((const char *pattern, size_t size,
                                                  reg_syntax_t syntax,
                                                  struct re_pattern_buffer *bufp));


/* Wide-character instantiation: matcher.  Besides the multibyte strings
   (cstring1/cstring2) it takes wchar_t strings and per-string offset
   arrays.
   NOTE(review): presumably string1/string2 are the wide forms of
   cstring1/cstring2 and mbs_offset1/2 map between the two -- confirm
   against the definition.  */
static int wcs_re_match_2_internal PARAMS ((struct re_pattern_buffer *bufp,
                                            const char *cstring1, int csize1,
                                            const char *cstring2, int csize2,
                                            int pos,
                                            struct re_registers *regs,
                                            int stop,
                                            wchar_t *string1, int size1,
                                            wchar_t *string2, int size2,
                                            int *mbs_offset1, int *mbs_offset2));
/* Wide-character instantiation: searcher.  */
static int wcs_re_search_2 PARAMS ((struct re_pattern_buffer *bufp,
                                    const char *string1, int size1,
                                    const char *string2, int size2,
                                    int startpos, int range,
                                    struct re_registers *regs, int stop));
/* Wide-character instantiation: fastmap builder.  */
static int wcs_re_compile_fastmap PARAMS ((struct re_pattern_buffer *bufp));
#endif
454
/* These are the command codes that appear in compiled regular
   expressions.  Some opcodes are followed by argument bytes.  A
   command code can specify any interpretation whatsoever for its
   arguments.  Zero bytes may appear in the compiled regular expression.

   The numeric value of each opcode is its position in this list, so the
   order of the enumerators (and the conditional ones) must not change.

   Wherever a comment below speaks of a "two-byte" address or number,
   that is the byte instantiation; in the wide-character instantiation
   the same quantity occupies a single element (OFFSET_ADDRESS_SIZE
   is 1).  */

typedef enum
{
  no_op = 0,

  /* Succeed right away--no more backtracking.  */
  succeed,

  /* Followed by one byte giving n, then by n literal bytes.  */
  exactn,

# ifdef MBS_SUPPORT
  /* Same as exactn, but contains binary data.  */
  exactn_bin,
# endif

  /* Matches any (more or less) character.  */
  anychar,

  /* Matches any one char belonging to specified set.  First
     following byte is number of bitmap bytes.  Then come bytes
     for a bitmap saying which chars are in.  Bits in each byte
     are ordered low-bit-first.  A character is in the set if its
     bit is 1.  A character too large to have a bit in the map is
     automatically not in the set.  */
  /* ifdef MBS_SUPPORT, the opcode is instead followed by five lengths:
     those of the character classes, collating symbols, equivalence
     classes, character ranges, and characters.  After the lengths come
     the character class elements, collating symbol elements,
     equivalence class elements, range elements, and character
     elements, in that order.
     See regex_compile function.  */
  charset,

  /* Same parameters as charset, but match any character that is
     not one of those specified.  */
  charset_not,

  /* Start remembering the text that is matched, for storing in a
     register.  Followed by one byte with the register number, in
     the range 0 to one less than the pattern buffer's re_nsub
     field.  Then followed by one byte with the number of groups
     inner to this one.  (This last has to be part of the
     start_memory only because we need it in the on_failure_jump
     of re_match_2.)  */
  start_memory,

  /* Stop remembering the text that is matched and store it in a
     memory register.  Followed by one byte with the register
     number, in the range 0 to one less than `re_nsub' in the
     pattern buffer, and one byte with the number of inner groups,
     just like `start_memory'.  (We need the number of inner
     groups here because we don't have any easy way of finding the
     corresponding start_memory when we're at a stop_memory.)  */
  stop_memory,

  /* Match a duplicate of something remembered.  Followed by one
     byte containing the register number.  */
  duplicate,

  /* Fail unless at beginning of line.  */
  begline,

  /* Fail unless at end of line.  */
  endline,

  /* Succeeds if at beginning of buffer (if emacs) or at beginning
     of string to be matched (if not).  */
  begbuf,

  /* Analogously, for end of buffer/string.  */
  endbuf,

  /* Followed by two-byte relative address to which to jump.  */
  jump,

  /* Same as jump, but marks the end of an alternative.  */
  jump_past_alt,

  /* Followed by two-byte relative address of place to resume at
     in case of failure.  */
  /* ifdef MBS_SUPPORT, the size of address is 1.  */
  on_failure_jump,

  /* Like on_failure_jump, but pushes a placeholder instead of the
     current string position when executed.  */
  on_failure_keep_string_jump,

  /* Throw away latest failure point and then jump to following
     two-byte relative address.  */
  /* ifdef MBS_SUPPORT, the size of address is 1.  */
  pop_failure_jump,

  /* Change to pop_failure_jump if we know we won't have to backtrack
     to match; otherwise change to jump.  This is used to jump
     back to the beginning of a repeat.  If what follows this jump
     clearly won't match what the repeat does, such that we can be
     sure that there is no use backtracking out of repetitions
     already matched, then we change it to a pop_failure_jump.
     Followed by two-byte address.  */
  /* ifdef MBS_SUPPORT, the size of address is 1.  */
  maybe_pop_jump,

  /* Jump to following two-byte address, and push a dummy failure
     point.  This failure point will be thrown away if an attempt
     is made to use it for a failure.  A `+' construct makes this
     before the first repeat.  Also used as an intermediary kind
     of jump when compiling an alternative.  */
  /* ifdef MBS_SUPPORT, the size of address is 1.  */
  dummy_failure_jump,

  /* Push a dummy failure point and continue.  Used at the end of
     alternatives.  */
  push_dummy_failure,

  /* Followed by two-byte relative address and two-byte number n.
     After matching N times, jump to the address upon failure.  */
  /* ifdef MBS_SUPPORT, the size of address is 1.  */
  succeed_n,

  /* Followed by two-byte relative address, and two-byte number n.
     Jump to the address N times, then fail.  */
  /* ifdef MBS_SUPPORT, the size of address is 1.  */
  jump_n,

  /* Set the following two-byte relative address to the
     subsequent two-byte number.  The address *includes* the two
     bytes of number.  */
  /* ifdef MBS_SUPPORT, the size of address is 1.  */
  set_number_at,

  wordchar,	/* Matches any word-constituent character.  */
  notwordchar,	/* Matches any char that is not a word-constituent.  */

  wordbeg,	/* Succeeds if at word beginning.  */
  wordend,	/* Succeeds if at word end.  */

  wordbound,	/* Succeeds if at a word boundary.  */
  notwordbound	/* Succeeds if not at a word boundary.  */

  /* The Emacs-only opcodes are appended with a leading comma so that the
     list stays well-formed whether or not `emacs' is defined.  */
# ifdef emacs
  ,before_dot,	/* Succeeds if before point.  */
  at_dot,	/* Succeeds if at point.  */
  after_dot,	/* Succeeds if after point.  */

  /* Matches any character whose syntax is specified.  Followed by
     a byte which contains a syntax code, e.g., Sword.  */
  syntaxspec,

  /* Matches any character whose syntax is not that specified.  */
  notsyntaxspec
# endif /* emacs */
} re_opcode_t;
613 #endif /* not INSIDE_RECURSION */
614
615
617 #ifdef BYTE
618 # define CHAR_T char
619 # define UCHAR_T unsigned char
620 # define COMPILED_BUFFER_VAR bufp->buffer
621 # define OFFSET_ADDRESS_SIZE 2
622 # define PREFIX(name) byte_##name
623 # define ARG_PREFIX(name) name
624 # define PUT_CHAR(c) putchar (c)
625 #else
626 # ifdef WCHAR
627 # define CHAR_T wchar_t
628 # define UCHAR_T wchar_t
629 # define COMPILED_BUFFER_VAR wc_buffer
630 # define OFFSET_ADDRESS_SIZE 1 /* the size which STORE_NUMBER macro use */
631 # define CHAR_CLASS_SIZE ((__alignof__(wctype_t)+sizeof(wctype_t))/sizeof(CHAR_T)+1)
632 # define PREFIX(name) wcs_##name
633 # define ARG_PREFIX(name) c##name
634 /* Should we use wide stream?? */
635 # define PUT_CHAR(c) printf ("%C", c);
636 # define TRUE 1
637 # define FALSE 0
638 # else
639 # ifdef MBS_SUPPORT
640 # define WCHAR
641 # define INSIDE_RECURSION
642 # include "regex.c"
643 # undef INSIDE_RECURSION
644 # endif
645 # define BYTE
646 # define INSIDE_RECURSION
647 # include "regex.c"
648 # undef INSIDE_RECURSION
649 # endif
650 #endif
651 #include "unlocked-io.h"
652
653 #ifdef INSIDE_RECURSION
654 /* Common operations on the compiled pattern. */
655
/* STORE_NUMBER (destination, number) writes NUMBER into the compiled
   pattern at DESTINATION.  The byte instantiation uses two consecutive
   bytes, low-order byte first; the wide-character instantiation (WCHAR)
   uses a single element.  */

# ifdef WCHAR
#  define STORE_NUMBER(destination, number) \
  do { (destination)[0] = (UCHAR_T) (number); } while (0)
# else /* BYTE */
#  define STORE_NUMBER(destination, number) \
  do \
    { \
      (destination)[0] = (number) & 0xff; \
      (destination)[1] = (number) >> 8; \
    } \
  while (0)
# endif /* WCHAR */

/* STORE_NUMBER_AND_INCR does the same and then advances DESTINATION past
   what was written (by OFFSET_ADDRESS_SIZE elements), so DESTINATION has
   to be an lvalue.  */

# define STORE_NUMBER_AND_INCR(destination, number) \
  do \
    { \
      STORE_NUMBER (destination, number); \
      (destination) += OFFSET_ADDRESS_SIZE; \
    } \
  while (0)
682
/* Put into DESTINATION a number stored in two contiguous bytes starting
   at SOURCE.  The first byte is the low-order one; the second byte is
   sign-extended, so the result ranges over negative values too.  */
/* ifdef MBS_SUPPORT, the number is stored in (and read from) 1 element.  */

# ifdef WCHAR
#  define EXTRACT_NUMBER(destination, source) \
  do { \
    (destination) = *(source); \
  } while (0)
# else /* BYTE */
/* The high byte is scaled by multiplication rather than by `<< 8':
   left-shifting a negative value is undefined behavior in C, and the
   sign-extended byte is negative for every negative stored number.  */
#  define EXTRACT_NUMBER(destination, source) \
  do { \
    (destination) = *(source) & 0377; \
    (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) * (1 << 8); \
  } while (0)
# endif
699
# ifdef DEBUG
/* Function form of EXTRACT_NUMBER, available in DEBUG builds so that the
   extraction can be stepped through.  Stores into *DEST the number found
   at SOURCE: one element in the wide-character instantiation, otherwise
   two bytes, low-order first, with the second byte sign-extended.  */
static void PREFIX(extract_number) _RE_ARGS ((int *dest, UCHAR_T *source));
static void
PREFIX(extract_number) (dest, source)
    int *dest;
    UCHAR_T *source;
{
#  ifdef WCHAR
  *dest = *source;
#  else /* BYTE */
  int temp = SIGN_EXTEND_CHAR (*(source + 1));
  *dest = *source & 0377;
  /* Multiply instead of shifting: TEMP may be negative, and left-shifting
     a negative value is undefined behavior.  */
  *dest += temp * (1 << 8);
#  endif
}

/* Unless EXTRACT_MACROS is defined, route the macro through the function
   above.  The arguments are parenthesized so that any lvalue expression
   can be passed as DEST.  */
#  ifndef EXTRACT_MACROS /* To debug the macros.  */
#   undef EXTRACT_NUMBER
#   define EXTRACT_NUMBER(dest, src) PREFIX(extract_number) (&(dest), (src))
#  endif /* not EXTRACT_MACROS */

# endif /* DEBUG */
722
/* Same as EXTRACT_NUMBER, except increment SOURCE to after the number
   (by OFFSET_ADDRESS_SIZE elements).  SOURCE must be an lvalue.  */

# define EXTRACT_NUMBER_AND_INCR(destination, source) \
  do { \
    EXTRACT_NUMBER (destination, source); \
    (source) += OFFSET_ADDRESS_SIZE; \
  } while (0)

# ifdef DEBUG
/* Function form of EXTRACT_NUMBER_AND_INCR for DEBUG builds: stores the
   number found at *SOURCE into *DESTINATION, then advances *SOURCE by
   OFFSET_ADDRESS_SIZE elements.  */
static void PREFIX(extract_number_and_incr) _RE_ARGS ((int *destination,
                                                       UCHAR_T **source));
static void
PREFIX(extract_number_and_incr) (destination, source)
    int *destination;
    UCHAR_T **source;
{
  PREFIX(extract_number) (destination, *source);
  *source += OFFSET_ADDRESS_SIZE;
}

/* Unless EXTRACT_MACROS is defined, route the macro through the function
   above.  Both arguments are parenthesized before their address is taken
   so that any lvalue expression is accepted.  */
#  ifndef EXTRACT_MACROS
#   undef EXTRACT_NUMBER_AND_INCR
#   define EXTRACT_NUMBER_AND_INCR(dest, src) \
  PREFIX(extract_number_and_incr) (&(dest), &(src))
#  endif /* not EXTRACT_MACROS */

# endif /* DEBUG */
751
752
753
755 /* If DEBUG is defined, Regex prints many voluminous messages about what
756 it is doing (if the variable `debug' is nonzero). If linked with the
757 main program in `iregex.c', you can enter patterns and strings
758 interactively. And if linked with the main program in `main.c' and
759 the other test files, you can run the already-written tests. */
760
761 # ifdef DEBUG
762
#  ifndef DEFINED_ONCE

/* We use standard I/O for debugging.  */
#   include <stdio.h>

/* It is useful to test things that ``must'' be true when debugging.  */
#   include <assert.h>

/* Tracing is emitted only while this is nonzero.  */
static int debug;

/* DEBUG_STATEMENT (e) expands to E itself.  The DEBUG_PRINTn macros call
   printf with n arguments (the first being the format) when `debug' is
   nonzero.  Each is wrapped in do { ... } while (0) so that it is exactly
   one statement: a bare `if (debug) ...' expansion would capture an
   `else' that follows the macro call at the point of use.  */
#   define DEBUG_STATEMENT(e) e
#   define DEBUG_PRINT1(x) do { if (debug) printf (x); } while (0)
#   define DEBUG_PRINT2(x1, x2) do { if (debug) printf (x1, x2); } while (0)
#   define DEBUG_PRINT3(x1, x2, x3) \
  do { if (debug) printf (x1, x2, x3); } while (0)
#   define DEBUG_PRINT4(x1, x2, x3, x4) \
  do { if (debug) printf (x1, x2, x3, x4); } while (0)
#  endif /* not DEFINED_ONCE */

/* Per-instantiation tracing helpers (they go through PREFIX, so they are
   defined on every pass).  The first argument of
   DEBUG_PRINT_COMPILED_PATTERN is not used by the expansion.  */
#  define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
  do { if (debug) PREFIX(print_partial_compiled_pattern) (s, e); } while (0)
#  define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
  do { if (debug) PREFIX(print_double_string) (w, s1, sz1, s2, sz2); } while (0)
784
785
786 /* Print the fastmap in human-readable form. */
787
788 # ifndef DEFINED_ONCE
789 void
790 print_fastmap (fastmap)
791 char *fastmap;
792 {
793 unsigned was_a_range = 0;
794 unsigned i = 0;
795
796 while (i < (1 << BYTEWIDTH))
797 {
798 if (fastmap[i++])
799 {
800 was_a_range = 0;
801 putchar (i - 1);
802 while (i < (1 << BYTEWIDTH) && fastmap[i])
803 {
804 was_a_range = 1;
805 i++;
806 }
807 if (was_a_range)
808 {
809 printf ("-");
810 putchar (i - 1);
811 }
812 }
813 }
814 putchar ('\n');
815 }
816 # endif /* not DEFINED_ONCE */
817
818
819 /* Print a compiled pattern string in human-readable form, starting at
820 the START pointer into it and ending just before the pointer END. */
821
822 void
823 PREFIX(print_partial_compiled_pattern) (start, end)
824 UCHAR_T *start;
825 UCHAR_T *end;
826 {
827 int mcnt, mcnt2;
828 UCHAR_T *p1;
829 UCHAR_T *p = start;
830 UCHAR_T *pend = end;
831
832 if (start == NULL)
833 {
834 printf ("(null)\n");
835 return;
836 }
837
838 /* Loop over pattern commands. */
839 while (p < pend)
840 {
841 # ifdef _LIBC
842 printf ("%td:\t", p - start);
843 # else
844 printf ("%ld:\t", (long int) (p - start));
845 # endif
846
847 switch ((re_opcode_t) *p++)
848 {
849 case no_op:
850 printf ("/no_op");
851 break;
852
853 case exactn:
854 mcnt = *p++;
855 printf ("/exactn/%d", mcnt);
856 do
857 {
858 putchar ('/');
859 PUT_CHAR (*p++);
860 }
861 while (--mcnt);
862 break;
863
864 # ifdef MBS_SUPPORT
865 case exactn_bin:
866 mcnt = *p++;
867 printf ("/exactn_bin/%d", mcnt);
868 do
869 {
870 printf("/%lx", (long int) *p++);
871 }
872 while (--mcnt);
873 break;
874 # endif /* MBS_SUPPORT */
875
876 case start_memory:
877 mcnt = *p++;
878 printf ("/start_memory/%d/%ld", mcnt, (long int) *p++);
879 break;
880
881 case stop_memory:
882 mcnt = *p++;
883 printf ("/stop_memory/%d/%ld", mcnt, (long int) *p++);
884 break;
885
886 case duplicate:
887 printf ("/duplicate/%ld", (long int) *p++);
888 break;
889
890 case anychar:
891 printf ("/anychar");
892 break;
893
894 case charset:
895 case charset_not:
896 {
897 # ifdef WCHAR
898 int i, length;
899 wchar_t *workp = p;
900 printf ("/charset [%s",
901 (re_opcode_t) *(workp - 1) == charset_not ? "^" : "");
902 p += 5;
903 length = *workp++; /* the length of char_classes */
904 for (i=0 ; i<length ; i++)
905 printf("[:%lx:]", (long int) *p++);
906 length = *workp++; /* the length of collating_symbol */
907 for (i=0 ; i<length ;)
908 {
909 printf("[.");
910 while(*p != 0)
911 PUT_CHAR((i++,*p++));
912 i++,p++;
913 printf(".]");
914 }
915 length = *workp++; /* the length of equivalence_class */
916 for (i=0 ; i<length ;)
917 {
918 printf("[=");
919 while(*p != 0)
920 PUT_CHAR((i++,*p++));
921 i++,p++;
922 printf("=]");
923 }
924 length = *workp++; /* the length of char_range */
925 for (i=0 ; i<length ; i++)
926 {
927 wchar_t range_start = *p++;
928 wchar_t range_end = *p++;
929 printf("%C-%C", range_start, range_end);
930 }
931 length = *workp++; /* the length of char */
932 for (i=0 ; i<length ; i++)
933 printf("%C", *p++);
934 putchar (']');
935 # else
936 register int c, last = -100;
937 register int in_range = 0;
938
939 printf ("/charset [%s",
940 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
941
942 assert (p + *p < pend);
943
944 for (c = 0; c < 256; c++)
945 if (c / 8 < *p
946 && (p[1 + (c/8)] & (1 << (c % 8))))
947 {
948 /* Are we starting a range? */
949 if (last + 1 == c && ! in_range)
950 {
951 putchar ('-');
952 in_range = 1;
953 }
954 /* Have we broken a range? */
955 else if (last + 1 != c && in_range)
956 {
957 putchar (last);
958 in_range = 0;
959 }
960
961 if (! in_range)
962 putchar (c);
963
964 last = c;
965 }
966
967 if (in_range)
968 putchar (last);
969
970 putchar (']');
971
972 p += 1 + *p;
973 # endif /* WCHAR */
974 }
975 break;
976
977 case begline:
978 printf ("/begline");
979 break;
980
981 case endline:
982 printf ("/endline");
983 break;
984
985 case on_failure_jump:
986 PREFIX(extract_number_and_incr) (&mcnt, &p);
987 # ifdef _LIBC
988 printf ("/on_failure_jump to %td", p + mcnt - start);
989 # else
990 printf ("/on_failure_jump to %ld", (long int) (p + mcnt - start));
991 # endif
992 break;
993
994 case on_failure_keep_string_jump:
995 PREFIX(extract_number_and_incr) (&mcnt, &p);
996 # ifdef _LIBC
997 printf ("/on_failure_keep_string_jump to %td", p + mcnt - start);
998 # else
999 printf ("/on_failure_keep_string_jump to %ld",
1000 (long int) (p + mcnt - start));
1001 # endif
1002 break;
1003
1004 case dummy_failure_jump:
1005 PREFIX(extract_number_and_incr) (&mcnt, &p);
1006 # ifdef _LIBC
1007 printf ("/dummy_failure_jump to %td", p + mcnt - start);
1008 # else
1009 printf ("/dummy_failure_jump to %ld", (long int) (p + mcnt - start));
1010 # endif
1011 break;
1012
1013 case push_dummy_failure:
1014 printf ("/push_dummy_failure");
1015 break;
1016
1017 case maybe_pop_jump:
1018 PREFIX(extract_number_and_incr) (&mcnt, &p);
1019 # ifdef _LIBC
1020 printf ("/maybe_pop_jump to %td", p + mcnt - start);
1021 # else
1022 printf ("/maybe_pop_jump to %ld", (long int) (p + mcnt - start));
1023 # endif
1024 break;
1025
1026 case pop_failure_jump:
1027 PREFIX(extract_number_and_incr) (&mcnt, &p);
1028 # ifdef _LIBC
1029 printf ("/pop_failure_jump to %td", p + mcnt - start);
1030 # else
1031 printf ("/pop_failure_jump to %ld", (long int) (p + mcnt - start));
1032 # endif
1033 break;
1034
1035 case jump_past_alt:
1036 PREFIX(extract_number_and_incr) (&mcnt, &p);
1037 # ifdef _LIBC
1038 printf ("/jump_past_alt to %td", p + mcnt - start);
1039 # else
1040 printf ("/jump_past_alt to %ld", (long int) (p + mcnt - start));
1041 # endif
1042 break;
1043
1044 case jump:
1045 PREFIX(extract_number_and_incr) (&mcnt, &p);
1046 # ifdef _LIBC
1047 printf ("/jump to %td", p + mcnt - start);
1048 # else
1049 printf ("/jump to %ld", (long int) (p + mcnt - start));
1050 # endif
1051 break;
1052
1053 case succeed_n:
1054 PREFIX(extract_number_and_incr) (&mcnt, &p);
1055 p1 = p + mcnt;
1056 PREFIX(extract_number_and_incr) (&mcnt2, &p);
1057 # ifdef _LIBC
1058 printf ("/succeed_n to %td, %d times", p1 - start, mcnt2);
1059 # else
1060 printf ("/succeed_n to %ld, %d times",
1061 (long int) (p1 - start), mcnt2);
1062 # endif
1063 break;
1064
1065 case jump_n:
1066 PREFIX(extract_number_and_incr) (&mcnt, &p);
1067 p1 = p + mcnt;
1068 PREFIX(extract_number_and_incr) (&mcnt2, &p);
1069 printf ("/jump_n to %d, %d times", p1 - start, mcnt2);
1070 break;
1071
1072 case set_number_at:
1073 PREFIX(extract_number_and_incr) (&mcnt, &p);
1074 p1 = p + mcnt;
1075 PREFIX(extract_number_and_incr) (&mcnt2, &p);
1076 # ifdef _LIBC
1077 printf ("/set_number_at location %td to %d", p1 - start, mcnt2);
1078 # else
1079 printf ("/set_number_at location %ld to %d",
1080 (long int) (p1 - start), mcnt2);
1081 # endif
1082 break;
1083
1084 case wordbound:
1085 printf ("/wordbound");
1086 break;
1087
1088 case notwordbound:
1089 printf ("/notwordbound");
1090 break;
1091
1092 case wordbeg:
1093 printf ("/wordbeg");
1094 break;
1095
1096 case wordend:
1097 printf ("/wordend");
1098 break;
1099
1100 # ifdef emacs
1101 case before_dot:
1102 printf ("/before_dot");
1103 break;
1104
1105 case at_dot:
1106 printf ("/at_dot");
1107 break;
1108
1109 case after_dot:
1110 printf ("/after_dot");
1111 break;
1112
1113 case syntaxspec:
1114 printf ("/syntaxspec");
1115 mcnt = *p++;
1116 printf ("/%d", mcnt);
1117 break;
1118
1119 case notsyntaxspec:
1120 printf ("/notsyntaxspec");
1121 mcnt = *p++;
1122 printf ("/%d", mcnt);
1123 break;
1124 # endif /* emacs */
1125
1126 case wordchar:
1127 printf ("/wordchar");
1128 break;
1129
1130 case notwordchar:
1131 printf ("/notwordchar");
1132 break;
1133
1134 case begbuf:
1135 printf ("/begbuf");
1136 break;
1137
1138 case endbuf:
1139 printf ("/endbuf");
1140 break;
1141
1142 default:
1143 printf ("?%ld", (long int) *(p-1));
1144 }
1145
1146 putchar ('\n');
1147 }
1148
1149 # ifdef _LIBC
1150 printf ("%td:\tend of pattern.\n", p - start);
1151 # else
1152 printf ("%ld:\tend of pattern.\n", (long int) (p - start));
1153 # endif
1154 }
1155
1156
1157 void
1158 PREFIX(print_compiled_pattern) (bufp)
1159 struct re_pattern_buffer *bufp;
1160 {
1161 UCHAR_T *buffer = (UCHAR_T*) bufp->buffer;
1162
1163 PREFIX(print_partial_compiled_pattern) (buffer, buffer
1164 + bufp->used / sizeof(UCHAR_T));
1165 printf ("%ld bytes used/%ld bytes allocated.\n",
1166 bufp->used, bufp->allocated);
1167
1168 if (bufp->fastmap_accurate && bufp->fastmap)
1169 {
1170 printf ("fastmap: ");
1171 print_fastmap (bufp->fastmap);
1172 }
1173
1174 # ifdef _LIBC
1175 printf ("re_nsub: %Zd\t", bufp->re_nsub);
1176 # else
1177 printf ("re_nsub: %ld\t", (long int) bufp->re_nsub);
1178 # endif
1179 printf ("regs_alloc: %d\t", bufp->regs_allocated);
1180 printf ("can_be_null: %d\t", bufp->can_be_null);
1181 printf ("newline_anchor: %d\n", bufp->newline_anchor);
1182 printf ("no_sub: %d\t", bufp->no_sub);
1183 printf ("not_bol: %d\t", bufp->not_bol);
1184 printf ("not_eol: %d\t", bufp->not_eol);
1185 printf ("syntax: %lx\n", bufp->syntax);
1186 /* Perhaps we should print the translate table? */
1187 }
1188
1189
1190 void
1191 PREFIX(print_double_string) (where, string1, size1, string2, size2)
1192 const CHAR_T *where;
1193 const CHAR_T *string1;
1194 const CHAR_T *string2;
1195 int size1;
1196 int size2;
1197 {
1198 int this_char;
1199
1200 if (where == NULL)
1201 printf ("(null)");
1202 else
1203 {
1204 int cnt;
1205
1206 if (FIRST_STRING_P (where))
1207 {
1208 for (this_char = where - string1; this_char < size1; this_char++)
1209 PUT_CHAR (string1[this_char]);
1210
1211 where = string2;
1212 }
1213
1214 cnt = 0;
1215 for (this_char = where - string2; this_char < size2; this_char++)
1216 {
1217 PUT_CHAR (string2[this_char]);
1218 if (++cnt > 100)
1219 {
1220 fputs ("...", stdout);
1221 break;
1222 }
1223 }
1224 }
1225 }
1226
1227 # ifndef DEFINED_ONCE
/* Write the single character C to stderr.  */
void
printchar (c)
    int c;
{
  fputc (c, stderr);
}
1234 # endif
1235
1236 # else /* not DEBUG */
1237
/* Non-DEBUG build: `assert' and every DEBUG_* hook below is defined with
   an empty replacement list, so each use expands to nothing.  The first
   group is guarded by DEFINED_ONCE; the last two are defined on every
   pass through this section.  */
1238 # ifndef DEFINED_ONCE
1239 # undef assert
1240 # define assert(e)
1241
1242 # define DEBUG_STATEMENT(e)
1243 # define DEBUG_PRINT1(x)
1244 # define DEBUG_PRINT2(x1, x2)
1245 # define DEBUG_PRINT3(x1, x2, x3)
1246 # define DEBUG_PRINT4(x1, x2, x3, x4)
1247 # endif /* not DEFINED_ONCE */
1248 # define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
1249 # define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
1250
1251 # endif /* not DEBUG */
1252
1253
1254
1256 # ifdef WCHAR
/* Convert the multibyte string SRC, LEN bytes long, into the wide
   character string DEST and return the number of wide characters
   stored.

   Every byte that cannot be converted (invalid or incomplete sequence,
   or a null byte) is treated as binary data: it is copied through as a
   single wide character and flagged in IS_BINARY.

   OFFSET_BUFFER records the correspondence between SRC and DEST:
   OFFSET_BUFFER[i] is the byte offset in SRC at which wide character i
   starts.
     e.g. src  = "xxxyzz"
          dest = {'X', 'Y', 'Z'}
          (each of "xxx", "y" and "zz" is one multibyte character
           corresponding to 'X', 'Y' and 'Z'.)
          offset_buffer = {0, 0+3("xxx"), 0+3+1("y"), 0+3+1+2("zz")}
                        = {0, 3, 4, 6}
   Entries past the last character, up to index LEN, are filled with a
   sentinel one greater than the number of bytes consumed.

   The caller must have allocated DEST and IS_BINARY with room for LEN
   elements and OFFSET_BUFFER with room for LEN + 1 elements.  */

static size_t convert_mbs_to_wcs (CHAR_T *dest, const unsigned char* src,
                                  size_t len, int *offset_buffer,
                                  char *is_binary);
static size_t
convert_mbs_to_wcs (dest, src, len, offset_buffer, is_binary)
     CHAR_T *dest;
     const unsigned char* src;
     size_t len; /* the length of multibyte string.  */
     int *offset_buffer;
     char *is_binary;
{
  wchar_t *pdest = dest;
  const unsigned char *psrc = src;
  size_t wc_count = 0;          /* Wide characters written so far.  */
  size_t mb_remain = len;       /* Bytes of SRC still to convert.  */
  size_t mb_count = 0;          /* Bytes of SRC consumed so far.  */
  size_t i;
  mbstate_t mbs;

  /* Initialize the conversion state.  */
  memset (&mbs, 0, sizeof (mbstate_t));

  offset_buffer[0] = 0;
  while (mb_remain > 0)
    {
      size_t consumed = mbrtowc (pdest, (const char *) psrc, mb_remain, &mbs);

      if (consumed == (size_t) -1 || consumed == (size_t) -2 || consumed == 0)
        {
          /* Failed to convert; maybe SRC contains binary data, so
             consume one byte manually.  */
          *pdest = *psrc;
          consumed = 1;
          is_binary[wc_count] = TRUE;

          /* After an error the conversion state is unspecified, and after
             an incomplete sequence it has absorbed bytes we are about to
             re-read; start the next character from the initial state.  */
          memset (&mbs, 0, sizeof (mbstate_t));
        }
      else
        is_binary[wc_count] = FALSE;

      /* In sjis encoding, we use yen sign as escape character in
         place of reverse solidus.  So we convert 0x5c (yen sign in
         sjis) to not 0xa5 (yen sign in UCS2) but 0x5c (reverse
         solidus in UCS2).  */
      if (consumed == 1 && (int) *psrc == 0x5c && (int) *pdest == 0xa5)
        *pdest = (wchar_t) *psrc;

      mb_count += consumed;
      offset_buffer[wc_count + 1] = (int) mb_count;

      ++wc_count;
      ++pdest;
      mb_remain -= consumed;
      psrc += consumed;
    }

  /* Fill the remainder of the buffer with the sentinel.  */
  for (i = wc_count + 1; i <= len; i++)
    offset_buffer[i] = (int) (mb_count + 1);

  return wc_count;
}
1329
1330 # endif /* WCHAR */
1331
1332 #else /* not INSIDE_RECURSION */
1333
1334 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1335 also be assigned to arbitrarily: each pattern buffer stores its own
1336 syntax, so it can be changed between regex compilations. */
1337 /* This has no initializer because initialized variables in Emacs
1338 become read-only after dumping. */
/* Global syntax bit mask; `re_set_syntax' stores its argument here and
   returns the value that was here before.  */
1339 reg_syntax_t re_syntax_options;
1340
1341
1342 /* Specify the precise syntax of regexps for compilation. This provides
1343 for compatibility for various utilities which historically have
1344 different, incompatible syntaxes.
1345
1346 The argument SYNTAX is a bit mask comprised of the various bits
1347 defined in regex.h. We return the old syntax. */
1348
1349 reg_syntax_t
1350 re_set_syntax (syntax)
1351 reg_syntax_t syntax;
1352 {
1353 reg_syntax_t ret = re_syntax_options;
1354
1355 re_syntax_options = syntax;
1356 # ifdef DEBUG
1357 if (syntax & RE_DEBUG)
1358 debug = 1;
1359 else if (debug) /* was on but now is not */
1360 debug = 0;
1361 # endif /* DEBUG */
1362 return ret;
1363 }
1364 # ifdef _LIBC
1365 weak_alias (__re_set_syntax, re_set_syntax)
1366 # endif
1367
1368 /* This table gives an error message for each of the error codes listed
1370 in regex.h. Obviously the order here has to be same as there.
1371 POSIX doesn't require that we do anything for REG_NOERROR,
1372 but why not be nice? */
/* Layout: all messages live in one char array.  Adjacent string literals
   are concatenated, with an explicit "\0" between consecutive messages.
   Each REG_xxx_IDX macro is the offset of its message within the array,
   computed as the previous offset plus `sizeof' of the previous message
   text (which counts that message's terminating NUL).  The text given to
   `sizeof' must therefore stay identical to the message it follows.  */
1373
1374 static const char re_error_msgid[] =
1375 {
1376 # define REG_NOERROR_IDX 0
1377 gettext_noop ("Success") /* REG_NOERROR */
1378 "\0"
1379 # define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
1380 gettext_noop ("No match") /* REG_NOMATCH */
1381 "\0"
1382 # define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
1383 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
1384 "\0"
1385 # define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
1386 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
1387 "\0"
1388 # define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
1389 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
1390 "\0"
1391 # define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
1392 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
1393 "\0"
1394 # define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
1395 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
1396 "\0"
1397 # define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
1398 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
1399 "\0"
1400 # define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
1401 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
1402 "\0"
1403 # define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
1404 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
1405 "\0"
1406 # define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
1407 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
1408 "\0"
1409 # define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
1410 gettext_noop ("Invalid range end") /* REG_ERANGE */
1411 "\0"
1412 # define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
1413 gettext_noop ("Memory exhausted") /* REG_ESPACE */
1414 "\0"
1415 # define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
1416 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
1417 "\0"
1418 # define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
1419 gettext_noop ("Premature end of regular expression") /* REG_EEND */
1420 "\0"
1421 # define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
1422 gettext_noop ("Regular expression too big") /* REG_ESIZE */
1423 "\0"
1424 # define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
1425 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
1426 };
1427
/* Start offset of each message inside `re_error_msgid', listed in the
   same order as the messages themselves (REG_NOERROR first, REG_ERPAREN
   last).  Presumably indexed by the reg_errcode_t value -- the users of
   this table are not in this part of the file.  */
1428 static const size_t re_error_msgid_idx[] =
1429 {
1430 REG_NOERROR_IDX,
1431 REG_NOMATCH_IDX,
1432 REG_BADPAT_IDX,
1433 REG_ECOLLATE_IDX,
1434 REG_ECTYPE_IDX,
1435 REG_EESCAPE_IDX,
1436 REG_ESUBREG_IDX,
1437 REG_EBRACK_IDX,
1438 REG_EPAREN_IDX,
1439 REG_EBRACE_IDX,
1440 REG_BADBR_IDX,
1441 REG_ERANGE_IDX,
1442 REG_ESPACE_IDX,
1443 REG_BADRPT_IDX,
1444 REG_EEND_IDX,
1445 REG_ESIZE_IDX,
1446 REG_ERPAREN_IDX
1447 };
1448
1449 #endif /* INSIDE_RECURSION */
1451
1452 #ifndef DEFINED_ONCE
1453 /* Avoiding alloca during matching, to placate r_alloc. */
1454
1455 /* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1456 searching and matching functions should not call alloca. On some
1457 systems, alloca is implemented in terms of malloc, and if we're
1458 using the relocating allocator routines, then malloc could cause a
1459 relocation, which might (if the strings being searched are in the
1460 ralloc heap) shift the data out from underneath the regexp
1461 routines.
1462
1463 Here's another reason to avoid allocation: Emacs
1464 processes input from X in a signal handler; processing X input may
1465 call malloc; if input arrives while a matching routine is calling
1466 malloc, then we're scrod. But Emacs can't just block input while
1467 calling matching routines; then we don't notice interrupts when
1468 they come in. So, Emacs blocks input around all regexp calls
1469 except the matching calls, which it leaves unprotected, in the
1470 faith that they will not malloc. */
1471
1472 /* Normally, this is fine: by default the matching routines are allowed to allocate.  The definition is withdrawn again below for Emacs builds that define C_ALLOCA or REGEX_MALLOC. */
1473 # define MATCH_MAY_ALLOCATE
1474
1475 /* When using GNU C, we are not REALLY using the C alloca, no matter
1476 what config.h may say. So don't take precautions for it. */
1477 # ifdef __GNUC__
1478 # undef C_ALLOCA
1479 # endif
1480
1481 /* The match routines may not allocate if (1) they would do it with malloc
1482 and (2) it's not safe for them to use malloc.
1483 Note that if REL_ALLOC is defined, matching would not use malloc for the
1484 failure stack, but we would still use it for the register vectors;
1485 so REL_ALLOC should not affect this. */
1486 # if (defined C_ALLOCA || defined REGEX_MALLOC) && defined emacs
1487 # undef MATCH_MAY_ALLOCATE
1488 # endif
1489 #endif /* not DEFINED_ONCE */
1490
1491 #ifdef INSIDE_RECURSION
1493 /* Failure stack declarations and macros; both re_compile_fastmap and
1494 re_match_2 use a failure stack. These have to be macros because of
1495 REGEX_ALLOCATE_STACK. */
1496
1497
1498 /* Number of failure points for which to initially allocate space
1499 when matching. If this number is exceeded, we allocate more
1500 space, so it is not a hard limit. */
1501 # ifndef INIT_FAILURE_ALLOC
1502 # define INIT_FAILURE_ALLOC 5
1503 # endif
1504
1505 /* Roughly the maximum number of failure points on the stack. Would be
1506 exactly that if always used MAX_FAILURE_ITEMS items each time we failed.
1507 This is a variable only so users of regex can assign to it; we never
1508 change it ourselves. */
/* The two branches below declare the same things.  With INT_IS_16BIT the
   limit, the integer member of a stack slot and the size/avail counters
   use `long'-based types; otherwise they use `int'-based types.  */
1509
1510 # ifdef INT_IS_16BIT
1511
1512 # ifndef DEFINED_ONCE
1513 # if defined MATCH_MAY_ALLOCATE
1514 /* 4400 was enough to cause a crash on Alpha OSF/1,
1515 whose default stack limit is 2mb. */
1516 long int re_max_failures = 4000;
1517 # else
1518 long int re_max_failures = 2000;
1519 # endif
1520 # endif
1521
/* One slot of the failure stack: either a pointer or an integer.  */
1522 union PREFIX(fail_stack_elt)
1523 {
1524 UCHAR_T *pointer;
1525 long int integer;
1526 };
1527
1528 typedef union PREFIX(fail_stack_elt) PREFIX(fail_stack_elt_t);
1529
/* The failure stack itself: `stack' is the slot array, `size' its
   capacity in slots and `avail' the index of the next free slot.  */
1530 typedef struct
1531 {
1532 PREFIX(fail_stack_elt_t) *stack;
1533 unsigned long int size;
1534 unsigned long int avail; /* Offset of next open position. */
1535 } PREFIX(fail_stack_type);
1536
1537 # else /* not INT_IS_16BIT */
1538
1539 # ifndef DEFINED_ONCE
1540 # if defined MATCH_MAY_ALLOCATE
1541 /* 4400 was enough to cause a crash on Alpha OSF/1,
1542 whose default stack limit is 2mb. */
1543 int re_max_failures = 4000;
1544 # else
1545 int re_max_failures = 2000;
1546 # endif
1547 # endif
1548
/* One slot of the failure stack: either a pointer or an integer.  */
1549 union PREFIX(fail_stack_elt)
1550 {
1551 UCHAR_T *pointer;
1552 int integer;
1553 };
1554
1555 typedef union PREFIX(fail_stack_elt) PREFIX(fail_stack_elt_t);
1556
/* The failure stack itself: `stack' is the slot array, `size' its
   capacity in slots and `avail' the index of the next free slot.  */
1557 typedef struct
1558 {
1559 PREFIX(fail_stack_elt_t) *stack;
1560 unsigned size;
1561 unsigned avail; /* Offset of next open position. */
1562 } PREFIX(fail_stack_type);
1563
1564 # endif /* INT_IS_16BIT */
1565
/* State tests.  FAIL_STACK_EMPTY and FAIL_STACK_FULL refer to a variable
   literally named `fail_stack'; FAIL_STACK_PTR_EMPTY refers to a pointer
   named `fail_stack_ptr'.  */
1566 # ifndef DEFINED_ONCE
1567 # define FAIL_STACK_EMPTY() (fail_stack.avail == 0)
1568 # define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0)
1569 # define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
1570 # endif
1571
1572
1573 /* Define macros to initialize and free the failure stack.
1574 Do `return -2' if the alloc fails. */
/* With MATCH_MAY_ALLOCATE, INIT_FAIL_STACK allocates INIT_FAILURE_ALLOC
   slots through REGEX_ALLOCATE_STACK and RESET_FAIL_STACK releases them
   through REGEX_FREE_STACK.  Without it, INIT_FAIL_STACK only resets
   `avail' and RESET_FAIL_STACK expands to nothing; the slot storage must
   then come from elsewhere (not visible in this part of the file).
   Because of the embedded `return -2', INIT_FAIL_STACK can only be used
   inside a function for which that return value makes sense.  */
1575
1576 # ifdef MATCH_MAY_ALLOCATE
1577 # define INIT_FAIL_STACK() \
1578 do { \
1579 fail_stack.stack = (PREFIX(fail_stack_elt_t) *) \
1580 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * sizeof (PREFIX(fail_stack_elt_t))); \
1581 \
1582 if (fail_stack.stack == NULL) \
1583 return -2; \
1584 \
1585 fail_stack.size = INIT_FAILURE_ALLOC; \
1586 fail_stack.avail = 0; \
1587 } while (0)
1588
1589 # define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
1590 # else
1591 # define INIT_FAIL_STACK() \
1592 do { \
1593 fail_stack.avail = 0; \
1594 } while (0)
1595
1596 # define RESET_FAIL_STACK()
1597 # endif
1598
1599
1600 /* Double the size of FAIL_STACK, up to approximately `re_max_failures' items.
1601
1602 Return 1 if succeeds, and 0 if either ran out of memory
1603 allocating space for it or it was already too large.
1604
1605 REGEX_REALLOCATE_STACK requires `destination' be declared. */
/* "Approximately": the current size is compared with
   re_max_failures * MAX_FAILURE_ITEMS before doubling, so the stack can
   end up larger than that limit, but is not doubled again afterwards.

   The result of REGEX_REALLOCATE_STACK is stored straight into
   (fail_stack).stack, so after a failed reallocation that member is
   NULL.  NOTE(review): whether the old storage is still reachable then
   depends on how REGEX_REALLOCATE_STACK is defined, which is not visible
   here.  The argument is evaluated several times.  */
1606
1607 # define DOUBLE_FAIL_STACK(fail_stack) \
1608 ((fail_stack).size > (unsigned) (re_max_failures * MAX_FAILURE_ITEMS) \
1609 ? 0 \
1610 : ((fail_stack).stack = (PREFIX(fail_stack_elt_t) *) \
1611 REGEX_REALLOCATE_STACK ((fail_stack).stack, \
1612 (fail_stack).size * sizeof (PREFIX(fail_stack_elt_t)), \
1613 ((fail_stack).size << 1) * sizeof (PREFIX(fail_stack_elt_t))),\
1614 \
1615 (fail_stack).stack == NULL \
1616 ? 0 \
1617 : ((fail_stack).size <<= 1, \
1618 1)))
1619
1620
/* Push pointer POINTER on FAIL_STACK, doubling the stack first when it
   is full.
   Return 1 if was able to do so and 0 if the stack was full and
   DOUBLE_FAIL_STACK could not enlarge it.

   The fullness test is made on the FAIL_STACK argument itself rather
   than through FAIL_STACK_FULL (), which always inspects the variable
   named `fail_stack'.  FAIL_STACK is evaluated more than once, so it
   must not have side effects.  */
# define PUSH_PATTERN_OP(POINTER, FAIL_STACK)				\
  ((((FAIL_STACK).avail == (FAIL_STACK).size)				\
    && !DOUBLE_FAIL_STACK (FAIL_STACK))					\
   ? 0									\
   : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = (POINTER),	\
      1))
1630
/* Raw slot accessors.  None of the PUSH_... or POP_... macros below
   checks for room or for an empty stack; they only index
   `fail_stack.stack' and adjust `fail_stack.avail'.  */
1631 /* Push a pointer value onto the failure stack.
1632 Assumes the variable `fail_stack'. Probably should only
1633 be called from within `PUSH_FAILURE_POINT'. */
1634 # define PUSH_FAILURE_POINTER(item) \
1635 fail_stack.stack[fail_stack.avail++].pointer = (UCHAR_T *) (item)
1636
1637 /* This pushes an integer-valued item onto the failure stack.
1638 Assumes the variable `fail_stack'. Probably should only
1639 be called from within `PUSH_FAILURE_POINT'. */
1640 # define PUSH_FAILURE_INT(item) \
1641 fail_stack.stack[fail_stack.avail++].integer = (item)
1642
1643 /* Push a fail_stack_elt_t value onto the failure stack.
1644 Assumes the variable `fail_stack'. Probably should only
1645 be called from within `PUSH_FAILURE_POINT'. */
1646 # define PUSH_FAILURE_ELT(item) \
1647 fail_stack.stack[fail_stack.avail++] = (item)
1648
1649 /* These three POP... operations complement the three PUSH... operations.
1650 All assume that `fail_stack' is nonempty. */
1651 # define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
1652 # define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
1653 # define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
1654
1655 /* Used to omit pushing failure point id's when we're not debugging. */
/* With DEBUG, DEBUG_PUSH is PUSH_FAILURE_INT and DEBUG_POP stores the
   popped integer through ITEM_ADDR; otherwise both expand to nothing.  */
1656 # ifdef DEBUG
1657 # define DEBUG_PUSH PUSH_FAILURE_INT
1658 # define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_INT ()
1659 # else
1660 # define DEBUG_PUSH(item)
1661 # define DEBUG_POP(item_addr)
1662 # endif
1663
1664
1665 /* Push the information about the state we will need
1666 if we ever fail back to it.
1667
1668 Requires variables fail_stack, regstart, regend, reg_info, and
1669 num_regs_pushed be declared. DOUBLE_FAIL_STACK requires `destination'
1670 be declared.
1671
1672 Does `return FAILURE_CODE' if runs out of memory. */
/* Push order (POP_FAILURE_POINT reads it back in reverse):
     for each register from lowest_active_reg to highest_active_reg:
       regstart[], regend[], reg_info[].word
     lowest_active_reg, highest_active_reg
     PATTERN_PLACE
     STRING_PLACE
     failure_id (only when DEBUG_PUSH is not empty)
   The stack is doubled until NUM_FAILURE_ITEMS slots are free before
   anything is pushed.  The macro also reads lowest_active_reg and
   highest_active_reg from the enclosing scope, and declares its own
   `destination' and `this_reg' inside the do-block.  */
1673
1674 # define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \
1675 do { \
1676 char *destination; \
1677 /* Must be int, so when we don't save any registers, the arithmetic \
1678 of 0 + -1 isn't done as unsigned. */ \
1679 /* Can't be int, since there is not a shred of a guarantee that int \
1680 is wide enough to hold a value of something to which pointer can \
1681 be assigned */ \
1682 active_reg_t this_reg; \
1683 \
1684 DEBUG_STATEMENT (failure_id++); \
1685 DEBUG_STATEMENT (nfailure_points_pushed++); \
1686 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \
1687 DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail);\
1688 DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\
1689 \
1690 DEBUG_PRINT2 (" slots needed: %ld\n", NUM_FAILURE_ITEMS); \
1691 DEBUG_PRINT2 (" available: %d\n", REMAINING_AVAIL_SLOTS); \
1692 \
1693 /* Ensure we have enough space allocated for what we will push. */ \
1694 while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \
1695 { \
1696 if (!DOUBLE_FAIL_STACK (fail_stack)) \
1697 return failure_code; \
1698 \
1699 DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", \
1700 (fail_stack).size); \
1701 DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\
1702 } \
1703 \
1704 /* Push the info, starting with the registers. */ \
1705 DEBUG_PRINT1 ("\n"); \
1706 \
1707 if (1) \
1708 for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \
1709 this_reg++) \
1710 { \
1711 DEBUG_PRINT2 (" Pushing reg: %lu\n", this_reg); \
1712 DEBUG_STATEMENT (num_regs_pushed++); \
1713 \
1714 DEBUG_PRINT2 (" start: %p\n", regstart[this_reg]); \
1715 PUSH_FAILURE_POINTER (regstart[this_reg]); \
1716 \
1717 DEBUG_PRINT2 (" end: %p\n", regend[this_reg]); \
1718 PUSH_FAILURE_POINTER (regend[this_reg]); \
1719 \
1720 DEBUG_PRINT2 (" info: %p\n ", \
1721 reg_info[this_reg].word.pointer); \
1722 DEBUG_PRINT2 (" match_null=%d", \
1723 REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \
1724 DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \
1725 DEBUG_PRINT2 (" matched_something=%d", \
1726 MATCHED_SOMETHING (reg_info[this_reg])); \
1727 DEBUG_PRINT2 (" ever_matched=%d", \
1728 EVER_MATCHED_SOMETHING (reg_info[this_reg])); \
1729 DEBUG_PRINT1 ("\n"); \
1730 PUSH_FAILURE_ELT (reg_info[this_reg].word); \
1731 } \
1732 \
1733 DEBUG_PRINT2 (" Pushing low active reg: %ld\n", lowest_active_reg);\
1734 PUSH_FAILURE_INT (lowest_active_reg); \
1735 \
1736 DEBUG_PRINT2 (" Pushing high active reg: %ld\n", highest_active_reg);\
1737 PUSH_FAILURE_INT (highest_active_reg); \
1738 \
1739 DEBUG_PRINT2 (" Pushing pattern %p:\n", pattern_place); \
1740 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \
1741 PUSH_FAILURE_POINTER (pattern_place); \
1742 \
1743 DEBUG_PRINT2 (" Pushing string %p: `", string_place); \
1744 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \
1745 size2); \
1746 DEBUG_PRINT1 ("'\n"); \
1747 PUSH_FAILURE_POINTER (string_place); \
1748 \
1749 DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id); \
1750 DEBUG_PUSH (failure_id); \
1751 } while (0)
1752
1753 # ifndef DEFINED_ONCE
1754 /* This is the number of items that are pushed and popped on the stack
1755 for each register. */
/* PUSH_FAILURE_POINT pushes three slots per register: start, end, info.  */
1756 # define NUM_REG_ITEMS 3
1757
1758 /* Individual items aside from the registers. */
/* PUSH_FAILURE_POINT pushes the low and high active register numbers, the
   pattern place and the string place; DEBUG adds the failure id.  */
1759 # ifdef DEBUG
1760 # define NUM_NONREG_ITEMS 5 /* Includes failure point id. */
1761 # else
1762 # define NUM_NONREG_ITEMS 4
1763 # endif
1764
1765 /* We push at most this many items on the stack. */
1766 /* We used to use (num_regs - 1), which is the number of registers
1767 this regexp will save; but that was changed to 5
1768 to avoid stack overflow for a regexp with lots of parens. */
1769 # define MAX_FAILURE_ITEMS (5 * NUM_REG_ITEMS + NUM_NONREG_ITEMS)
1770
1771 /* We actually push this many items. */
/* The `0 ? 0 :' condition is constant, so the register count is always
   highest_active_reg - lowest_active_reg + 1; both variables are taken
   from the scope where the macro is used.  */
1772 # define NUM_FAILURE_ITEMS \
1773 (((0 \
1774 ? 0 : highest_active_reg - lowest_active_reg + 1) \
1775 * NUM_REG_ITEMS) \
1776 + NUM_NONREG_ITEMS)
1777
1778 /* How many items can still be added to the stack without overflowing it. */
1779 # define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
1780 # endif /* not DEFINED_ONCE */
1781
1782
1783 /* Pops what PUSH_FAILURE_POINT pushes.
1784
1785 We restore into the parameters, all of which should be lvalues:
1786 STR -- the saved data position.
1787 PAT -- the saved pattern position.
1788 LOW_REG, HIGH_REG -- the lowest and highest active registers.
1789 REGSTART, REGEND -- arrays of string positions.
1790 REG_INFO -- array of information about each subexpression.
1791
1792 Also assumes the variables `fail_stack' and (if debugging), `bufp',
1793 `pend', `string1', `size1', `string2', and `size2'. */
/* Items come off in the reverse of the push order: failure id (DEBUG
   only), string place, pattern place, high register, low register, then
   for each register from HIGH_REG down to LOW_REG its info word, end and
   start.  A NULL saved string place leaves STR untouched.

   The macro expands to a bare brace block, not a do-while statement.
   Because of the constant `if (1)', the `else' branch that clears
   registers above HIGH_REG is never executed.  The macro also clears
   `set_regs_matched_done', which must be in scope.  */
1794 # define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info)\
1795 { \
1796 DEBUG_STATEMENT (unsigned failure_id;) \
1797 active_reg_t this_reg; \
1798 const UCHAR_T *string_temp; \
1799 \
1800 assert (!FAIL_STACK_EMPTY ()); \
1801 \
1802 /* Remove failure points and point to how many regs pushed. */ \
1803 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
1804 DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \
1805 DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \
1806 \
1807 assert (fail_stack.avail >= NUM_NONREG_ITEMS); \
1808 \
1809 DEBUG_POP (&failure_id); \
1810 DEBUG_PRINT2 (" Popping failure id: %u\n", failure_id); \
1811 \
1812 /* If the saved string location is NULL, it came from an \
1813 on_failure_keep_string_jump opcode, and we want to throw away the \
1814 saved NULL, thus retaining our current position in the string. */ \
1815 string_temp = POP_FAILURE_POINTER (); \
1816 if (string_temp != NULL) \
1817 str = (const CHAR_T *) string_temp; \
1818 \
1819 DEBUG_PRINT2 (" Popping string %p: `", str); \
1820 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
1821 DEBUG_PRINT1 ("'\n"); \
1822 \
1823 pat = (UCHAR_T *) POP_FAILURE_POINTER (); \
1824 DEBUG_PRINT2 (" Popping pattern %p:\n", pat); \
1825 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
1826 \
1827 /* Restore register info. */ \
1828 high_reg = (active_reg_t) POP_FAILURE_INT (); \
1829 DEBUG_PRINT2 (" Popping high active reg: %ld\n", high_reg); \
1830 \
1831 low_reg = (active_reg_t) POP_FAILURE_INT (); \
1832 DEBUG_PRINT2 (" Popping low active reg: %ld\n", low_reg); \
1833 \
1834 if (1) \
1835 for (this_reg = high_reg; this_reg >= low_reg; this_reg--) \
1836 { \
1837 DEBUG_PRINT2 (" Popping reg: %ld\n", this_reg); \
1838 \
1839 reg_info[this_reg].word = POP_FAILURE_ELT (); \
1840 DEBUG_PRINT2 (" info: %p\n", \
1841 reg_info[this_reg].word.pointer); \
1842 \
1843 regend[this_reg] = (const CHAR_T *) POP_FAILURE_POINTER (); \
1844 DEBUG_PRINT2 (" end: %p\n", regend[this_reg]); \
1845 \
1846 regstart[this_reg] = (const CHAR_T *) POP_FAILURE_POINTER (); \
1847 DEBUG_PRINT2 (" start: %p\n", regstart[this_reg]); \
1848 } \
1849 else \
1850 { \
1851 for (this_reg = highest_active_reg; this_reg > high_reg; this_reg--) \
1852 { \
1853 reg_info[this_reg].word.integer = 0; \
1854 regend[this_reg] = 0; \
1855 regstart[this_reg] = 0; \
1856 } \
1857 highest_active_reg = high_reg; \
1858 } \
1859 \
1860 set_regs_matched_done = 0; \
1861 DEBUG_STATEMENT (nfailure_points_popped++); \
1862 } /* POP_FAILURE_POINT */
1863
1864 /* Structure for per-register (a.k.a. per-group) information.
1866 Other register information, such as the
1867 starting and ending positions (which are addresses), and the list of
1868 inner groups (which is a bits list) are maintained in separate
1869 variables.
1870
1871 We are making a (strictly speaking) nonportable assumption here: that
1872 the compiler will pack our bit fields into something that fits into
1873 the type of `word', i.e., is something that fits into one item on the
1874 failure stack. */
1875
1876
1877 /* Declarations and macros for re_match_2. */
1878
/* `word' and `bits' overlay the same storage: `bits' gives access to the
   individual flags, `word' is the form in which the whole set of flags is
   pushed on and popped from the failure stack.  */
1879 typedef union
1880 {
1881 PREFIX(fail_stack_elt_t) word;
1882 struct
1883 {
1884 /* This field is one if this group can match the empty string,
1885 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */
1886 # define MATCH_NULL_UNSET_VALUE 3
1887 unsigned match_null_string_p : 2;
1888 unsigned is_active : 1;
1889 unsigned matched_something : 1;
1890 unsigned ever_matched_something : 1;
1891 } bits;
1892 } PREFIX(register_info_type);
1893
/* Accessors for the flag bits of a register_info_type R; each expands to
   the bit-field itself, so it can be read or assigned.  */
1894 # ifndef DEFINED_ONCE
1895 # define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p)
1896 # define IS_ACTIVE(R) ((R).bits.is_active)
1897 # define MATCHED_SOMETHING(R) ((R).bits.matched_something)
1898 # define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something)
1899
1900
1901 /* Call this when have matched a real character; it sets `matched' flags
1902 for the subexpressions which we are currently inside. Also records
1903 that those subexprs have matched. */
/* The work is skipped while `set_regs_matched_done' is nonzero.  Uses the
   variables `set_regs_matched_done', `lowest_active_reg',
   `highest_active_reg' and `reg_info' from the enclosing scope.  */
1904 # define SET_REGS_MATCHED() \
1905 do \
1906 { \
1907 if (!set_regs_matched_done) \
1908 { \
1909 active_reg_t r; \
1910 set_regs_matched_done = 1; \
1911 for (r = lowest_active_reg; r <= highest_active_reg; r++) \
1912 { \
1913 MATCHED_SOMETHING (reg_info[r]) \
1914 = EVER_MATCHED_SOMETHING (reg_info[r]) \
1915 = 1; \
1916 } \
1917 } \
1918 } \
1919 while (0)
1920 # endif /* not DEFINED_ONCE */
1921
1922 /* Registers are set to a sentinel when they haven't yet matched. */
/* The sentinel is the address of a private dummy object; REG_UNSET (e)
   tests whether E is that address.  */
1923 static CHAR_T PREFIX(reg_unset_dummy);
1924 # define REG_UNSET_VALUE (&PREFIX(reg_unset_dummy))
1925 # define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
1926
1927 /* Subroutine declarations and macros for regex_compile. */
/* Forward declarations only; the definitions are outside this part of
   the file.  Each parameter list is wrapped in _RE_ARGS (presumably so
   that pre-ANSI compilers can drop it -- confirm against its
   definition).  The range compiler is declared as wcs_compile_range when
   WCHAR is defined and as byte_compile_range otherwise.  */
1928 static void PREFIX(store_op1) _RE_ARGS ((re_opcode_t op, UCHAR_T *loc, int arg));
1929 static void PREFIX(store_op2) _RE_ARGS ((re_opcode_t op, UCHAR_T *loc,
1930 int arg1, int arg2));
1931 static void PREFIX(insert_op1) _RE_ARGS ((re_opcode_t op, UCHAR_T *loc,
1932 int arg, UCHAR_T *end));
1933 static void PREFIX(insert_op2) _RE_ARGS ((re_opcode_t op, UCHAR_T *loc,
1934 int arg1, int arg2, UCHAR_T *end));
1935 static boolean PREFIX(at_begline_loc_p) _RE_ARGS ((const CHAR_T *pattern,
1936 const CHAR_T *p,
1937 reg_syntax_t syntax));
1938 static boolean PREFIX(at_endline_loc_p) _RE_ARGS ((const CHAR_T *p,
1939 const CHAR_T *pend,
1940 reg_syntax_t syntax));
1941 # ifdef WCHAR
1942 static reg_errcode_t wcs_compile_range _RE_ARGS ((CHAR_T range_start,
1943 const CHAR_T **p_ptr,
1944 const CHAR_T *pend,
1945 char *translate,
1946 reg_syntax_t syntax,
1947 UCHAR_T *b,
1948 CHAR_T *char_set));
1949 static void insert_space _RE_ARGS ((int num, CHAR_T *loc, CHAR_T *end));
1950 # else /* BYTE */
1951 static reg_errcode_t byte_compile_range _RE_ARGS ((unsigned int range_start,
1952 const char **p_ptr,
1953 const char *pend,
1954 char *translate,
1955 reg_syntax_t syntax,
1956 unsigned char *b));
1957 # endif /* WCHAR */
1958
1959 /* Fetch the next character in the uncompiled pattern---translating it
1960 if necessary. Also cast from a signed character in the constant
1961 string passed to us by the user to an unsigned char that we can use
1962 as an array index (in, e.g., `translate'). */
1963 /* ifdef MBS_SUPPORT, we translate only if character <= 0xff,
1964 because it is impossible to allocate 4GB array for some encodings
1965 which have 4 byte character_set like UCS4. */
/* PATFETCH and PATFETCH_RAW use the variables `p' and `pend' (and, for
   PATFETCH, `translate') from the enclosing function, and execute
   `return REG_EEND' from that function when the pattern is exhausted.
   PATFETCH is only defined here if it is not already defined.  */
1966 # ifndef PATFETCH
1967 # ifdef WCHAR
1968 # define PATFETCH(c) \
1969 do {if (p == pend) return REG_EEND; \
1970 c = (UCHAR_T) *p++; \
1971 if (translate && (c <= 0xff)) c = (UCHAR_T) translate[c]; \
1972 } while (0)
1973 # else /* BYTE */
1974 # define PATFETCH(c) \
1975 do {if (p == pend) return REG_EEND; \
1976 c = (unsigned char) *p++; \
1977 if (translate) c = (unsigned char) translate[c]; \
1978 } while (0)
1979 # endif /* WCHAR */
1980 # endif
1981
1982 /* Fetch the next character in the uncompiled pattern, with no
1983 translation. */
1984 # define PATFETCH_RAW(c) \
1985 do {if (p == pend) return REG_EEND; \
1986 c = (UCHAR_T) *p++; \
1987 } while (0)
1988
1989 /* Go backwards one character in the pattern. */
/* No check is made that `p' is past the start of the pattern.  */
1990 # define PATUNFETCH p--
1991
1992
1993 /* If `translate' is non-null, return translate[D], else just D. We
1994 cast the subscript to translate because some data is declared as
1995 `char *', to avoid warnings when a string constant is passed. But
1996 when we use a character as a subscript we must make it unsigned. */
1997 /* ifdef MBS_SUPPORT, we translate only if character <= 0xff,
1998 because it is impossible to allocate 4GB array for some encodings
1999 which have 4 byte character_set like UCS4. */
/* The translated value is cast to `char'; an untranslated D is yielded
   unchanged.  D may be evaluated more than once.  TRANSLATE is only
   defined here if it is not already defined.  */
2000
2001 # ifndef TRANSLATE
2002 # ifdef WCHAR
2003 # define TRANSLATE(d) \
2004 ((translate && ((UCHAR_T) (d)) <= 0xff) \
2005 ? (char) translate[(unsigned char) (d)] : (d))
2006 # else /* BYTE */
2007 # define TRANSLATE(d) \
2008 (translate ? (char) translate[(unsigned char) (d)] : (d))
2009 # endif /* WCHAR */
2010 # endif
2011
2012
/* Macros that emit the compiled pattern into `buffer'.  */

/* Size, in bytes, given to the pattern buffer when it arrives
   unallocated.  */
# define INIT_BUF_SIZE (32 * sizeof(UCHAR_T))

/* GET_BUFFER_SPACE (N): keep extending the buffer (via EXTEND_BUFFER)
   until there is room for N more elements after `b'.  An element is a
   byte in the BYTE build and a CHAR_T in the WCHAR build, where the
   comparison against bufp->allocated is done in bytes.  */
# ifdef WCHAR
#  define GET_BUFFER_SPACE(n) \
  while (((unsigned long)b - (unsigned long)COMPILED_BUFFER_VAR \
          + (n)*sizeof(CHAR_T)) > bufp->allocated) \
    EXTEND_BUFFER ()
# else /* BYTE */
#  define GET_BUFFER_SPACE(n) \
  while ((unsigned long) (b - bufp->buffer + (n)) > bufp->allocated) \
    EXTEND_BUFFER ()
# endif /* WCHAR */
2029
/* BUF_PUSH (C): reserve room for one more element in the buffer and
   append C (converted to UCHAR_T) at `b'.  */
# define BUF_PUSH(c) \
  do \
    { \
      GET_BUFFER_SPACE (1); \
      *b++ = (UCHAR_T) (c); \
    } \
  while (0)


/* BUF_PUSH_2 (C1, C2): reserve room for two more elements, then append
   C1 followed by C2.  */
# define BUF_PUSH_2(c1, c2) \
  do \
    { \
      GET_BUFFER_SPACE (2); \
      *b++ = (UCHAR_T) (c1); \
      *b++ = (UCHAR_T) (c2); \
    } \
  while (0)


/* BUF_PUSH_3 (C1, C2, C3): the three-element counterpart of
   BUF_PUSH_2.  */
# define BUF_PUSH_3(c1, c2, c3) \
  do \
    { \
      GET_BUFFER_SPACE (3); \
      *b++ = (UCHAR_T) (c1); \
      *b++ = (UCHAR_T) (c2); \
      *b++ = (UCHAR_T) (c3); \
    } \
  while (0)
2055
/* STORE_JUMP (OP, LOC, TO): store at LOC a jump with opcode OP that
   targets TO.  The operand handed to store_op1 is relative: the
   distance from LOC to TO, less the 1 + OFFSET_ADDRESS_SIZE elements
   that the jump instruction itself occupies.  */
# define STORE_JUMP(op, loc, to) \
  PREFIX(store_op1) (op, loc, \
                     (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)))

/* STORE_JUMP2 (OP, LOC, TO, ARG): same, for a jump that carries the
   extra argument ARG.  */
# define STORE_JUMP2(op, loc, to, arg) \
  PREFIX(store_op2) (op, loc, \
                     (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)), \
                     arg)

/* INSERT_JUMP (OP, LOC, TO): like STORE_JUMP, but the jump is inserted
   at LOC rather than stored over it.  `b' is passed along as the
   current end of the buffer.  */
# define INSERT_JUMP(op, loc, to) \
  PREFIX(insert_op1) (op, loc, \
                      (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)), \
                      b)

/* INSERT_JUMP2 (OP, LOC, TO, ARG): like STORE_JUMP2, but inserting.
   `b' is passed along as the current end of the buffer.  */
# define INSERT_JUMP2(op, loc, to, arg) \
  PREFIX(insert_op2) (op, loc, \
                      (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)), \
                      arg, b)
2073
/* MAX_BUF_SIZE is not an arbitrary limit: the arguments that represent
   offsets into the pattern are two bytes long, so if 2^16 bytes turns
   out to be too small, many things would have to change.  */
/* Any compiler which, like MSC, has an allocation limit below 2^16
   bytes has to take the same approach as the MSC case below and lower
   MAX_BUF_SIZE a little.  Otherwise the buffer may end up being
   reallocated to 0 bytes, which is not going to work.  */
# ifndef DEFINED_ONCE
#  if defined _MSC_VER && !defined WIN32
/* 16-bit Microsoft C limits malloc to roughly 65512 bytes.  The cast
   in REALLOC silences a flurry of conversion warnings but is not
   otherwise required.  */
#   define MAX_BUF_SIZE 65500L
#   define REALLOC(p,s) realloc ((p), (size_t) (s))
#  else
#   define MAX_BUF_SIZE (1L << 16)
#   define REALLOC(p,s) realloc ((p), (s))
#  endif

/* Helpers for EXTEND_BUFFER, which reallocates the pattern buffer and
   must then re-aim every pointer that pointed into the old block.

   MOVE_BUFFER_POINTER (P) shifts P by `incr', a variable the expansion
   site must provide.  With bounded pointers the low bound and the
   value are both shifted and the high bound is recomputed from
   bufp->allocated; ELSE_EXTEND_BUFFER_HIGH_BOUND then supplies an
   `else' branch that refreshes only the high bounds (for the case
   where the block did not move).  Without bounded pointers that branch
   is empty.  */
#  if __BOUNDED_POINTERS__
#   define SET_HIGH_BOUND(P) (__ptrhigh (P) = __ptrlow (P) + bufp->allocated)
#   define MOVE_BUFFER_POINTER(P) \
  (__ptrlow (P) += incr, SET_HIGH_BOUND (P), __ptrvalue (P) += incr)
#   define ELSE_EXTEND_BUFFER_HIGH_BOUND \
  else \
    { \
      SET_HIGH_BOUND (b); \
      SET_HIGH_BOUND (begalt); \
      if (fixup_alt_jump) \
        SET_HIGH_BOUND (fixup_alt_jump); \
      if (laststart) \
        SET_HIGH_BOUND (laststart); \
      if (pending_exact) \
        SET_HIGH_BOUND (pending_exact); \
    }
#  else
#   define MOVE_BUFFER_POINTER(P) (P) += incr
#   define ELSE_EXTEND_BUFFER_HIGH_BOUND
#  endif
# endif /* not DEFINED_ONCE */
2119
/* EXTEND_BUFFER (): double the size of the compiled-pattern buffer
   (clamped to MAX_BUF_SIZE) with realloc, then re-aim every pointer
   that pointed into the old block: `b', `begalt' and, when non-null,
   `fixup_alt_jump', `laststart' and `pending_exact'.

   The enclosing function returns REG_ESIZE when the buffer cannot grow
   any further and REG_ESPACE when the reallocation fails.

   The relocation delta `incr' is a `long', not an `int': the distance
   between the old and the new block is a pointer difference, which
   does not fit in an int on LP64 systems when realloc moves the block
   far away, and a truncated delta would leave all the relocated
   pointers aimed at the wrong addresses.  (ptrdiff_t would be the
   exact type, but <stddef.h> is only conditionally included by this
   file.)

   NOTE(review): the plain `return's here do not go through
   FREE_STACK_RETURN, so when this expands inside regex_compile the
   compile stack allocated there is not released on these error paths
   -- confirm whether that is intended.  */
# ifdef WCHAR
#  define EXTEND_BUFFER() \
  do { \
    UCHAR_T *old_buffer = COMPILED_BUFFER_VAR; \
    int wchar_count; \
    if (bufp->allocated + sizeof(UCHAR_T) > MAX_BUF_SIZE) \
      return REG_ESIZE; \
    bufp->allocated <<= 1; \
    if (bufp->allocated > MAX_BUF_SIZE) \
      bufp->allocated = MAX_BUF_SIZE; \
    /* How many characters can the new buffer hold?  */ \
    wchar_count = bufp->allocated / sizeof(UCHAR_T); \
    if (wchar_count == 0) wchar_count = 1; \
    /* Round the byte size down to a whole number of characters.  */ \
    bufp->allocated = wchar_count * sizeof(UCHAR_T); \
    RETALLOC (COMPILED_BUFFER_VAR, wchar_count, UCHAR_T); \
    bufp->buffer = (char*)COMPILED_BUFFER_VAR; \
    if (COMPILED_BUFFER_VAR == NULL) \
      return REG_ESPACE; \
    /* If the buffer moved, move all the pointers into it.  */ \
    if (old_buffer != COMPILED_BUFFER_VAR) \
      { \
        long incr = COMPILED_BUFFER_VAR - old_buffer; \
        MOVE_BUFFER_POINTER (b); \
        MOVE_BUFFER_POINTER (begalt); \
        if (fixup_alt_jump) \
          MOVE_BUFFER_POINTER (fixup_alt_jump); \
        if (laststart) \
          MOVE_BUFFER_POINTER (laststart); \
        if (pending_exact) \
          MOVE_BUFFER_POINTER (pending_exact); \
      } \
    ELSE_EXTEND_BUFFER_HIGH_BOUND \
  } while (0)
# else /* BYTE */
#  define EXTEND_BUFFER() \
  do { \
    UCHAR_T *old_buffer = COMPILED_BUFFER_VAR; \
    if (bufp->allocated == MAX_BUF_SIZE) \
      return REG_ESIZE; \
    bufp->allocated <<= 1; \
    if (bufp->allocated > MAX_BUF_SIZE) \
      bufp->allocated = MAX_BUF_SIZE; \
    bufp->buffer = (UCHAR_T *) REALLOC (COMPILED_BUFFER_VAR, \
                                        bufp->allocated); \
    if (COMPILED_BUFFER_VAR == NULL) \
      return REG_ESPACE; \
    /* If the buffer moved, move all the pointers into it.  */ \
    if (old_buffer != COMPILED_BUFFER_VAR) \
      { \
        long incr = COMPILED_BUFFER_VAR - old_buffer; \
        MOVE_BUFFER_POINTER (b); \
        MOVE_BUFFER_POINTER (begalt); \
        if (fixup_alt_jump) \
          MOVE_BUFFER_POINTER (fixup_alt_jump); \
        if (laststart) \
          MOVE_BUFFER_POINTER (laststart); \
        if (pending_exact) \
          MOVE_BUFFER_POINTER (pending_exact); \
      } \
    ELSE_EXTEND_BUFFER_HIGH_BOUND \
  } while (0)
# endif /* WCHAR */
2183
# ifndef DEFINED_ONCE
/* The register-number argument of {start,stop}_memory is a single
   byte, so the largest group number we can report things about is
   whatever fits in that byte.  */
#  define MAX_REGNUM 255

/* Patterns may nevertheless contain more than `MAX_REGNUM' registers;
   the excess ones are simply ignored.  */
typedef unsigned int regnum_t;


/* Types and macros for the compile stack.  */

/* Offsets can point forwards as well as backwards, so this type must
   cover the range -(MAX_BUF_SIZE - 1) .. MAX_BUF_SIZE - 1.  An int
   may be too narrow for that when sizeof (int) == 2.  */
typedef long pattern_offset_t;

/* One entry of the compile stack.  */
typedef struct
{
  pattern_offset_t begalt_offset;
  pattern_offset_t fixup_alt_jump;
  pattern_offset_t inner_group_offset;
  pattern_offset_t laststart_offset;
  regnum_t regnum;
} compile_stack_elt_t;


/* The compile stack itself: an array of entries plus its bookkeeping.  */
typedef struct
{
  compile_stack_elt_t *stack;
  unsigned int size;
  unsigned int avail;           /* Offset of next open position.  */
} compile_stack_type;


#  define INIT_COMPILE_STACK_SIZE 32

/* Both tests look at the variable named `compile_stack' that is in
   scope where the macro is expanded.  */
#  define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
#  define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)

/* The next available element.  */
#  define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])

# endif /* not DEFINED_ONCE */
2229
/* SET_LIST_BIT (C): mark character C as a member of the bitmap that
   starts at `b'.  C is first reduced to an unsigned char; the flag
   lives in byte C / BYTEWIDTH of the map, at bit C % BYTEWIDTH.

   The argument is parenthesized at both of its uses so that an
   expression argument (e.g. `ch - 1') is converted to unsigned char as
   a whole.  Without that, the cast would bind only to the first
   operand and the bit index could even become negative, making the
   shift undefined.  */
# ifndef DEFINED_ONCE
#  define SET_LIST_BIT(c) \
  (b[((unsigned char) (c)) / BYTEWIDTH] \
   |= 1 << (((unsigned char) (c)) % BYTEWIDTH))
# endif /* DEFINED_ONCE */
2236
/* GET_UNSIGNED_NUMBER (NUM): read a run of decimal digits from the
   uncompiled pattern and accumulate its value into NUM.

   Characters are fetched with PATFETCH into `c' until the pattern ends
   or a non-digit is fetched; that non-digit has then been consumed and
   is left in `c'.  A negative NUM is reset to 0 before the first digit
   is added; if no digit is seen, NUM is left untouched.  Once NUM has
   grown beyond RE_DUP_MAX the remaining digits are still consumed but
   no longer added.  */
# define GET_UNSIGNED_NUMBER(num) \
  { \
    while (p != pend) \
      { \
        PATFETCH (c); \
        if (!(c >= '0' && c <= '9')) \
          break; \
        if (num > RE_DUP_MAX) \
          continue; \
        if (num < 0) \
          num = 0; \
        num = num * 10 + c - '0'; \
      } \
  }
2253
# ifndef DEFINED_ONCE
#  if defined _LIBC || WIDE_CHAR_SUPPORT
/* The GNU C library supports user-defined character classes and
   provides the functions from ISO C Amendment 1, so class names are
   looked up with wctype.  */
#   ifdef CHARCLASS_NAME_MAX
#    define CHAR_CLASS_MAX_LENGTH CHARCLASS_NAME_MAX
#   else
/* This should not happen, but some implementations may still lack
   CHARCLASS_NAME_MAX.  Fall back to a reasonable default.  */
#    define CHAR_CLASS_MAX_LENGTH 256
#   endif

#   ifdef _LIBC
#    define IS_CHAR_CLASS(string) __wctype (string)
#   else
#    define IS_CHAR_CLASS(string) wctype (string)
#   endif
#  else
/* No wide-character support: only the fixed class names below are
   recognized.  */
#   define CHAR_CLASS_MAX_LENGTH  6 /* Namely, `xdigit'.  */

#   define IS_CHAR_CLASS(string) \
  (STREQ (string, "alpha") \
   || STREQ (string, "upper") \
   || STREQ (string, "lower") \
   || STREQ (string, "digit") \
   || STREQ (string, "alnum") \
   || STREQ (string, "xdigit") \
   || STREQ (string, "space") \
   || STREQ (string, "print") \
   || STREQ (string, "punct") \
   || STREQ (string, "graph") \
   || STREQ (string, "cntrl") \
   || STREQ (string, "blank"))
#  endif
# endif /* DEFINED_ONCE */
2283
2284 # ifndef MATCH_MAY_ALLOCATE
2286
2287 /* If we cannot allocate large objects within re_match_2_internal,
2288 we make the fail stack and register vectors global.
2289 The fail stack, we grow to the maximum size when a regexp
2290 is compiled.
2291 The register vectors, we adjust in size each time we
2292 compile a regexp, according to the number of registers it needs. */
2293
2294 static PREFIX(fail_stack_type) fail_stack;
2295
2296 /* Size with which the following vectors are currently allocated.
2297 That is so we can make them bigger as needed,
2298 but never make them smaller. */
2299 # ifdef DEFINED_ONCE
2300 static int regs_allocated_size;
2301
2302 static const char ** regstart, ** regend;
2303 static const char ** old_regstart, ** old_regend;
2304 static const char **best_regstart, **best_regend;
2305 static const char **reg_dummy;
2306 # endif /* DEFINED_ONCE */
2307
2308 static PREFIX(register_info_type) *PREFIX(reg_info);
2309 static PREFIX(register_info_type) *PREFIX(reg_info_dummy);
2310
2311 /* Make the register vectors big enough for NUM_REGS registers,
2312 but don't make them smaller. */
2313
2314 static void
2315 PREFIX(regex_grow_registers) (num_regs)
2316 int num_regs;
2317 {
2318 if (num_regs > regs_allocated_size)
2319 {
2320 RETALLOC_IF (regstart, num_regs, const char *);
2321 RETALLOC_IF (regend, num_regs, const char *);
2322 RETALLOC_IF (old_regstart, num_regs, const char *);
2323 RETALLOC_IF (old_regend, num_regs, const char *);
2324 RETALLOC_IF (best_regstart, num_regs, const char *);
2325 RETALLOC_IF (best_regend, num_regs, const char *);
2326 RETALLOC_IF (PREFIX(reg_info), num_regs, PREFIX(register_info_type));
2327 RETALLOC_IF (reg_dummy, num_regs, const char *);
2328 RETALLOC_IF (PREFIX(reg_info_dummy), num_regs, PREFIX(register_info_type));
2329
2330 regs_allocated_size = num_regs;
2331 }
2332 }
2333
2334 # endif /* not MATCH_MAY_ALLOCATE */
2335
2336 # ifndef DEFINED_ONCE
2338 static boolean group_in_compile_stack _RE_ARGS ((compile_stack_type
2339 compile_stack,
2340 regnum_t regnum));
2341 # endif /* not DEFINED_ONCE */
2342
2343 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
2344 Returns one of error codes defined in `regex.h', or zero for success.
2345
2346 Assumes the `allocated' (and perhaps `buffer') and `translate'
2347 fields are set in BUFP on entry.
2348
2349 If it succeeds, results are put in BUFP (if it returns an error, the
2350 contents of BUFP are undefined):
2351 `buffer' is the compiled pattern;
2352 `syntax' is set to SYNTAX;
2353 `used' is set to the length of the compiled pattern;
2354 `fastmap_accurate' is zero;
2355 `re_nsub' is the number of subexpressions in PATTERN;
2356 `not_bol' and `not_eol' are zero;
2357
2358 The `fastmap' and `newline_anchor' fields are neither
2359 examined nor set. */
2360
/* FREE_STACK_RETURN (VALUE): leave the enclosing function with VALUE
   after freeing the storage it allocated: the compile stack and, in
   the WCHAR build, also the temporary `pattern', `mbs_offset' and
   `is_binary' buffers.  */
# ifdef WCHAR
#  define FREE_STACK_RETURN(value) \
  return (free(pattern), \
          free(mbs_offset), \
          free(is_binary), \
          free (compile_stack.stack), \
          value)
# else
#  define FREE_STACK_RETURN(value) \
  return (free (compile_stack.stack), value)
# endif /* WCHAR */
2369
2370 static reg_errcode_t
2371 PREFIX(regex_compile) (ARG_PREFIX(pattern), ARG_PREFIX(size), syntax, bufp)
2372 const char *ARG_PREFIX(pattern);
2373 size_t ARG_PREFIX(size);
2374 reg_syntax_t syntax;
2375 struct re_pattern_buffer *bufp;
2376 {
2377 /* We fetch characters from PATTERN here. Even though PATTERN is
2378 `char *' (i.e., signed), we declare these variables as unsigned, so
2379 they can be reliably used as array indices. */
2380 register UCHAR_T c, c1;
2381
2382 #ifdef WCHAR
2383 /* A temporary space to keep wchar_t pattern and compiled pattern. */
2384 CHAR_T *pattern, *COMPILED_BUFFER_VAR;
2385 size_t size;
2386 /* offset buffer for optimization. See convert_mbs_to_wc. */
2387 int *mbs_offset = NULL;
2388 /* It hold whether each wchar_t is binary data or not. */
2389 char *is_binary = NULL;
2390 /* A flag whether exactn is handling binary data or not. */
2391 char is_exactn_bin = FALSE;
2392 #endif /* WCHAR */
2393
2394 /* A random temporary spot in PATTERN. */
2395 const CHAR_T *p1;
2396
2397 /* Points to the end of the buffer, where we should append. */
2398 register UCHAR_T *b;
2399
2400 /* Keeps track of unclosed groups. */
2401 compile_stack_type compile_stack;
2402
2403 /* Points to the current (ending) position in the pattern. */
2404 #ifdef WCHAR
2405 const CHAR_T *p;
2406 const CHAR_T *pend;
2407 #else /* BYTE */
2408 const CHAR_T *p = pattern;
2409 const CHAR_T *pend = pattern + size;
2410 #endif /* WCHAR */
2411
2412 /* How to translate the characters in the pattern. */
2413 RE_TRANSLATE_TYPE translate = bufp->translate;
2414
2415 /* Address of the count-byte of the most recently inserted `exactn'
2416 command. This makes it possible to tell if a new exact-match
2417 character can be added to that command or if the character requires
2418 a new `exactn' command. */
2419 UCHAR_T *pending_exact = 0;
2420
2421 /* Address of start of the most recently finished expression.
2422 This tells, e.g., postfix * where to find the start of its
2423 operand. Reset at the beginning of groups and alternatives. */
2424 UCHAR_T *laststart = 0;
2425
2426 /* Address of beginning of regexp, or inside of last group. */
2427 UCHAR_T *begalt;
2428
2429 /* Address of the place where a forward jump should go to the end of
2430 the containing expression. Each alternative of an `or' -- except the
2431 last -- ends with a forward jump of this sort. */
2432 UCHAR_T *fixup_alt_jump = 0;
2433
2434 /* Counts open-groups as they are encountered. Remembered for the
2435 matching close-group on the compile stack, so the same register
2436 number is put in the stop_memory as the start_memory. */
2437 regnum_t regnum = 0;
2438
2439 #ifdef WCHAR
2440 /* Initialize the wchar_t PATTERN and offset_buffer. */
2441 p = pend = pattern = TALLOC(csize + 1, CHAR_T);
2442 mbs_offset = TALLOC(csize + 1, int);
2443 is_binary = TALLOC(csize + 1, char);
2444 if (pattern == NULL || mbs_offset == NULL || is_binary == NULL)
2445 {
2446 free(pattern);
2447 free(mbs_offset);
2448 free(is_binary);
2449 return REG_ESPACE;
2450 }
2451 pattern[csize] = L'\0'; /* sentinel */
2452 size = convert_mbs_to_wcs(pattern, cpattern, csize, mbs_offset, is_binary);
2453 pend = p + size;
2454 if (size < 0)
2455 {
2456 free(pattern);
2457 free(mbs_offset);
2458 free(is_binary);
2459 return REG_BADPAT;
2460 }
2461 #endif
2462
2463 #ifdef DEBUG
2464 DEBUG_PRINT1 ("\nCompiling pattern: ");
2465 if (debug)
2466 {
2467 unsigned debug_count;
2468
2469 for (debug_count = 0; debug_count < size; debug_count++)
2470 PUT_CHAR (pattern[debug_count]);
2471 putchar ('\n');
2472 }
2473 #endif /* DEBUG */
2474
2475 /* Initialize the compile stack. */
2476 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
2477 if (compile_stack.stack == NULL)
2478 {
2479 #ifdef WCHAR
2480 free(pattern);
2481 free(mbs_offset);
2482 free(is_binary);
2483 #endif
2484 return REG_ESPACE;
2485 }
2486
2487 compile_stack.size = INIT_COMPILE_STACK_SIZE;
2488 compile_stack.avail = 0;
2489
2490 /* Initialize the pattern buffer. */
2491 bufp->syntax = syntax;
2492 bufp->fastmap_accurate = 0;
2493 bufp->not_bol = bufp->not_eol = 0;
2494
2495 /* Set `used' to zero, so that if we return an error, the pattern
2496 printer (for debugging) will think there's no pattern. We reset it
2497 at the end. */
2498 bufp->used = 0;
2499
2500 /* Always count groups, whether or not bufp->no_sub is set. */
2501 bufp->re_nsub = 0;
2502
2503 #if !defined emacs && !defined SYNTAX_TABLE
2504 /* Initialize the syntax table. */
2505 init_syntax_once ();
2506 #endif
2507
2508 if (bufp->allocated == 0)
2509 {
2510 if (bufp->buffer)
2511 { /* If zero allocated, but buffer is non-null, try to realloc
2512 enough space. This loses if buffer's address is bogus, but
2513 that is the user's responsibility. */
2514 #ifdef WCHAR
2515 /* Free bufp->buffer and allocate an array for wchar_t pattern
2516 buffer. */
2517 free(bufp->buffer);
2518 COMPILED_BUFFER_VAR = TALLOC (INIT_BUF_SIZE/sizeof(UCHAR_T),
2519 UCHAR_T);
2520 #else
2521 RETALLOC (COMPILED_BUFFER_VAR, INIT_BUF_SIZE, UCHAR_T);
2522 #endif /* WCHAR */
2523 }
2524 else
2525 { /* Caller did not allocate a buffer. Do it for them. */
2526 COMPILED_BUFFER_VAR = TALLOC (INIT_BUF_SIZE / sizeof(UCHAR_T),
2527 UCHAR_T);
2528 }
2529
2530 if (!COMPILED_BUFFER_VAR) FREE_STACK_RETURN (REG_ESPACE);
2531 #ifdef WCHAR
2532 bufp->buffer = (char*)COMPILED_BUFFER_VAR;
2533 #endif /* WCHAR */
2534 bufp->allocated = INIT_BUF_SIZE;
2535 }
2536 #ifdef WCHAR
2537 else
2538 COMPILED_BUFFER_VAR = (UCHAR_T*) bufp->buffer;
2539 #endif
2540
2541 begalt = b = COMPILED_BUFFER_VAR;
2542
2543 /* Loop through the uncompiled pattern until we're at the end. */
2544 while (p != pend)
2545 {
2546 PATFETCH (c);
2547
2548 switch (c)
2549 {
2550 case '^':
2551 {
2552 if ( /* If at start of pattern, it's an operator. */
2553 p == pattern + 1
2554 /* If context independent, it's an operator. */
2555 || syntax & RE_CONTEXT_INDEP_ANCHORS
2556 /* Otherwise, depends on what's come before. */
2557 || PREFIX(at_begline_loc_p) (pattern, p, syntax))
2558 BUF_PUSH (begline);
2559 else
2560 goto normal_char;
2561 }
2562 break;
2563
2564
2565 case '$':
2566 {
2567 if ( /* If at end of pattern, it's an operator. */
2568 p == pend
2569 /* If context independent, it's an operator. */
2570 || syntax & RE_CONTEXT_INDEP_ANCHORS
2571 /* Otherwise, depends on what's next. */
2572 || PREFIX(at_endline_loc_p) (p, pend, syntax))
2573 BUF_PUSH (endline);
2574 else
2575 goto normal_char;
2576 }
2577 break;
2578
2579
2580 case '+':
2581 case '?':
2582 if ((syntax & RE_BK_PLUS_QM)
2583 || (syntax & RE_LIMITED_OPS))
2584 goto normal_char;
2585 handle_plus:
2586 case '*':
2587 /* If there is no previous pattern... */
2588 if (!laststart)
2589 {
2590 if (syntax & RE_CONTEXT_INVALID_OPS)
2591 FREE_STACK_RETURN (REG_BADRPT);
2592 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
2593 goto normal_char;
2594 }
2595
2596 {
2597 /* Are we optimizing this jump? */
2598 boolean keep_string_p = false;
2599
2600 /* 1 means zero (many) matches is allowed. */
2601 char zero_times_ok = 0, many_times_ok = 0;
2602
2603 /* If there is a sequence of repetition chars, collapse it
2604 down to just one (the right one). We can't combine
2605 interval operators with these because of, e.g., `a{2}*',
2606 which should only match an even number of `a's. */
2607
2608 for (;;)
2609 {
2610 zero_times_ok |= c != '+';
2611 many_times_ok |= c != '?';
2612
2613 if (p == pend)
2614 break;
2615
2616 PATFETCH (c);
2617
2618 if (c == '*'
2619 || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?')))
2620 ;
2621
2622 else if (syntax & RE_BK_PLUS_QM && c == '\\')
2623 {
2624 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2625
2626 PATFETCH (c1);
2627 if (!(c1 == '+' || c1 == '?'))
2628 {
2629 PATUNFETCH;
2630 PATUNFETCH;
2631 break;
2632 }
2633
2634 c = c1;
2635 }
2636 else
2637 {
2638 PATUNFETCH;
2639 break;
2640 }
2641
2642 /* If we get here, we found another repeat character. */
2643 }
2644
2645 /* Star, etc. applied to an empty pattern is equivalent
2646 to an empty pattern. */
2647 if (!laststart)
2648 break;
2649
2650 /* Now we know whether or not zero matches is allowed
2651 and also whether or not two or more matches is allowed. */
2652 if (many_times_ok)
2653 { /* More than one repetition is allowed, so put in at the
2654 end a backward relative jump from `b' to before the next
2655 jump we're going to put in below (which jumps from
2656 laststart to after this jump).
2657
2658 But if we are at the `*' in the exact sequence `.*\n',
2659 insert an unconditional jump backwards to the .,
2660 instead of the beginning of the loop. This way we only
2661 push a failure point once, instead of every time
2662 through the loop. */
2663 assert (p - 1 > pattern);
2664
2665 /* Allocate the space for the jump. */
2666 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
2667
2668 /* We know we are not at the first character of the pattern,
2669 because laststart was nonzero. And we've already
2670 incremented `p', by the way, to be the character after
2671 the `*'. Do we have to do something analogous here
2672 for null bytes, because of RE_DOT_NOT_NULL? */
2673 if (TRANSLATE (*(p - 2)) == TRANSLATE ('.')
2674 && zero_times_ok
2675 && p < pend && TRANSLATE (*p) == TRANSLATE ('\n')
2676 && !(syntax & RE_DOT_NEWLINE))
2677 { /* We have .*\n. */
2678 STORE_JUMP (jump, b, laststart);
2679 keep_string_p = true;
2680 }
2681 else
2682 /* Anything else. */
2683 STORE_JUMP (maybe_pop_jump, b, laststart -
2684 (1 + OFFSET_ADDRESS_SIZE));
2685
2686 /* We've added more stuff to the buffer. */
2687 b += 1 + OFFSET_ADDRESS_SIZE;
2688 }
2689
2690 /* On failure, jump from laststart to b + 3, which will be the
2691 end of the buffer after this jump is inserted. */
2692 /* ifdef WCHAR, 'b + 1 + OFFSET_ADDRESS_SIZE' instead of
2693 'b + 3'. */
2694 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
2695 INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump
2696 : on_failure_jump,
2697 laststart, b + 1 + OFFSET_ADDRESS_SIZE);
2698 pending_exact = 0;
2699 b += 1 + OFFSET_ADDRESS_SIZE;
2700
2701 if (!zero_times_ok)
2702 {
2703 /* At least one repetition is required, so insert a
2704 `dummy_failure_jump' before the initial
2705 `on_failure_jump' instruction of the loop. This
2706 effects a skip over that instruction the first time
2707 we hit that loop. */
2708 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
2709 INSERT_JUMP (dummy_failure_jump, laststart, laststart +
2710 2 + 2 * OFFSET_ADDRESS_SIZE);
2711 b += 1 + OFFSET_ADDRESS_SIZE;
2712 }
2713 }
2714 break;
2715
2716
2717 case '.':
2718 laststart = b;
2719 BUF_PUSH (anychar);
2720 break;
2721
2722
2723 case '[':
2724 {
2725 boolean had_char_class = false;
2726 #ifdef WCHAR
2727 CHAR_T range_start = 0xffffffff;
2728 #else
2729 unsigned int range_start = 0xffffffff;
2730 #endif
2731 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2732
2733 #ifdef WCHAR
2734 /* We assume a charset(_not) structure as a wchar_t array.
2735 charset[0] = (re_opcode_t) charset(_not)
2736 charset[1] = l (= length of char_classes)
2737 charset[2] = m (= length of collating_symbols)
2738 charset[3] = n (= length of equivalence_classes)
2739 charset[4] = o (= length of char_ranges)
2740 charset[5] = p (= length of chars)
2741
2742 charset[6] = char_class (wctype_t)
2743 charset[6+CHAR_CLASS_SIZE] = char_class (wctype_t)
2744 ...
2745 charset[l+5] = char_class (wctype_t)
2746
2747 charset[l+6] = collating_symbol (wchar_t)
2748 ...
2749 charset[l+m+5] = collating_symbol (wchar_t)
2750 ifdef _LIBC we use the index if
2751 _NL_COLLATE_SYMB_EXTRAMB instead of
2752 wchar_t string.
2753
2754 charset[l+m+6] = equivalence_classes (wchar_t)
2755 ...
2756 charset[l+m+n+5] = equivalence_classes (wchar_t)
2757 ifdef _LIBC we use the index in
2758 _NL_COLLATE_WEIGHT instead of
2759 wchar_t string.
2760
2761 charset[l+m+n+6] = range_start
2762 charset[l+m+n+7] = range_end
2763 ...
2764 charset[l+m+n+2o+4] = range_start
2765 charset[l+m+n+2o+5] = range_end
2766 ifdef _LIBC we use the value looked up
2767 in _NL_COLLATE_COLLSEQ instead of
2768 wchar_t character.
2769
2770 charset[l+m+n+2o+6] = char
2771 ...
2772 charset[l+m+n+2o+p+5] = char
2773
2774 */
2775
2776 /* We need at least 6 spaces: the opcode, the length of
2777 char_classes, the length of collating_symbols, the length of
2778 equivalence_classes, the length of char_ranges, the length of
2779 chars. */
2780 GET_BUFFER_SPACE (6);
2781
2782 /* Save b as laststart. And We use laststart as the pointer
2783 to the first element of the charset here.
2784 In other words, laststart[i] indicates charset[i]. */
2785 laststart = b;
2786
2787 /* We test `*p == '^' twice, instead of using an if
2788 statement, so we only need one BUF_PUSH. */
2789 BUF_PUSH (*p == '^' ? charset_not : charset);
2790 if (*p == '^')
2791 p++;
2792
2793 /* Push the length of char_classes, the length of
2794 collating_symbols, the length of equivalence_classes, the
2795 length of char_ranges and the length of chars. */
2796 BUF_PUSH_3 (0, 0, 0);
2797 BUF_PUSH_2 (0, 0);
2798
2799 /* Remember the first position in the bracket expression. */
2800 p1 = p;
2801
2802 /* charset_not matches newline according to a syntax bit. */
2803 if ((re_opcode_t) b[-6] == charset_not
2804 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2805 {
2806 BUF_PUSH('\n');
2807 laststart[5]++; /* Update the length of characters */
2808 }
2809
2810 /* Read in characters and ranges, setting map bits. */
2811 for (;;)
2812 {
2813 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2814
2815 PATFETCH (c);
2816
2817 /* \ might escape characters inside [...] and [^...]. */
2818 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2819 {
2820 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2821
2822 PATFETCH (c1);
2823 BUF_PUSH(c1);
2824 laststart[5]++; /* Update the length of chars */
2825 range_start = c1;
2826 continue;
2827 }
2828
2829 /* Could be the end of the bracket expression. If it's
2830 not (i.e., when the bracket expression is `[]' so
2831 far), the ']' character bit gets set way below. */
2832 if (c == ']' && p != p1 + 1)
2833 break;
2834
2835 /* Look ahead to see if it's a range when the last thing
2836 was a character class. */
2837 if (had_char_class && c == '-' && *p != ']')
2838 FREE_STACK_RETURN (REG_ERANGE);
2839
2840 /* Look ahead to see if it's a range when the last thing
2841 was a character: if this is a hyphen not at the
2842 beginning or the end of a list, then it's the range
2843 operator. */
2844 if (c == '-'
2845 && !(p - 2 >= pattern && p[-2] == '[')
2846 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
2847 && *p != ']')
2848 {
2849 reg_errcode_t ret;
2850 /* Allocate the space for range_start and range_end. */
2851 GET_BUFFER_SPACE (2);
2852 /* Update the pointer to indicate end of buffer. */
2853 b += 2;
2854 ret = wcs_compile_range (range_start, &p, pend, translate,
2855 syntax, b, laststart);
2856 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
2857 range_start = 0xffffffff;
2858 }
2859 else if (p[0] == '-' && p[1] != ']')
2860 { /* This handles ranges made up of characters only. */
2861 reg_errcode_t ret;
2862
2863 /* Move past the `-'. */
2864 PATFETCH (c1);
2865 /* Allocate the space for range_start and range_end. */
2866 GET_BUFFER_SPACE (2);
2867 /* Update the pointer to indicate end of buffer. */
2868 b += 2;
2869 ret = wcs_compile_range (c, &p, pend, translate, syntax, b,
2870 laststart);
2871 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
2872 range_start = 0xffffffff;
2873 }
2874
2875 /* See if we're at the beginning of a possible character
2876 class. */
2877 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
2878 { /* Leave room for the null. */
2879 char str[CHAR_CLASS_MAX_LENGTH + 1];
2880
2881 PATFETCH (c);
2882 c1 = 0;
2883
2884 /* If pattern is `[[:'. */
2885 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2886
2887 for (;;)
2888 {
2889 PATFETCH (c);
2890 if ((c == ':' && *p == ']') || p == pend)
2891 break;
2892 if (c1 < CHAR_CLASS_MAX_LENGTH)
2893 str[c1++] = c;
2894 else
2895 /* This is in any case an invalid class name. */
2896 str[0] = '\0';
2897 }
2898 str[c1] = '\0';
2899
2900 /* If isn't a word bracketed by `[:' and `:]':
2901 undo the ending character, the letters, and leave
2902 the leading `:' and `[' (but store them as character). */
2903 if (c == ':' && *p == ']')
2904 {
2905 wctype_t wt;
2906 uintptr_t alignedp;
2907
2908 /* Query the character class as wctype_t. */
2909 wt = IS_CHAR_CLASS (str);
2910 if (wt == 0)
2911 FREE_STACK_RETURN (REG_ECTYPE);
2912
2913 /* Throw away the ] at the end of the character
2914 class. */
2915 PATFETCH (c);
2916
2917 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2918
2919 /* Allocate the space for character class. */
2920 GET_BUFFER_SPACE(CHAR_CLASS_SIZE);
2921 /* Update the pointer to indicate end of buffer. */
2922 b += CHAR_CLASS_SIZE;
2923 /* Move data which follow character classes
2924 not to violate the data. */
2925 insert_space(CHAR_CLASS_SIZE,
2926 laststart + 6 + laststart[1],
2927 b - 1);
2928 alignedp = ((uintptr_t)(laststart + 6 + laststart[1])
2929 + __alignof__(wctype_t) - 1)
2930 & ~(uintptr_t)(__alignof__(wctype_t) - 1);
2931 /* Store the character class. */
2932 *((wctype_t*)alignedp) = wt;
2933 /* Update length of char_classes */
2934 laststart[1] += CHAR_CLASS_SIZE;
2935
2936 had_char_class = true;
2937 }
2938 else
2939 {
2940 c1++;
2941 while (c1--)
2942 PATUNFETCH;
2943 BUF_PUSH ('[');
2944 BUF_PUSH (':');
2945 laststart[5] += 2; /* Update the length of characters */
2946 range_start = ':';
2947 had_char_class = false;
2948 }
2949 }
2950 else if (syntax & RE_CHAR_CLASSES && c == '[' && (*p == '='
2951 || *p == '.'))
2952 {
2953 CHAR_T str[128]; /* Should be large enough. */
2954 CHAR_T delim = *p; /* '=' or '.' */
2955 # ifdef _LIBC
2956 uint32_t nrules =
2957 _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2958 # endif
2959 PATFETCH (c);
2960 c1 = 0;
2961
2962 /* If pattern is `[[=' or '[[.'. */
2963 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2964
2965 for (;;)
2966 {
2967 PATFETCH (c);
2968 if ((c == delim && *p == ']') || p == pend)
2969 break;
2970 if (c1 < sizeof (str) - 1)
2971 str[c1++] = c;
2972 else
2973 /* This is in any case an invalid class name. */
2974 str[0] = '\0';
2975 }
2976 str[c1] = '\0';
2977
2978 if (c == delim && *p == ']' && str[0] != '\0')
2979 {
2980 unsigned int i, offset;
2981 /* If we have no collation data we use the default
2982 collation in which each character is in a class
2983 by itself. It also means that ASCII is the
2984 character set and therefore we cannot have character
2985 with more than one byte in the multibyte
2986 representation. */
2987
2988 /* If not defined _LIBC, we push the name and
2989 `\0' for the sake of matching performance. */
2990 int datasize = c1 + 1;
2991
2992 # ifdef _LIBC
2993 int32_t idx = 0;
2994 if (nrules == 0)
2995 # endif
2996 {
2997 if (c1 != 1)
2998 FREE_STACK_RETURN (REG_ECOLLATE);
2999 }
3000 # ifdef _LIBC
3001 else
3002 {
3003 const int32_t *table;
3004 const int32_t *weights;
3005 const int32_t *extra;
3006 const int32_t *indirect;
3007 wint_t *cp;
3008
3009 /* This #include defines a local function! */
3010 # include <locale/weightwc.h>
3011
3012 if(delim == '=')
3013 {
3014 /* We push the index for equivalence class. */
3015 cp = (wint_t*)str;
3016
3017 table = (const int32_t *)
3018 _NL_CURRENT (LC_COLLATE,
3019 _NL_COLLATE_TABLEWC);
3020 weights = (const int32_t *)
3021 _NL_CURRENT (LC_COLLATE,
3022 _NL_COLLATE_WEIGHTWC);
3023 extra = (const int32_t *)
3024 _NL_CURRENT (LC_COLLATE,
3025 _NL_COLLATE_EXTRAWC);
3026 indirect = (const int32_t *)
3027 _NL_CURRENT (LC_COLLATE,
3028 _NL_COLLATE_INDIRECTWC);
3029
3030 idx = findidx ((const wint_t**)&cp);
3031 if (idx == 0 || cp < (wint_t*) str + c1)
3032 /* This is no valid character. */
3033 FREE_STACK_RETURN (REG_ECOLLATE);
3034
3035 str[0] = (wchar_t)idx;
3036 }
3037 else /* delim == '.' */
3038 {
3039 /* We push collation sequence value
3040 for collating symbol. */
3041 int32_t table_size;
3042 const int32_t *symb_table;
3043 const unsigned char *extra;
3044 int32_t idx;
3045 int32_t elem;
3046 int32_t second;
3047 int32_t hash;
3048 char char_str[c1];
3049
3050 /* We have to convert the name to a single-byte
3051 string. This is possible since the names
3052 consist of ASCII characters and the internal
3053 representation is UCS4. */
3054 for (i = 0; i < c1; ++i)
3055 char_str[i] = str[i];
3056
3057 table_size =
3058 _NL_CURRENT_WORD (LC_COLLATE,
3059 _NL_COLLATE_SYMB_HASH_SIZEMB);
3060 symb_table = (const int32_t *)
3061 _NL_CURRENT (LC_COLLATE,
3062 _NL_COLLATE_SYMB_TABLEMB);
3063 extra = (const unsigned char *)
3064 _NL_CURRENT (LC_COLLATE,
3065 _NL_COLLATE_SYMB_EXTRAMB);
3066
3067 /* Locate the character in the hashing table. */
3068 hash = elem_hash (char_str, c1);
3069
3070 idx = 0;
3071 elem = hash % table_size;
3072 second = hash % (table_size - 2);
3073 while (symb_table[2 * elem] != 0)
3074 {
3075 /* First compare the hashing value. */
3076 if (symb_table[2 * elem] == hash
3077 && c1 == extra[symb_table[2 * elem + 1]]
3078 && memcmp (char_str,
3079 &extra[symb_table[2 * elem + 1]
3080 + 1], c1) == 0)
3081 {
3082 /* Yep, this is the entry. */
3083 idx = symb_table[2 * elem + 1];
3084 idx += 1 + extra[idx];
3085 break;
3086 }
3087
3088 /* Next entry. */
3089 elem += second;
3090 }
3091
3092 if (symb_table[2 * elem] != 0)
3093 {
3094 /* Compute the index of the byte sequence
3095 in the table. */
3096 idx += 1 + extra[idx];
3097 /* Adjust for the alignment. */
3098 idx = (idx + 3) & ~3;
3099
3100 str[0] = (wchar_t) idx + 4;
3101 }
3102 else if (symb_table[2 * elem] == 0 && c1 == 1)
3103 {
3104 /* No valid character. Match it as a
3105 single byte character. */
3106 had_char_class = false;
3107 BUF_PUSH(str[0]);
3108 /* Update the length of characters */
3109 laststart[5]++;
3110 range_start = str[0];
3111
3112 /* Throw away the ] at the end of the
3113 collating symbol. */
3114 PATFETCH (c);
3115 /* exit from the switch block. */
3116 continue;
3117 }
3118 else
3119 FREE_STACK_RETURN (REG_ECOLLATE);
3120 }
3121 datasize = 1;
3122 }
3123 # endif
3124 /* Throw away the ] at the end of the equivalence
3125 class (or collating symbol). */
3126 PATFETCH (c);
3127
3128 /* Allocate the space for the equivalence class
3129 (or collating symbol) (and '\0' if needed). */
3130 GET_BUFFER_SPACE(datasize);
3131 /* Update the pointer to indicate end of buffer. */
3132 b += datasize;
3133
3134 if (delim == '=')
3135 { /* equivalence class */
3136 /* Calculate the offset of char_ranges,
3137 which is next to equivalence_classes. */
3138 offset = laststart[1] + laststart[2]
3139 + laststart[3] +6;
3140 /* Insert space. */
3141 insert_space(datasize, laststart + offset, b - 1);
3142
3143 /* Write the equivalence_class and \0. */
3144 for (i = 0 ; i < datasize ; i++)
3145 laststart[offset + i] = str[i];
3146
3147 /* Update the length of equivalence_classes. */
3148 laststart[3] += datasize;
3149 had_char_class = true;
3150 }
3151 else /* delim == '.' */
3152 { /* collating symbol */
3153 /* Calculate the offset of the equivalence_classes,
3154 which is next to collating_symbols. */
3155 offset = laststart[1] + laststart[2] + 6;
3156 /* Insert space and write the collating_symbol
3157 and \0. */
3158 insert_space(datasize, laststart + offset, b-1);
3159 for (i = 0 ; i < datasize ; i++)
3160 laststart[offset + i] = str[i];
3161
3162 /* In re_match_2_internal if range_start < -1, we
3163 assume -range_start is the offset of the
3164 collating symbol which is specified as
3165 the character of the range start. So we assign
3166 -(laststart[1] + laststart[2] + 6) to
3167 range_start. */
3168 range_start = -(laststart[1] + laststart[2] + 6);
3169 /* Update the length of collating_symbol. */
3170 laststart[2] += datasize;
3171 had_char_class = false;
3172 }
3173 }
3174 else
3175 {
3176 c1++;
3177 while (c1--)
3178 PATUNFETCH;
3179 BUF_PUSH ('[');
3180 BUF_PUSH (delim);
3181 laststart[5] += 2; /* Update the length of characters */
3182 range_start = delim;
3183 had_char_class = false;
3184 }
3185 }
3186 else
3187 {
3188 had_char_class = false;
3189 BUF_PUSH(c);
3190 laststart[5]++; /* Update the length of characters */
3191 range_start = c;
3192 }
3193 }
3194
3195 #else /* BYTE */
3196 /* Ensure that we have enough space to push a charset: the
3197 opcode, the length count, and the bitset; 34 bytes in all. */
3198 GET_BUFFER_SPACE (34);
3199
3200 laststart = b;
3201
3202 /* We test `*p == '^' twice, instead of using an if
3203 statement, so we only need one BUF_PUSH. */
3204 BUF_PUSH (*p == '^' ? charset_not : charset);
3205 if (*p == '^')
3206 p++;
3207
3208 /* Remember the first position in the bracket expression. */
3209 p1 = p;
3210
3211 /* Push the number of bytes in the bitmap. */
3212 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
3213
3214 /* Clear the whole map. */
3215 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH);
3216
3217 /* charset_not matches newline according to a syntax bit. */
3218 if ((re_opcode_t) b[-2] == charset_not
3219 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
3220 SET_LIST_BIT ('\n');
3221
3222 /* Read in characters and ranges, setting map bits. */
3223 for (;;)
3224 {
3225 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3226
3227 PATFETCH (c);
3228
3229 /* \ might escape characters inside [...] and [^...]. */
3230 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
3231 {
3232 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
3233
3234 PATFETCH (c1);
3235 SET_LIST_BIT (c1);
3236 range_start = c1;
3237 continue;
3238 }
3239
3240 /* Could be the end of the bracket expression. If it's
3241 not (i.e., when the bracket expression is `[]' so
3242 far), the ']' character bit gets set way below. */
3243 if (c == ']' && p != p1 + 1)
3244 break;
3245
3246 /* Look ahead to see if it's a range when the last thing
3247 was a character class. */
3248 if (had_char_class && c == '-' && *p != ']')
3249 FREE_STACK_RETURN (REG_ERANGE);
3250
3251 /* Look ahead to see if it's a range when the last thing
3252 was a character: if this is a hyphen not at the
3253 beginning or the end of a list, then it's the range
3254 operator. */
3255 if (c == '-'
3256 && !(p - 2 >= pattern && p[-2] == '[')
3257 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
3258 && *p != ']')
3259 {
3260 reg_errcode_t ret
3261 = byte_compile_range (range_start, &p, pend, translate,
3262 syntax, b);
3263 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
3264 range_start = 0xffffffff;
3265 }
3266
3267 else if (p[0] == '-' && p[1] != ']')
3268 { /* This handles ranges made up of characters only. */
3269 reg_errcode_t ret;
3270
3271 /* Move past the `-'. */
3272 PATFETCH (c1);
3273
3274 ret = byte_compile_range (c, &p, pend, translate, syntax, b);
3275 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
3276 range_start = 0xffffffff;
3277 }
3278
3279 /* See if we're at the beginning of a possible character
3280 class. */
3281
3282 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
3283 { /* Leave room for the null. */
3284 char str[CHAR_CLASS_MAX_LENGTH + 1];
3285
3286 PATFETCH (c);
3287 c1 = 0;
3288
3289 /* If pattern is `[[:'. */
3290 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3291
3292 for (;;)
3293 {
3294 PATFETCH (c);
3295 if ((c == ':' && *p == ']') || p == pend)
3296 break;
3297 if (c1 < CHAR_CLASS_MAX_LENGTH)
3298 str[c1++] = c;
3299 else
3300 /* This is in any case an invalid class name. */
3301 str[0] = '\0';
3302 }
3303 str[c1] = '\0';
3304
3305 /* If isn't a word bracketed by `[:' and `:]':
3306 undo the ending character, the letters, and leave
3307 the leading `:' and `[' (but set bits for them). */
3308 if (c == ':' && *p == ']')
3309 {
3310 # if defined _LIBC || WIDE_CHAR_SUPPORT
3311 boolean is_lower = STREQ (str, "lower");
3312 boolean is_upper = STREQ (str, "upper");
3313 wctype_t wt;
3314 int ch;
3315
3316 wt = IS_CHAR_CLASS (str);
3317 if (wt == 0)
3318 FREE_STACK_RETURN (REG_ECTYPE);
3319
3320 /* Throw away the ] at the end of the character
3321 class. */
3322 PATFETCH (c);
3323
3324 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3325
3326 for (ch = 0; ch < 1 << BYTEWIDTH; ++ch)
3327 {
3328 if (iswctype (btowc (ch), wt))
3329 SET_LIST_BIT (ch);
3330
3331 if (translate && (is_upper || is_lower)
3332 && (ISUPPER (ch) || ISLOWER (ch)))
3333 SET_LIST_BIT (ch);
3334 }
3335
3336 had_char_class = true;
3337 # else
3338 int ch;
3339 boolean is_alnum = STREQ (str, "alnum");
3340 boolean is_alpha = STREQ (str, "alpha");
3341 boolean is_blank = STREQ (str, "blank");
3342 boolean is_cntrl = STREQ (str, "cntrl");
3343 boolean is_digit = STREQ (str, "digit");
3344 boolean is_graph = STREQ (str, "graph");
3345 boolean is_lower = STREQ (str, "lower");
3346 boolean is_print = STREQ (str, "print");
3347 boolean is_punct = STREQ (str, "punct");
3348 boolean is_space = STREQ (str, "space");
3349 boolean is_upper = STREQ (str, "upper");
3350 boolean is_xdigit = STREQ (str, "xdigit");
3351
3352 if (!IS_CHAR_CLASS (str))
3353 FREE_STACK_RETURN (REG_ECTYPE);
3354
3355 /* Throw away the ] at the end of the character
3356 class. */
3357 PATFETCH (c);
3358
3359 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3360
3361 for (ch = 0; ch < 1 << BYTEWIDTH; ch++)
3362 {
3363 /* This was split into 3 if's to
3364 avoid an arbitrary limit in some compiler. */
3365 if ( (is_alnum && ISALNUM (ch))
3366 || (is_alpha && ISALPHA (ch))
3367 || (is_blank && ISBLANK (ch))
3368 || (is_cntrl && ISCNTRL (ch)))
3369 SET_LIST_BIT (ch);
3370 if ( (is_digit && ISDIGIT (ch))
3371 || (is_graph && ISGRAPH (ch))
3372 || (is_lower && ISLOWER (ch))
3373 || (is_print && ISPRINT (ch)))
3374 SET_LIST_BIT (ch);
3375 if ( (is_punct && ISPUNCT (ch))
3376 || (is_space && ISSPACE (ch))
3377 || (is_upper && ISUPPER (ch))
3378 || (is_xdigit && ISXDIGIT (ch)))
3379 SET_LIST_BIT (ch);
3380 if ( translate && (is_upper || is_lower)
3381 && (ISUPPER (ch) || ISLOWER (ch)))
3382 SET_LIST_BIT (ch);
3383 }
3384 had_char_class = true;
3385 # endif /* libc || wctype.h */
3386 }
3387 else
3388 {
3389 c1++;
3390 while (c1--)
3391 PATUNFETCH;
3392 SET_LIST_BIT ('[');
3393 SET_LIST_BIT (':');
3394 range_start = ':';
3395 had_char_class = false;
3396 }
3397 }
3398 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == '=')
3399 {
3400 unsigned char str[MB_LEN_MAX + 1];
3401 # ifdef _LIBC
3402 uint32_t nrules =
3403 _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3404 # endif
3405
3406 PATFETCH (c);
3407 c1 = 0;
3408
3409 /* If pattern is `[[='. */
3410 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3411
3412 for (;;)
3413 {
3414 PATFETCH (c);
3415 if ((c == '=' && *p == ']') || p == pend)
3416 break;
3417 if (c1 < MB_LEN_MAX)
3418 str[c1++] = c;
3419 else
3420 /* This is in any case an invalid class name. */
3421 str[0] = '\0';
3422 }
3423 str[c1] = '\0';
3424
3425 if (c == '=' && *p == ']' && str[0] != '\0')
3426 {
3427 /* If we have no collation data we use the default
3428 collation in which each character is in a class
3429 by itself. It also means that ASCII is the
3430 character set and therefore we cannot have character
3431 with more than one byte in the multibyte
3432 representation. */
3433 # ifdef _LIBC
3434 if (nrules == 0)
3435 # endif
3436 {
3437 if (c1 != 1)
3438 FREE_STACK_RETURN (REG_ECOLLATE);
3439
3440 /* Throw away the ] at the end of the equivalence
3441 class. */
3442 PATFETCH (c);
3443
3444 /* Set the bit for the character. */
3445 SET_LIST_BIT (str[0]);
3446 }
3447 # ifdef _LIBC
3448 else
3449 {
3450 /* Try to match the byte sequence in `str' against
3451 those known to the collate implementation.
3452 First find out whether the bytes in `str' are
3453 actually from exactly one character. */
3454 const int32_t *table;
3455 const unsigned char *weights;
3456 const unsigned char *extra;
3457 const int32_t *indirect;
3458 int32_t idx;
3459 const unsigned char *cp = str;
3460 int ch;
3461
3462 /* This #include defines a local function! */
3463 # include <locale/weight.h>
3464
3465 table = (const int32_t *)
3466 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3467 weights = (const unsigned char *)
3468 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3469 extra = (const unsigned char *)
3470 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3471 indirect = (const int32_t *)
3472 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3473
3474 idx = findidx (&cp);
3475 if (idx == 0 || cp < str + c1)
3476 /* This is no valid character. */
3477 FREE_STACK_RETURN (REG_ECOLLATE);
3478
3479 /* Throw away the ] at the end of the equivalence
3480 class. */
3481 PATFETCH (c);
3482
3483 /* Now we have to go through the whole table
3484 and find all characters which have the same
3485 first level weight.
3486
3487 XXX Note that this is not entirely correct.
3488 we would have to match multibyte sequences
3489 but this is not possible with the current
3490 implementation. */
3491 for (ch = 1; ch < 256; ++ch)
3492 /* XXX This test would have to be changed if we
3493 would allow matching multibyte sequences. */
3494 if (table[ch] > 0)
3495 {
3496 int32_t idx2 = table[ch];
3497 size_t len = weights[idx2];
3498
3499 /* Test whether the lengths match. */
3500 if (weights[idx] == len)
3501 {
3502 /* They do. Now compare the bytes of
3503 the weight. */
3504 size_t cnt = 0;
3505
3506 while (cnt < len
3507 && (weights[idx + 1 + cnt]
3508 == weights[idx2 + 1 + cnt]))
3509 ++cnt;
3510
3511 if (cnt == len)
3512 /* They match. Mark the character as
3513 acceptable. */
3514 SET_LIST_BIT (ch);
3515 }
3516 }
3517 }
3518 # endif
3519 had_char_class = true;
3520 }
3521 else
3522 {
3523 c1++;
3524 while (c1--)
3525 PATUNFETCH;
3526 SET_LIST_BIT ('[');
3527 SET_LIST_BIT ('=');
3528 range_start = '=';
3529 had_char_class = false;
3530 }
3531 }
3532 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == '.')
3533 {
3534 unsigned char str[128]; /* Should be large enough. */
3535 # ifdef _LIBC
3536 uint32_t nrules =
3537 _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3538 # endif
3539
3540 PATFETCH (c);
3541 c1 = 0;
3542
3543 /* If pattern is `[[.'. */
3544 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3545
3546 for (;;)
3547 {
3548 PATFETCH (c);
3549 if ((c == '.' && *p == ']') || p == pend)
3550 break;
3551 if (c1 < sizeof (str))
3552 str[c1++] = c;
3553 else
3554 /* This is in any case an invalid class name. */
3555 str[0] = '\0';
3556 }
3557 str[c1] = '\0';
3558
3559 if (c == '.' && *p == ']' && str[0] != '\0')
3560 {
3561 /* If we have no collation data we use the default
3562 collation in which each character is the name
3563 for its own class which contains only the one
3564 character. It also means that ASCII is the
3565 character set and therefore we cannot have character
3566 with more than one byte in the multibyte
3567 representation. */
3568 # ifdef _LIBC
3569 if (nrules == 0)
3570 # endif
3571 {
3572 if (c1 != 1)
3573 FREE_STACK_RETURN (REG_ECOLLATE);
3574
3575 /* Throw away the ] at the end of the collating
3576 symbol. */
3577 PATFETCH (c);
3578
3579 /* Set the bit for the character. */
3580 SET_LIST_BIT (str[0]);
3581 range_start = ((const unsigned char *) str)[0];
3582 }
3583 # ifdef _LIBC
3584 else
3585 {
3586 /* Try to match the byte sequence in `str' against
3587 those known to the collate implementation.
3588 First find out whether the bytes in `str' are
3589 actually from exactly one character. */
3590 int32_t table_size;
3591 const int32_t *symb_table;
3592 const unsigned char *extra;
3593 int32_t idx;
3594 int32_t elem;
3595 int32_t second;
3596 int32_t hash;
3597
3598 table_size =
3599 _NL_CURRENT_WORD (LC_COLLATE,
3600 _NL_COLLATE_SYMB_HASH_SIZEMB);
3601 symb_table = (const int32_t *)
3602 _NL_CURRENT (LC_COLLATE,
3603 _NL_COLLATE_SYMB_TABLEMB);
3604 extra = (const unsigned char *)
3605 _NL_CURRENT (LC_COLLATE,
3606 _NL_COLLATE_SYMB_EXTRAMB);
3607
3608 /* Locate the character in the hashing table. */
3609 hash = elem_hash (str, c1);
3610
3611 idx = 0;
3612 elem = hash % table_size;
3613 second = hash % (table_size - 2);
3614 while (symb_table[2 * elem] != 0)
3615 {
3616 /* First compare the hashing value. */
3617 if (symb_table[2 * elem] == hash
3618 && c1 == extra[symb_table[2 * elem + 1]]
3619 && memcmp (str,
3620 &extra[symb_table[2 * elem + 1]
3621 + 1],
3622 c1) == 0)
3623 {
3624 /* Yep, this is the entry. */
3625 idx = symb_table[2 * elem + 1];
3626 idx += 1 + extra[idx];
3627 break;
3628 }
3629
3630 /* Next entry. */
3631 elem += second;
3632 }
3633
3634 if (symb_table[2 * elem] == 0)
3635 /* This is no valid character. */
3636 FREE_STACK_RETURN (REG_ECOLLATE);
3637
3638 /* Throw away the ] at the end of the collating
3639 symbol. */
3640 PATFETCH (c);
3641
3642 /* Now add the multibyte character(s) we found
3643 to the accept list.
3644
3645 XXX Note that this is not entirely correct.
3646 we would have to match multibyte sequences
3647 but this is not possible with the current
3648 implementation. Also, we have to match
3649 collating symbols, which expand to more than
3650 one byte, as a whole and not allow the
3651 individual bytes. */
3652 c1 = extra[idx++];
3653 if (c1 == 1)
3654 range_start = extra[idx];
3655 while (c1-- > 0)
3656 {
3657 SET_LIST_BIT (extra[idx]);
3658 ++idx;
3659 }
3660 }
3661 # endif
3662 had_char_class = false;
3663 }
3664 else
3665 {
3666 c1++;
3667 while (c1--)
3668 PATUNFETCH;
3669 SET_LIST_BIT ('[');
3670 SET_LIST_BIT ('.');
3671 range_start = '.';
3672 had_char_class = false;
3673 }
3674 }
3675 else
3676 {
3677 had_char_class = false;
3678 SET_LIST_BIT (c);
3679 range_start = c;
3680 }
3681 }
3682
3683 /* Discard any (non)matching list bytes that are all 0 at the
3684 end of the map. Decrease the map-length byte too. */
3685 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
3686 b[-1]--;
3687 b += b[-1];
3688 #endif /* WCHAR */
3689 }
3690 break;
3691
3692
3693 case '(':
3694 if (syntax & RE_NO_BK_PARENS)
3695 goto handle_open;
3696 else
3697 goto normal_char;
3698
3699
3700 case ')':
3701 if (syntax & RE_NO_BK_PARENS)
3702 goto handle_close;
3703 else
3704 goto normal_char;
3705
3706
3707 case '\n':
3708 if (syntax & RE_NEWLINE_ALT)
3709 goto handle_alt;
3710 else
3711 goto normal_char;
3712
3713
3714 case '|':
3715 if (syntax & RE_NO_BK_VBAR)
3716 goto handle_alt;
3717 else
3718 goto normal_char;
3719
3720
3721 case '{':
3722 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
3723 goto handle_interval;
3724 else
3725 goto normal_char;
3726
3727
3728 case '\\':
3729 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
3730
3731 /* Do not translate the character after the \, so that we can
3732 distinguish, e.g., \B from \b, even if we normally would
3733 translate, e.g., B to b. */
3734 PATFETCH_RAW (c);
3735
3736 switch (c)
3737 {
3738 case '(':
3739 if (syntax & RE_NO_BK_PARENS)
3740 goto normal_backslash;
3741
3742 handle_open:
3743 bufp->re_nsub++;
3744 regnum++;
3745
3746 if (COMPILE_STACK_FULL)
3747 {
3748 RETALLOC (compile_stack.stack, compile_stack.size << 1,
3749 compile_stack_elt_t);
3750 if (compile_stack.stack == NULL) return REG_ESPACE;
3751
3752 compile_stack.size <<= 1;
3753 }
3754
3755 /* These are the values to restore when we hit end of this
3756 group. They are all relative offsets, so that if the
3757 whole pattern moves because of realloc, they will still
3758 be valid. */
3759 COMPILE_STACK_TOP.begalt_offset = begalt - COMPILED_BUFFER_VAR;
3760 COMPILE_STACK_TOP.fixup_alt_jump
3761 = fixup_alt_jump ? fixup_alt_jump - COMPILED_BUFFER_VAR + 1 : 0;
3762 COMPILE_STACK_TOP.laststart_offset = b - COMPILED_BUFFER_VAR;
3763 COMPILE_STACK_TOP.regnum = regnum;
3764
3765 /* We will eventually replace the 0 with the number of
3766 groups inner to this one. But do not push a
3767 start_memory for groups beyond the last one we can
3768 represent in the compiled pattern. */
3769 if (regnum <= MAX_REGNUM)
3770 {
3771 COMPILE_STACK_TOP.inner_group_offset = b
3772 - COMPILED_BUFFER_VAR + 2;
3773 BUF_PUSH_3 (start_memory, regnum, 0);
3774 }
3775
3776 compile_stack.avail++;
3777
3778 fixup_alt_jump = 0;
3779 laststart = 0;
3780 begalt = b;
3781 /* If we've reached MAX_REGNUM groups, then this open
3782 won't actually generate any code, so we'll have to
3783 clear pending_exact explicitly. */
3784 pending_exact = 0;
3785 break;
3786
3787
3788 case ')':
3789 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
3790
3791 if (COMPILE_STACK_EMPTY)
3792 {
3793 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3794 goto normal_backslash;
3795 else
3796 FREE_STACK_RETURN (REG_ERPAREN);
3797 }
3798
3799 handle_close:
3800 if (fixup_alt_jump)
3801 { /* Push a dummy failure point at the end of the
3802 alternative for a possible future
3803 `pop_failure_jump' to pop. See comments at
3804 `push_dummy_failure' in `re_match_2'. */
3805 BUF_PUSH (push_dummy_failure);
3806
3807 /* We allocated space for this jump when we assigned
3808 to `fixup_alt_jump', in the `handle_alt' case below. */
3809 STORE_JUMP (jump_past_alt, fixup_alt_jump, b - 1);
3810 }
3811
3812 /* See similar code for backslashed left paren above. */
3813 if (COMPILE_STACK_EMPTY)
3814 {
3815 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3816 goto normal_char;
3817 else
3818 FREE_STACK_RETURN (REG_ERPAREN);
3819 }
3820
3821 /* Since we just checked for an empty stack above, this
3822 ``can't happen''. */
3823 assert (compile_stack.avail != 0);
3824 {
3825 /* We don't just want to restore into `regnum', because
3826 later groups should continue to be numbered higher,
3827 as in `(ab)c(de)' -- the second group is #2. */
3828 regnum_t this_group_regnum;
3829
3830 compile_stack.avail--;
3831 begalt = COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.begalt_offset;
3832 fixup_alt_jump
3833 = COMPILE_STACK_TOP.fixup_alt_jump
3834 ? COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.fixup_alt_jump - 1
3835 : 0;
3836 laststart = COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.laststart_offset;
3837 this_group_regnum = COMPILE_STACK_TOP.regnum;
3838 /* If we've reached MAX_REGNUM groups, then this open
3839 won't actually generate any code, so we'll have to
3840 clear pending_exact explicitly. */
3841 pending_exact = 0;
3842
3843 /* We're at the end of the group, so now we know how many
3844 groups were inside this one. */
3845 if (this_group_regnum <= MAX_REGNUM)
3846 {
3847 UCHAR_T *inner_group_loc
3848 = COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.inner_group_offset;
3849
3850 *inner_group_loc = regnum - this_group_regnum;
3851 BUF_PUSH_3 (stop_memory, this_group_regnum,
3852 regnum - this_group_regnum);
3853 }
3854 }
3855 break;
3856
3857
3858 case '|': /* `\|'. */
3859 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
3860 goto normal_backslash;
3861 handle_alt:
3862 if (syntax & RE_LIMITED_OPS)
3863 goto normal_char;
3864
3865 /* Insert before the previous alternative a jump which
3866 jumps to this alternative if the former fails. */
3867 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
3868 INSERT_JUMP (on_failure_jump, begalt,
3869 b + 2 + 2 * OFFSET_ADDRESS_SIZE);
3870 pending_exact = 0;
3871 b += 1 + OFFSET_ADDRESS_SIZE;
3872
3873 /* The alternative before this one has a jump after it
3874 which gets executed if it gets matched. Adjust that
3875 jump so it will jump to this alternative's analogous
3876 jump (put in below, which in turn will jump to the next
3877 (if any) alternative's such jump, etc.). The last such
3878 jump jumps to the correct final destination. A picture:
3879 _____ _____
3880 | | | |
3881 | v | v
3882 a | b | c
3883
3884 If we are at `b', then fixup_alt_jump right now points to a
3885 three-byte space after `a'. We'll put in the jump, set
3886 fixup_alt_jump to right after `b', and leave behind three
3887 bytes which we'll fill in when we get to after `c'. */
3888
3889 if (fixup_alt_jump)
3890 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
3891
3892 /* Mark and leave space for a jump after this alternative,
3893 to be filled in later either by next alternative or
3894 when we know we're at the end of a series of alternatives. */
3895 fixup_alt_jump = b;
3896 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
3897 b += 1 + OFFSET_ADDRESS_SIZE;
3898
3899 laststart = 0;
3900 begalt = b;
3901 break;
3902
3903
3904 case '{':
3905 /* If \{ is a literal. */
3906 if (!(syntax & RE_INTERVALS)
3907 /* If we're at `\{' and it's not the open-interval
3908 operator. */
3909 || (syntax & RE_NO_BK_BRACES))
3910 goto normal_backslash;
3911
3912 handle_interval:
3913 {
3914 /* If got here, then the syntax allows intervals. */
3915
3916 /* At least (most) this many matches must be made. */
3917 int lower_bound = -1, upper_bound = -1;
3918
3919 /* Place in the uncompiled pattern (i.e., just after
3920 the '{') to go back to if the interval is invalid. */
3921 const CHAR_T *beg_interval = p;
3922
3923 if (p == pend)
3924 goto invalid_interval;
3925
3926 GET_UNSIGNED_NUMBER (lower_bound);
3927
3928 if (c == ',')
3929 {
3930 GET_UNSIGNED_NUMBER (upper_bound);
3931 if (upper_bound < 0)
3932 upper_bound = RE_DUP_MAX;
3933 }
3934 else
3935 /* Interval such as `{1}' => match exactly once. */
3936 upper_bound = lower_bound;
3937
3938 if (! (0 <= lower_bound && lower_bound <= upper_bound))
3939 goto invalid_interval;
3940
3941 if (!(syntax & RE_NO_BK_BRACES))
3942 {
3943 if (c != '\\' || p == pend)
3944 goto invalid_interval;
3945 PATFETCH (c);
3946 }
3947
3948 if (c != '}')
3949 goto invalid_interval;
3950
3951 /* If it's invalid to have no preceding re. */
3952 if (!laststart)
3953 {
3954 if (syntax & RE_CONTEXT_INVALID_OPS
3955 && !(syntax & RE_INVALID_INTERVAL_ORD))
3956 FREE_STACK_RETURN (REG_BADRPT);
3957 else if (syntax & RE_CONTEXT_INDEP_OPS)
3958 laststart = b;
3959 else
3960 goto unfetch_interval;
3961 }
3962
3963 /* We just parsed a valid interval. */
3964
3965 if (RE_DUP_MAX < upper_bound)
3966 FREE_STACK_RETURN (REG_BADBR);
3967
3968 /* If the upper bound is zero, don't want to succeed at
3969 all; jump from `laststart' to `b + 3', which will be
3970 the end of the buffer after we insert the jump. */
3971 /* ifdef WCHAR, 'b + 1 + OFFSET_ADDRESS_SIZE'
3972 instead of 'b + 3'. */
3973 if (upper_bound == 0)
3974 {
3975 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
3976 INSERT_JUMP (jump, laststart, b + 1
3977 + OFFSET_ADDRESS_SIZE);
3978 b += 1 + OFFSET_ADDRESS_SIZE;
3979 }
3980
3981 /* Otherwise, we have a nontrivial interval. When
3982 we're all done, the pattern will look like:
3983 set_number_at <jump count> <upper bound>
3984 set_number_at <succeed_n count> <lower bound>
3985 succeed_n <after jump addr> <succeed_n count>
3986 <body of loop>
3987 jump_n <succeed_n addr> <jump count>
3988 (The upper bound and `jump_n' are omitted if
3989 `upper_bound' is 1, though.) */
3990 else
3991 { /* If the upper bound is > 1, we need to insert
3992 more at the end of the loop. */
3993 unsigned nbytes = 2 + 4 * OFFSET_ADDRESS_SIZE +
3994 (upper_bound > 1) * (2 + 4 * OFFSET_ADDRESS_SIZE);
3995
3996 GET_BUFFER_SPACE (nbytes);
3997
3998 /* Initialize lower bound of the `succeed_n', even
3999 though it will be set during matching by its
4000 attendant `set_number_at' (inserted next),
4001 because `re_compile_fastmap' needs to know.
4002 Jump to the `jump_n' we might insert below. */
4003 INSERT_JUMP2 (succeed_n, laststart,
4004 b + 1 + 2 * OFFSET_ADDRESS_SIZE
4005 + (upper_bound > 1) * (1 + 2 * OFFSET_ADDRESS_SIZE)
4006 , lower_bound);
4007 b += 1 + 2 * OFFSET_ADDRESS_SIZE;
4008
4009 /* Code to initialize the lower bound. Insert
4010 before the `succeed_n'. The `5' is the last two
4011 bytes of this `set_number_at', plus 3 bytes of
4012 the following `succeed_n'. */
4013 /* ifdef WCHAR, The '1+2*OFFSET_ADDRESS_SIZE'
4014 is the 'set_number_at', plus '1+OFFSET_ADDRESS_SIZE'
4015 of the following `succeed_n'. */
4016 PREFIX(insert_op2) (set_number_at, laststart, 1
4017 + 2 * OFFSET_ADDRESS_SIZE, lower_bound, b);
4018 b += 1 + 2 * OFFSET_ADDRESS_SIZE;
4019
4020 if (upper_bound > 1)
4021 { /* More than one repetition is allowed, so
4022 append a backward jump to the `succeed_n'
4023 that starts this interval.
4024
4025 When we've reached this during matching,
4026 we'll have matched the interval once, so
4027 jump back only `upper_bound - 1' times. */
4028 STORE_JUMP2 (jump_n, b, laststart
4029 + 2 * OFFSET_ADDRESS_SIZE + 1,
4030 upper_bound - 1);
4031 b += 1 + 2 * OFFSET_ADDRESS_SIZE;
4032
4033 /* The location we want to set is the second
4034 parameter of the `jump_n'; that is `b-2' as
4035 an absolute address. `laststart' will be
4036 the `set_number_at' we're about to insert;
4037 `laststart+3' the number to set, the source
4038 for the relative address. But we are
4039 inserting into the middle of the pattern --
4040 so everything is getting moved up by 5.
4041 Conclusion: (b - 2) - (laststart + 3) + 5,
4042 i.e., b - laststart.
4043
4044 We insert this at the beginning of the loop
4045 so that if we fail during matching, we'll
4046 reinitialize the bounds. */
4047 PREFIX(insert_op2) (set_number_at, laststart,
4048 b - laststart,
4049 upper_bound - 1, b);
4050 b += 1 + 2 * OFFSET_ADDRESS_SIZE;
4051 }
4052 }
4053 pending_exact = 0;
4054 break;
4055
4056 invalid_interval:
4057 if (!(syntax & RE_INVALID_INTERVAL_ORD))
4058 FREE_STACK_RETURN (p == pend ? REG_EBRACE : REG_BADBR);
4059 unfetch_interval:
4060 /* Match the characters as literals. */
4061 p = beg_interval;
4062 c = '{';
4063 if (syntax & RE_NO_BK_BRACES)
4064 goto normal_char;
4065 else
4066 goto normal_backslash;
4067 }
4068
4069 #ifdef emacs
4070 /* There is no way to specify the before_dot and after_dot
4071 operators. rms says this is ok. --karl */
4072 case '=':
4073 BUF_PUSH (at_dot);
4074 break;
4075
4076 case 's':
4077 laststart = b;
4078 PATFETCH (c);
4079 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
4080 break;
4081
4082 case 'S':
4083 laststart = b;
4084 PATFETCH (c);
4085 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
4086 break;
4087 #endif /* emacs */
4088
4089
4090 case 'w':
4091 if (syntax & RE_NO_GNU_OPS)
4092 goto normal_char;
4093 laststart = b;
4094 BUF_PUSH (wordchar);
4095 break;
4096
4097
4098 case 'W':
4099 if (syntax & RE_NO_GNU_OPS)
4100 goto normal_char;
4101 laststart = b;
4102 BUF_PUSH (notwordchar);
4103 break;
4104
4105
4106 case '<':
4107 if (syntax & RE_NO_GNU_OPS)
4108 goto normal_char;
4109 BUF_PUSH (wordbeg);
4110 break;
4111
4112 case '>':
4113 if (syntax & RE_NO_GNU_OPS)
4114 goto normal_char;
4115 BUF_PUSH (wordend);
4116 break;
4117
4118 case 'b':
4119 if (syntax & RE_NO_GNU_OPS)
4120 goto normal_char;
4121 BUF_PUSH (wordbound);
4122 break;
4123
4124 case 'B':
4125 if (syntax & RE_NO_GNU_OPS)
4126 goto normal_char;
4127 BUF_PUSH (notwordbound);
4128 break;
4129
4130 case '`':
4131 if (syntax & RE_NO_GNU_OPS)
4132 goto normal_char;
4133 BUF_PUSH (begbuf);
4134 break;
4135
4136 case '\'':
4137 if (syntax & RE_NO_GNU_OPS)
4138 goto normal_char;
4139 BUF_PUSH (endbuf);
4140 break;
4141
4142 case '1': case '2': case '3': case '4': case '5':
4143 case '6': case '7': case '8': case '9':
4144 if (syntax & RE_NO_BK_REFS)
4145 goto normal_char;
4146
4147 c1 = c - '0';
4148
4149 if (c1 > regnum)
4150 FREE_STACK_RETURN (REG_ESUBREG);
4151
4152 /* Can't back reference to a subexpression if inside of it. */
4153 if (group_in_compile_stack (compile_stack, (regnum_t) c1))
4154 goto normal_char;
4155
4156 laststart = b;
4157 BUF_PUSH_2 (duplicate, c1);
4158 break;
4159
4160
4161 case '+':
4162 case '?':
4163 if (syntax & RE_BK_PLUS_QM)
4164 goto handle_plus;
4165 else
4166 goto normal_backslash;
4167
4168 default:
4169 normal_backslash:
4170 /* You might think it would be useful for \ to mean
4171 not to translate; but if we don't translate it
4172 it will never match anything. */
4173 c = TRANSLATE (c);
4174 goto normal_char;
4175 }
4176 break;
4177
4178
4179 default:
4180 /* Expects the character in `c'. */
4181 normal_char:
4182 /* If no exactn currently being built. */
4183 if (!pending_exact
4184 #ifdef WCHAR
4185 /* If last exactn handle binary(or character) and
4186 new exactn handle character(or binary). */
4187 || is_exactn_bin != is_binary[p - 1 - pattern]
4188 #endif /* WCHAR */
4189
4190 /* If last exactn not at current position. */
4191 || pending_exact + *pending_exact + 1 != b
4192
4193 /* We have only one byte following the exactn for the count. */
4194 || *pending_exact == (1 << BYTEWIDTH) - 1
4195
4196 /* If followed by a repetition operator. */
4197 || *p == '*' || *p == '^'
4198 || ((syntax & RE_BK_PLUS_QM)
4199 ? *p == '\\' && (p[1] == '+' || p[1] == '?')
4200 : (*p == '+' || *p == '?'))
4201 || ((syntax & RE_INTERVALS)
4202 && ((syntax & RE_NO_BK_BRACES)
4203 ? *p == '{'
4204 : (p[0] == '\\' && p[1] == '{'))))
4205 {
4206 /* Start building a new exactn. */
4207
4208 laststart = b;
4209
4210 #ifdef WCHAR
4211 /* Is this exactn binary data or character? */
4212 is_exactn_bin = is_binary[p - 1 - pattern];
4213 if (is_exactn_bin)
4214 BUF_PUSH_2 (exactn_bin, 0);
4215 else
4216 BUF_PUSH_2 (exactn, 0);
4217 #else
4218 BUF_PUSH_2 (exactn, 0);
4219 #endif /* WCHAR */
4220 pending_exact = b - 1;
4221 }
4222
4223 BUF_PUSH (c);
4224 (*pending_exact)++;
4225 break;
4226 } /* switch (c) */
4227 } /* while p != pend */
4228
4229
4230 /* Through the pattern now. */
4231
4232 if (fixup_alt_jump)
4233 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
4234
4235 if (!COMPILE_STACK_EMPTY)
4236 FREE_STACK_RETURN (REG_EPAREN);
4237
4238 /* If we don't want backtracking, force success
4239 the first time we reach the end of the compiled pattern. */
4240 if (syntax & RE_NO_POSIX_BACKTRACKING)
4241 BUF_PUSH (succeed);
4242
4243 #ifdef WCHAR
4244 free (pattern);
4245 free (mbs_offset);
4246 free (is_binary);
4247 #endif
4248 free (compile_stack.stack);
4249
4250 /* We have succeeded; set the length of the buffer. */
4251 #ifdef WCHAR
4252 bufp->used = (uintptr_t) b - (uintptr_t) COMPILED_BUFFER_VAR;
4253 #else
4254 bufp->used = b - bufp->buffer;
4255 #endif
4256
4257 #ifdef DEBUG
4258 if (debug)
4259 {
4260 DEBUG_PRINT1 ("\nCompiled pattern: \n");
4261 PREFIX(print_compiled_pattern) (bufp);
4262 }
4263 #endif /* DEBUG */
4264
4265 #ifndef MATCH_MAY_ALLOCATE
4266 /* Initialize the failure stack to the largest possible stack. This
4267 isn't necessary unless we're trying to avoid calling alloca in
4268 the search and match routines. */
4269 {
4270 int num_regs = bufp->re_nsub + 1;
4271
4272 /* Since DOUBLE_FAIL_STACK refuses to double only if the current size
4273 is strictly greater than re_max_failures, the largest possible stack
4274 is 2 * re_max_failures failure points. */
4275 if (fail_stack.size < (2 * re_max_failures * MAX_FAILURE_ITEMS))
4276 {
4277 fail_stack.size = (2 * re_max_failures * MAX_FAILURE_ITEMS);
4278
4279 # ifdef emacs
4280 if (! fail_stack.stack)
4281 fail_stack.stack
4282 = (PREFIX(fail_stack_elt_t) *) xmalloc (fail_stack.size
4283 * sizeof (PREFIX(fail_stack_elt_t)));
4284 else
4285 fail_stack.stack
4286 = (PREFIX(fail_stack_elt_t) *) xrealloc (fail_stack.stack,
4287 (fail_stack.size
4288 * sizeof (PREFIX(fail_stack_elt_t))));
4289 # else /* not emacs */
4290 if (! fail_stack.stack)
4291 fail_stack.stack
4292 = (PREFIX(fail_stack_elt_t) *) malloc (fail_stack.size
4293 * sizeof (PREFIX(fail_stack_elt_t)));
4294 else
4295 fail_stack.stack
4296 = (PREFIX(fail_stack_elt_t) *) realloc (fail_stack.stack,
4297 (fail_stack.size
4298 * sizeof (PREFIX(fail_stack_elt_t))));
4299 # endif /* not emacs */
4300 }
4301
4302 PREFIX(regex_grow_registers) (num_regs);
4303 }
4304 #endif /* not MATCH_MAY_ALLOCATE */
4305
4306 return REG_NOERROR;
4307 } /* regex_compile */
4308
4309 /* Subroutines for `regex_compile'. */
4310
4311 /* Store OP at LOC followed by two-byte integer parameter ARG. */
4312 /* ifdef WCHAR, integer parameter is 1 wchar_t. */
4313
4314 static void
4315 PREFIX(store_op1) (op, loc, arg)
4316 re_opcode_t op;
4317 UCHAR_T *loc;
4318 int arg;
4319 {
4320 *loc = (UCHAR_T) op;
4321 STORE_NUMBER (loc + 1, arg);
4322 }
4323
4324
4325 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
4326 /* ifdef WCHAR, integer parameter is 1 wchar_t. */
4327
4328 static void
4329 PREFIX(store_op2) (op, loc, arg1, arg2)
4330 re_opcode_t op;
4331 UCHAR_T *loc;
4332 int arg1, arg2;
4333 {
4334 *loc = (UCHAR_T) op;
4335 STORE_NUMBER (loc + 1, arg1);
4336 STORE_NUMBER (loc + 1 + OFFSET_ADDRESS_SIZE, arg2);
4337 }
4338
4339
4340 /* Copy the bytes from LOC to END to open up three bytes of space at LOC
4341 for OP followed by two-byte integer parameter ARG. */
4342 /* ifdef WCHAR, integer parameter is 1 wchar_t. */
4343
4344 static void
4345 PREFIX(insert_op1) (op, loc, arg, end)
4346 re_opcode_t op;
4347 UCHAR_T *loc;
4348 int arg;
4349 UCHAR_T *end;
4350 {
4351 register UCHAR_T *pfrom = end;
4352 register UCHAR_T *pto = end + 1 + OFFSET_ADDRESS_SIZE;
4353
4354 while (pfrom != loc)
4355 *--pto = *--pfrom;
4356
4357 PREFIX(store_op1) (op, loc, arg);
4358 }
4359
4360
4361 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
4362 /* ifdef WCHAR, integer parameter is 1 wchar_t. */
4363
4364 static void
4365 PREFIX(insert_op2) (op, loc, arg1, arg2, end)
4366 re_opcode_t op;
4367 UCHAR_T *loc;
4368 int arg1, arg2;
4369 UCHAR_T *end;
4370 {
4371 register UCHAR_T *pfrom = end;
4372 register UCHAR_T *pto = end + 1 + 2 * OFFSET_ADDRESS_SIZE;
4373
4374 while (pfrom != loc)
4375 *--pto = *--pfrom;
4376
4377 PREFIX(store_op2) (op, loc, arg1, arg2);
4378 }
4379
4380
4381 /* P points to just after a ^ in PATTERN. Return true if that ^ comes
4382 after an alternative or a begin-subexpression. We assume there is at
4383 least one character before the ^. */
4384
4385 static boolean
4386 PREFIX(at_begline_loc_p) (pattern, p, syntax)
4387 const CHAR_T *pattern, *p;
4388 reg_syntax_t syntax;
4389 {
4390 const CHAR_T *prev = p - 2;
4391 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
4392
4393 return
4394 /* After a subexpression? */
4395 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
4396 /* After an alternative? */
4397 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash));
4398 }
4399
4400
4401 /* The dual of at_begline_loc_p. This one is for $. We assume there is
4402 at least one character after the $, i.e., `P < PEND'. */
4403
4404 static boolean
4405 PREFIX(at_endline_loc_p) (p, pend, syntax)
4406 const CHAR_T *p, *pend;
4407 reg_syntax_t syntax;
4408 {
4409 const CHAR_T *next = p;
4410 boolean next_backslash = *next == '\\';
4411 const CHAR_T *next_next = p + 1 < pend ? p + 1 : 0;
4412
4413 return
4414 /* Before a subexpression? */
4415 (syntax & RE_NO_BK_PARENS ? *next == ')'
4416 : next_backslash && next_next && *next_next == ')')
4417 /* Before an alternative? */
4418 || (syntax & RE_NO_BK_VBAR ? *next == '|'
4419 : next_backslash && next_next && *next_next == '|');
4420 }
4421
4422 #else /* not INSIDE_RECURSION */
4423
4424 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and
4425 false if it's not. */
4426
4427 static boolean
4428 group_in_compile_stack (compile_stack, regnum)
4429 compile_stack_type compile_stack;
4430 regnum_t regnum;
4431 {
4432 int this_element;
4433
4434 for (this_element = compile_stack.avail - 1;
4435 this_element >= 0;
4436 this_element--)
4437 if (compile_stack.stack[this_element].regnum == regnum)
4438 return true;
4439
4440 return false;
4441 }
4442 #endif /* not INSIDE_RECURSION */
4443
4444 #ifdef INSIDE_RECURSION
4445
4446 #ifdef WCHAR
4447 /* This insert space, which size is "num", into the pattern at "loc".
4448 "end" must point the end of the allocated buffer. */
4449 static void
4450 insert_space (num, loc, end)
4451 int num;
4452 CHAR_T *loc;
4453 CHAR_T *end;
4454 {
4455 register CHAR_T *pto = end;
4456 register CHAR_T *pfrom = end - num;
4457
4458 while (pfrom >= loc)
4459 *pto-- = *pfrom--;
4460 }
4461 #endif /* WCHAR */
4462
4463 #ifdef WCHAR
4464 static reg_errcode_t
4465 wcs_compile_range (range_start_char, p_ptr, pend, translate, syntax, b,
4466 char_set)
4467 CHAR_T range_start_char;
4468 const CHAR_T **p_ptr, *pend;
4469 CHAR_T *char_set, *b;
4470 RE_TRANSLATE_TYPE translate;
4471 reg_syntax_t syntax;
4472 {
4473 const CHAR_T *p = *p_ptr;
4474 CHAR_T range_start, range_end;
4475 reg_errcode_t ret;
4476 # ifdef _LIBC
4477 uint32_t nrules;
4478 uint32_t start_val, end_val;
4479 # endif
4480 if (p == pend)
4481 return REG_ERANGE;
4482
4483 # ifdef _LIBC
4484 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
4485 if (nrules != 0)
4486 {
4487 const char *collseq = (const char *) _NL_CURRENT(LC_COLLATE,
4488 _NL_COLLATE_COLLSEQWC);
4489 const unsigned char *extra = (const unsigned char *)
4490 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
4491
4492 if (range_start_char < -1)
4493 {
4494 /* range_start is a collating symbol. */
4495 int32_t *wextra;
4496 /* Retreive the index and get collation sequence value. */
4497 wextra = (int32_t*)(extra + char_set[-range_start_char]);
4498 start_val = wextra[1 + *wextra];
4499 }
4500 else
4501 start_val = collseq_table_lookup(collseq, TRANSLATE(range_start_char));
4502
4503 end_val = collseq_table_lookup (collseq, TRANSLATE (p[0]));
4504
4505 /* Report an error if the range is empty and the syntax prohibits
4506 this. */
4507 ret = ((syntax & RE_NO_EMPTY_RANGES)
4508 && (start_val > end_val))? REG_ERANGE : REG_NOERROR;
4509
4510 /* Insert space to the end of the char_ranges. */
4511 insert_space(2, b - char_set[5] - 2, b - 1);
4512 *(b - char_set[5] - 2) = (wchar_t)start_val;
4513 *(b - char_set[5] - 1) = (wchar_t)end_val;
4514 char_set[4]++; /* ranges_index */
4515 }
4516 else
4517 # endif
4518 {
4519 range_start = (range_start_char >= 0)? TRANSLATE (range_start_char):
4520 range_start_char;
4521 range_end = TRANSLATE (p[0]);
4522 /* Report an error if the range is empty and the syntax prohibits
4523 this. */
4524 ret = ((syntax & RE_NO_EMPTY_RANGES)
4525 && (range_start > range_end))? REG_ERANGE : REG_NOERROR;
4526
4527 /* Insert space to the end of the char_ranges. */
4528 insert_space(2, b - char_set[5] - 2, b - 1);
4529 *(b - char_set[5] - 2) = range_start;
4530 *(b - char_set[5] - 1) = range_end;
4531 char_set[4]++; /* ranges_index */
4532 }
4533 /* Have to increment the pointer into the pattern string, so the
4534 caller isn't still at the ending character. */
4535 (*p_ptr)++;
4536
4537 return ret;
4538 }
4539 #else /* BYTE */
4540 /* Read the ending character of a range (in a bracket expression) from the
4541 uncompiled pattern *P_PTR (which ends at PEND). We assume the
4542 starting character is in `P[-2]'. (`P[-1]' is the character `-'.)
4543 Then we set the translation of all bits between the starting and
4544 ending characters (inclusive) in the compiled pattern B.
4545
4546 Return an error code.
4547
4548 We use these short variable names so we can use the same macros as
4549 `regex_compile' itself. */
4550
4551 static reg_errcode_t
4552 byte_compile_range (range_start_char, p_ptr, pend, translate, syntax, b)
4553 unsigned int range_start_char;
4554 const char **p_ptr, *pend;
4555 RE_TRANSLATE_TYPE translate;
4556 reg_syntax_t syntax;
4557 unsigned char *b;
4558 {
4559 unsigned this_char;
4560 const char *p = *p_ptr;
4561 reg_errcode_t ret;
4562 # if _LIBC
4563 const unsigned char *collseq;
4564 unsigned int start_colseq;
4565 unsigned int end_colseq;
4566 # else
4567 unsigned end_char;
4568 # endif
4569
4570 if (p == pend)
4571 return REG_ERANGE;
4572
4573 /* Have to increment the pointer into the pattern string, so the
4574 caller isn't still at the ending character. */
4575 (*p_ptr)++;
4576
4577 /* Report an error if the range is empty and the syntax prohibits this. */
4578 ret = syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
4579
4580 # if _LIBC
4581 collseq = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
4582 _NL_COLLATE_COLLSEQMB);
4583
4584 start_colseq = collseq[(unsigned char) TRANSLATE (range_start_char)];
4585 end_colseq = collseq[(unsigned char) TRANSLATE (p[0])];
4586 for (this_char = 0; this_char <= (unsigned char) -1; ++this_char)
4587 {
4588 unsigned int this_colseq = collseq[(unsigned char) TRANSLATE (this_char)];
4589
4590 if (start_colseq <= this_colseq && this_colseq <= end_colseq)
4591 {
4592 SET_LIST_BIT (TRANSLATE (this_char));
4593 ret = REG_NOERROR;
4594 }
4595 }
4596 # else
4597 /* Here we see why `this_char' has to be larger than an `unsigned
4598 char' -- we would otherwise go into an infinite loop, since all
4599 characters <= 0xff. */
4600 range_start_char = TRANSLATE (range_start_char);
4601 /* TRANSLATE(p[0]) is casted to char (not unsigned char) in TRANSLATE,
4602 and some compilers cast it to int implicitly, so following for_loop
4603 may fall to (almost) infinite loop.
4604 e.g. If translate[p[0]] = 0xff, end_char may equals to 0xffffffff.
4605 To avoid this, we cast p[0] to unsigned int and truncate it. */
4606 end_char = ((unsigned)TRANSLATE(p[0]) & ((1 << BYTEWIDTH) - 1));
4607
4608 for (this_char = range_start_char; this_char <= end_char; ++this_char)
4609 {
4610 SET_LIST_BIT (TRANSLATE (this_char));
4611 ret = REG_NOERROR;
4612 }
4613 # endif
4614
4615 return ret;
4616 }
4617 #endif /* WCHAR */
4618
4619 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
4621 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
4622 characters can start a string that matches the pattern. This fastmap
4623 is used by re_search to skip quickly over impossible starting points.
4624
4625 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
4626 area as BUFP->fastmap.
4627
4628 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
4629 the pattern buffer.
4630
4631 Returns 0 if we succeed, -2 if an internal error. */
4632
4633 #ifdef WCHAR
/* Local helper for re_compile_fastmap: reduce the wide character C to
   a single byte that can index the fastmap.

   C is converted with wcrtomb starting from an initial conversion
   state, and the first byte of the result is returned.  If the
   conversion fails (wcrtomb returns (size_t) -1) or produces no bytes,
   C itself truncated to `unsigned char' is returned instead.

   The definition is in prototype form: an old-style definition with a
   CHAR_T parameter is compatible with a prototype only when CHAR_T is
   unchanged by the default argument promotions.  */
static unsigned char
truncate_wchar (CHAR_T c)
{
  /* wcrtomb takes a `char *' buffer of at least MB_CUR_MAX bytes.  */
  char buf[MB_CUR_MAX];
  mbstate_t state;
  size_t len;

  memset (&state, '\0', sizeof (state));
  len = wcrtomb (buf, c, &state);

  if (len == (size_t) -1 || len == 0)
    return (unsigned char) c;

  return (unsigned char) buf[0];
}
4649 #endif /* WCHAR */
4650
/* Worker behind re_compile_fastmap for one character width.  Walks the
   compiled pattern in BUFP->buffer, following every alternative that
   was remembered on a local failure stack, and sets fastmap[c] for
   each byte C found to be able to start a match.  Also sets
   BUFP->fastmap_accurate and BUFP->can_be_null.  Returns 0 on success
   and -2 when PUSH_PATTERN_OP reports that the failure stack could not
   take another entry.  */
4651 static int
4652 PREFIX(re_compile_fastmap) (bufp)
4653 struct re_pattern_buffer *bufp;
4654 {
4655 int j, k;
4656 #ifdef MATCH_MAY_ALLOCATE
4657 PREFIX(fail_stack_type) fail_stack;
4658 #endif
4659 #ifndef REGEX_MALLOC
4660 char *destination;
4661 #endif
4662
4663 register char *fastmap = bufp->fastmap;
4664
4665 #ifdef WCHAR
4666 /* We need to cast pattern to (wchar_t*), because we casted this compiled
4667 pattern to (char*) in regex_compile. */
4668 UCHAR_T *pattern = (UCHAR_T*)bufp->buffer;
4669 register UCHAR_T *pend = (UCHAR_T*) (bufp->buffer + bufp->used);
4670 #else /* BYTE */
4671 UCHAR_T *pattern = bufp->buffer;
4672 register UCHAR_T *pend = pattern + bufp->used;
4673 #endif /* WCHAR */
4674 UCHAR_T *p = pattern;
4675
4676 #ifdef REL_ALLOC
4677 /* This holds the pointer to the failure stack, when
4678 it is allocated relocatably. */
4679 fail_stack_elt_t *failure_stack_ptr;
4680 #endif
4681
4682 /* Assume that each path through the pattern can be null until
4683 proven otherwise. We set this false at the bottom of switch
4684 statement, to which we get only if a particular path doesn't
4685 match the empty string. */
4686 boolean path_can_be_null = true;
4687
4688 /* We aren't doing a `succeed_n' to begin with. */
4689 boolean succeed_n_p = false;
4690
4691 assert (fastmap != NULL && p != NULL);
4692
4693 INIT_FAIL_STACK ();
4694 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */
4695 bufp->fastmap_accurate = 1; /* It will be when we're done. */
4696 bufp->can_be_null = 0;
4697
/* Each iteration either decodes one opcode of the current path or,
   once that path is finished, resumes at the next alternative saved on
   the failure stack.  */
4698 while (1)
4699 {
4700 if (p == pend || *p == succeed)
4701 {
4702 /* We have reached the (effective) end of pattern. */
4703 if (!FAIL_STACK_EMPTY ())
4704 {
4705 bufp->can_be_null |= path_can_be_null;
4706
4707 /* Reset for next path. */
4708 path_can_be_null = true;
4709
4710 p = fail_stack.stack[--fail_stack.avail].pointer;
4711
4712 continue;
4713 }
4714 else
4715 break;
4716 }
4717
4718 /* We should never be about to go beyond the end of the pattern. */
4719 assert (p < pend);
4720
/* Within the switch, `break' means the opcode consumes a character
   (this path is done), while `continue' means it can match the empty
   string and the walk goes on with the following opcode.  */
4721 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
4722 {
4723
4724 /* I guess the idea here is to simply not bother with a fastmap
4725 if a backreference is used, since it's too hard to figure out
4726 the fastmap for the corresponding group. Setting
4727 `can_be_null' stops `re_search_2' from using the fastmap, so
4728 that is all we do. */
4729 case duplicate:
4730 bufp->can_be_null = 1;
4731 goto done;
4732
4733
4734 /* Following are the cases which match a character. These end
4735 with `break'. */
4736
/* After the opcode, p[0] is the exactn count cell, so p[1] is the
   first literal character of the run.  */
4737 #ifdef WCHAR
4738 case exactn:
4739 fastmap[truncate_wchar(p[1])] = 1;
4740 break;
4741 #else /* BYTE */
4742 case exactn:
4743 fastmap[p[1]] = 1;
4744 break;
4745 #endif /* WCHAR */
4746 #ifdef MBS_SUPPORT
4747 case exactn_bin:
4748 fastmap[p[1]] = 1;
4749 break;
4750 #endif
4751
4752 #ifdef WCHAR
4753 /* It is hard to distinguish fastmap from (multi byte) characters
4754 which depends on current locale. */
4755 case charset:
4756 case charset_not:
4757 case wordchar:
4758 case notwordchar:
4759 bufp->can_be_null = 1;
4760 goto done;
4761 #else /* BYTE */
/* *p is the number of bitmap bytes that follow; every set bit J marks
   character J as a possible first character.  */
4762 case charset:
4763 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
4764 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
4765 fastmap[j] = 1;
4766 break;
4767
4768
4769 case charset_not:
4770 /* Chars beyond end of map must be allowed. */
4771 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
4772 fastmap[j] = 1;
4773
4774 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
4775 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
4776 fastmap[j] = 1;
4777 break;
4778
4779
4780 case wordchar:
4781 for (j = 0; j < (1 << BYTEWIDTH); j++)
4782 if (SYNTAX (j) == Sword)
4783 fastmap[j] = 1;
4784 break;
4785
4786
4787 case notwordchar:
4788 for (j = 0; j < (1 << BYTEWIDTH); j++)
4789 if (SYNTAX (j) != Sword)
4790 fastmap[j] = 1;
4791 break;
4792 #endif /* WCHAR */
4793
4794 case anychar:
4795 {
/* Remember the newline entry so it can be restored after the blanket
   fill below.  */
4796 int fastmap_newline = fastmap['\n'];
4797
4798 /* `.' matches anything ... */
4799 for (j = 0; j < (1 << BYTEWIDTH); j++)
4800 fastmap[j] = 1;
4801
4802 /* ... except perhaps newline. */
4803 if (!(bufp->syntax & RE_DOT_NEWLINE))
4804 fastmap['\n'] = fastmap_newline;
4805
4806 /* Return if we have already set `can_be_null'; if we have,
4807 then the fastmap is irrelevant. Something's wrong here. */
4808 else if (bufp->can_be_null)
4809 goto done;
4810
4811 /* Otherwise, have to check alternative paths. */
4812 break;
4813 }
4814
4815 #ifdef emacs
4816 case syntaxspec:
4817 k = *p++;
4818 for (j = 0; j < (1 << BYTEWIDTH); j++)
4819 if (SYNTAX (j) == (enum syntaxcode) k)
4820 fastmap[j] = 1;
4821 break;
4822
4823
4824 case notsyntaxspec:
4825 k = *p++;
4826 for (j = 0; j < (1 << BYTEWIDTH); j++)
4827 if (SYNTAX (j) != (enum syntaxcode) k)
4828 fastmap[j] = 1;
4829 break;
4830
4831
4832 /* All cases after this match the empty string. These end with
4833 `continue'. */
4834
4835
4836 case before_dot:
4837 case at_dot:
4838 case after_dot:
4839 continue;
4840 #endif /* emacs */
4841
4842
4843 case no_op:
4844 case begline:
4845 case endline:
4846 case begbuf:
4847 case endbuf:
4848 case wordbound:
4849 case notwordbound:
4850 case wordbeg:
4851 case wordend:
4852 case push_dummy_failure:
4853 continue;
4854
4855
/* All jump flavours: read the displacement into J and follow it.  */
4856 case jump_n:
4857 case pop_failure_jump:
4858 case maybe_pop_jump:
4859 case jump:
4860 case jump_past_alt:
4861 case dummy_failure_jump:
4862 EXTRACT_NUMBER_AND_INCR (j, p);
4863 p += j;
4864 if (j > 0)
4865 continue;
4866
4867 /* Jump backward implies we just went through the body of a
4868 loop and matched nothing. Opcode jumped to should be
4869 `on_failure_jump' or `succeed_n'. Just treat it like an
4870 ordinary jump. For a * loop, it has pushed its failure
4871 point already; if so, discard that as redundant. */
4872 if ((re_opcode_t) *p != on_failure_jump
4873 && (re_opcode_t) *p != succeed_n)
4874 continue;
4875
4876 p++;
4877 EXTRACT_NUMBER_AND_INCR (j, p);
4878 p += j;
4879
4880 /* If what's on the stack is where we are now, pop it. */
4881 if (!FAIL_STACK_EMPTY ()
4882 && fail_stack.stack[fail_stack.avail - 1].pointer == p)
4883 fail_stack.avail--;
4884
4885 continue;
4886
4887
4888 case on_failure_jump:
4889 case on_failure_keep_string_jump:
4890 handle_on_failure_jump:
4891 EXTRACT_NUMBER_AND_INCR (j, p);
4892
4893 /* For some patterns, e.g., `(a?)?', `p+j' here points to the
4894 end of the pattern. We don't want to push such a point,
4895 since when we restore it above, entering the switch will
4896 increment `p' past the end of the pattern. We don't need
4897 to push such a point since we obviously won't find any more
4898 fastmap entries beyond `pend'. Such a pattern can match
4899 the null string, though. */
4900 if (p + j < pend)
4901 {
4902 if (!PUSH_PATTERN_OP (p + j, fail_stack))
4903 {
4904 RESET_FAIL_STACK ();
4905 return -2;
4906 }
4907 }
4908 else
4909 bufp->can_be_null = 1;
4910
/* Entered via the `succeed_n' case below: the count operand still has
   to be stepped over.  */
4911 if (succeed_n_p)
4912 {
4913 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */
4914 succeed_n_p = false;
4915 }
4916
4917 continue;
4918
4919
4920 case succeed_n:
4921 /* Get to the number of times to succeed. */
4922 p += OFFSET_ADDRESS_SIZE;
4923
4924 /* Increment p past the n for when k != 0. */
4925 EXTRACT_NUMBER_AND_INCR (k, p);
4926 if (k == 0)
4927 {
/* A zero count is handled like on_failure_jump: rewind P to the jump
   operand and share that code.  */
4928 p -= 2 * OFFSET_ADDRESS_SIZE;
4929 succeed_n_p = true; /* Spaghetti code alert. */
4930 goto handle_on_failure_jump;
4931 }
4932 continue;
4933
4934
4935 case set_number_at:
4936 p += 2 * OFFSET_ADDRESS_SIZE;
4937 continue;
4938
4939
4940 case start_memory:
4941 case stop_memory:
4942 p += 2;
4943 continue;
4944
4945
4946 default:
4947 abort (); /* We have listed all the cases. */
4948 } /* switch *p++ */
4949
4950 /* Getting here means we have found the possible starting
4951 characters for one path of the pattern -- and that the empty
4952 string does not match. We need not follow this path further.
4953 Instead, look at the next alternative (remembered on the
4954 stack), or quit if no more. The test at the top of the loop
4955 does these things. */
4956 path_can_be_null = false;
4957 p = pend;
4958 } /* while p */
4959
4960 /* Set `can_be_null' for the last path (also the first path, if the
4961 pattern is empty). */
4962 bufp->can_be_null |= path_can_be_null;
4963
4964 done:
4965 RESET_FAIL_STACK ();
4966 return 0;
4967 }
4968
4969 #else /* not INSIDE_RECURSION */
4970
/* Public entry point: compute the fastmap for BUFP with the worker
   that matches the current locale -- the wide-character one when
   MBS_SUPPORT is compiled in and MB_CUR_MAX is not 1, the byte one
   otherwise -- and return that worker's result.  */
int
re_compile_fastmap (bufp)
     struct re_pattern_buffer *bufp;
{
# ifdef MBS_SUPPORT
  if (MB_CUR_MAX != 1)
    return wcs_re_compile_fastmap(bufp);
# endif

  return byte_re_compile_fastmap(bufp);
} /* re_compile_fastmap */
4982 #ifdef _LIBC
4983 weak_alias (__re_compile_fastmap, re_compile_fastmap)
4984 #endif
4985
4986
4988 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
4989 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
4990 this memory for recording register information. STARTS and ENDS
4991 must be allocated using the malloc library routine, and must each
4992 be at least NUM_REGS * sizeof (regoff_t) bytes long.
4993
4994 If NUM_REGS == 0, then subsequent matches should allocate their own
4995 register data.
4996
4997 Unless this function is called, the first search or match using
4998 PATTERN_BUFFER will allocate its own register data, without
4999 freeing the old data. */
5000
5001 void
5002 re_set_registers (bufp, regs, num_regs, starts, ends)
5003 struct re_pattern_buffer *bufp;
5004 struct re_registers *regs;
5005 unsigned num_regs;
5006 regoff_t *starts, *ends;
5007 {
5008 if (num_regs)
5009 {
5010 bufp->regs_allocated = REGS_REALLOCATE;
5011 regs->num_regs = num_regs;
5012 regs->start = starts;
5013 regs->end = ends;
5014 }
5015 else
5016 {
5017 bufp->regs_allocated = REGS_UNALLOCATED;
5018 regs->num_regs = 0;
5019 regs->start = regs->end = (regoff_t *) 0;
5020 }
5021 }
5022 #ifdef _LIBC
5023 weak_alias (__re_set_registers, re_set_registers)
5024 #endif
5025
5026 /* Searching routines. */
5028
5029 /* Like re_search_2, below, but only one string is specified, and
5030 doesn't let you say where to stop matching. */
5031
5032 int
5033 re_search (bufp, string, size, startpos, range, regs)
5034 struct re_pattern_buffer *bufp;
5035 const char *string;
5036 int size, startpos, range;
5037 struct re_registers *regs;
5038 {
5039 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
5040 regs, size);
5041 }
5042 #ifdef _LIBC
5043 weak_alias (__re_search, re_search)
5044 #endif
5045
5046
5047 /* Using the compiled pattern in BUFP->buffer, first tries to match the
5048 virtual concatenation of STRING1 and STRING2, starting first at index
5049 STARTPOS, then at STARTPOS + 1, and so on.
5050
5051 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
5052
5053 RANGE is how far to scan while trying to match. RANGE = 0 means try
5054 only at STARTPOS; in general, the last start tried is STARTPOS +
5055 RANGE.
5056
5057 In REGS, return the indices of the virtual concatenation of STRING1
5058 and STRING2 that matched the entire BUFP->buffer and its contained
5059 subexpressions.
5060
5061 Do not consider matching one past the index STOP in the virtual
5062 concatenation of STRING1 and STRING2.
5063
5064 We return either the position in the strings at which the match was
5065 found, -1 if no match, or -2 if error (such as failure
5066 stack overflow). */
5067
int
re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop)
     struct re_pattern_buffer *bufp;
     const char *string1, *string2;
     int size1, size2;
     int startpos;
     int range;
     struct re_registers *regs;
     int stop;
{
  /* Hand the search to the worker that matches the current locale:
     the wide-character one when MBS_SUPPORT is compiled in and
     MB_CUR_MAX is not 1, the byte one otherwise.  */
# ifdef MBS_SUPPORT
  if (MB_CUR_MAX != 1)
    return wcs_re_search_2 (bufp, string1, size1, string2, size2, startpos,
			    range, regs, stop);
# endif

  return byte_re_search_2 (bufp, string1, size1, string2, size2, startpos,
			   range, regs, stop);
} /* re_search_2 */
5087 #ifdef _LIBC
5088 weak_alias (__re_search_2, re_search_2)
5089 #endif
5090
5091 #endif /* not INSIDE_RECURSION */
5092
5093 #ifdef INSIDE_RECURSION
5094
/* FREE_VAR (var) releases VAR if it is non-null -- through REGEX_FREE
   when MATCH_MAY_ALLOCATE is defined, through free otherwise -- and
   then sets it to NULL.
   NOTE(review): the expansion is an `if' statement followed by a
   separate assignment, not a single statement.  When it is used as the
   body of an unbraced `if' or `else', only the release is governed by
   that condition; the assignment to NULL always executes.  */
5095 #ifdef MATCH_MAY_ALLOCATE
5096 # define FREE_VAR(var) if (var) REGEX_FREE (var); var = NULL
5097 #else
5098 # define FREE_VAR(var) if (var) free (var); var = NULL
5099 #endif
5100
5101 #ifdef WCHAR
/* Threshold on the input size: buffers for strings longer than this
   are released with plain free in FREE_WCS_BUFFERS, shorter ones with
   FREE_VAR.  */
5102 # define MAX_ALLOCA_SIZE 2000
5103
/* Release the wide-character copies and offset tables of both input
   strings.  Expects size1, size2, wcs_string1, wcs_string2,
   mbs_offset1 and mbs_offset2 to be in scope at the point of use.  */
5104 # define FREE_WCS_BUFFERS() \
5105 do { \
5106 if (size1 > MAX_ALLOCA_SIZE) \
5107 { \
5108 free (wcs_string1); \
5109 free (mbs_offset1); \
5110 } \
5111 else \
5112 { \
5113 FREE_VAR (wcs_string1); \
5114 FREE_VAR (mbs_offset1); \
5115 } \
5116 if (size2 > MAX_ALLOCA_SIZE) \
5117 { \
5118 free (wcs_string2); \
5119 free (mbs_offset2); \
5120 } \
5121 else \
5122 { \
5123 FREE_VAR (wcs_string2); \
5124 FREE_VAR (mbs_offset2); \
5125 } \
5126 } while (0)
5127
5128 #endif
5129
5130
5131 static int
5132 PREFIX(re_search_2) (bufp, string1, size1, string2, size2, startpos, range,
5133 regs, stop)
5134 struct re_pattern_buffer *bufp;
5135 const char *string1, *string2;
5136 int size1, size2;
5137 int startpos;
5138 int range;
5139 struct re_registers *regs;
5140 int stop;
5141 {
5142 int val;
5143 register char *fastmap = bufp->fastmap;
5144 register RE_TRANSLATE_TYPE translate = bufp->translate;
5145 int total_size = size1 + size2;
5146 int endpos = startpos + range;
5147 #ifdef WCHAR
5148 /* We need wchar_t* buffers correspond to cstring1, cstring2. */
5149 wchar_t *wcs_string1 = NULL, *wcs_string2 = NULL;
5150 /* We need the size of wchar_t buffers correspond to csize1, csize2. */
5151 int wcs_size1 = 0, wcs_size2 = 0;
5152 /* Offset buffers for optimization. See convert_mbs_to_wcs. */
5153 int *mbs_offset1 = NULL, *mbs_offset2 = NULL;
5154 /* They hold whether each wchar_t is binary data or not. */
5155 char *is_binary = NULL;
5156 #endif /* WCHAR */
5157
5158 /* Check for out-of-range STARTPOS. */
5159 if (startpos < 0 || startpos > total_size)
5160 return -1;
5161
5162 /* Fix up RANGE if it might eventually take us outside
5163 the virtual concatenation of STRING1 and STRING2.
5164 Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */
5165 if (endpos < 0)
5166 range = 0 - startpos;
5167 else if (endpos > total_size)
5168 range = total_size - startpos;
5169
5170 /* If the search isn't to be a backwards one, don't waste time in a
5171 search for a pattern that must be anchored. */
5172 if (bufp->used > 0 && range > 0
5173 && ((re_opcode_t) bufp->buffer[0] == begbuf
5174 /* `begline' is like `begbuf' if it cannot match at newlines. */
5175 || ((re_opcode_t) bufp->buffer[0] == begline
5176 && !bufp->newline_anchor)))
5177 {
5178 if (startpos > 0)
5179 return -1;
5180 else
5181 range = 1;
5182 }
5183
5184 #ifdef emacs
5185 /* In a forward search for something that starts with \=.
5186 don't keep searching past point. */
5187 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
5188 {
5189 range = PT - startpos;
5190 if (range <= 0)
5191 return -1;
5192 }
5193 #endif /* emacs */
5194
5195 /* Update the fastmap now if not correct already. */
5196 if (fastmap && !bufp->fastmap_accurate)
5197 if (re_compile_fastmap (bufp) == -2)
5198 return -2;
5199
5200 #ifdef WCHAR
5201 /* Allocate wchar_t array for wcs_string1 and wcs_string2 and
5202 fill them with converted string. */
5203 if (size1 != 0)
5204 {
5205 if (size1 > MAX_ALLOCA_SIZE)
5206 {
5207 wcs_string1 = TALLOC (size1 + 1, CHAR_T);
5208 mbs_offset1 = TALLOC (size1 + 1, int);
5209 is_binary = TALLOC (size1 + 1, char);
5210 }
5211 else
5212 {
5213 wcs_string1 = REGEX_TALLOC (size1 + 1, CHAR_T);
5214 mbs_offset1 = REGEX_TALLOC (size1 + 1, int);
5215 is_binary = REGEX_TALLOC (size1 + 1, char);
5216 }
5217 if (!wcs_string1 || !mbs_offset1 || !is_binary)
5218 {
5219 if (size1 > MAX_ALLOCA_SIZE)
5220 {
5221 free (wcs_string1);
5222 free (mbs_offset1);
5223 free (is_binary);
5224 }
5225 else
5226 {
5227 FREE_VAR (wcs_string1);
5228 FREE_VAR (mbs_offset1);
5229 FREE_VAR (is_binary);
5230 }
5231 return -2;
5232 }
5233 wcs_size1 = convert_mbs_to_wcs(wcs_string1, string1, size1,
5234 mbs_offset1, is_binary);
5235 wcs_string1[wcs_size1] = L'\0'; /* for a sentinel */
5236 if (size1 > MAX_ALLOCA_SIZE)
5237 free (is_binary);
5238 else
5239 FREE_VAR (is_binary);
5240 }
5241 if (size2 != 0)
5242 {
5243 if (size2 > MAX_ALLOCA_SIZE)
5244 {
5245 wcs_string2 = TALLOC (size2 + 1, CHAR_T);
5246 mbs_offset2 = TALLOC (size2 + 1, int);
5247 is_binary = TALLOC (size2 + 1, char);
5248 }
5249 else
5250 {
5251 wcs_string2 = REGEX_TALLOC (size2 + 1, CHAR_T);
5252 mbs_offset2 = REGEX_TALLOC (size2 + 1, int);
5253 is_binary = REGEX_TALLOC (size2 + 1, char);
5254 }
5255 if (!wcs_string2 || !mbs_offset2 || !is_binary)
5256 {
5257 FREE_WCS_BUFFERS ();
5258 if (size2 > MAX_ALLOCA_SIZE)
5259 free (is_binary);
5260 else
5261 FREE_VAR (is_binary);
5262 return -2;
5263 }
5264 wcs_size2 = convert_mbs_to_wcs(wcs_string2, string2, size2,
5265 mbs_offset2, is_binary);
5266 wcs_string2[wcs_size2] = L'\0'; /* for a sentinel */
5267 if (size2 > MAX_ALLOCA_SIZE)
5268 free (is_binary);
5269 else
5270 FREE_VAR (is_binary);
5271 }
5272 #endif /* WCHAR */
5273
5274
5275 /* Loop through the string, looking for a place to start matching. */
5276 for (;;)
5277 {
5278 /* If a fastmap is supplied, skip quickly over characters that
5279 cannot be the start of a match. If the pattern can match the
5280 null string, however, we don't need to skip characters; we want
5281 the first null string. */
5282 if (fastmap && startpos < total_size && !bufp->can_be_null)
5283 {
5284 if (range > 0) /* Searching forwards. */
5285 {
5286 register const char *d;
5287 register int lim = 0;
5288 int irange = range;
5289
5290 if (startpos < size1 && startpos + range >= size1)
5291 lim = range - (size1 - startpos);
5292
5293 d = (startpos >= size1 ? string2 - size1 : string1) + startpos;
5294
5295 /* Written out as an if-else to avoid testing `translate'
5296 inside the loop. */
5297 if (translate)
5298 while (range > lim
5299 && !fastmap[(unsigned char)
5300 translate[(unsigned char) *d++]])
5301 range--;
5302 else
5303 while (range > lim && !fastmap[(unsigned char) *d++])
5304 range--;
5305
5306 startpos += irange - range;
5307 }
5308 else /* Searching backwards. */
5309 {
5310 register CHAR_T c = (size1 == 0 || startpos >= size1
5311 ? string2[startpos - size1]
5312 : string1[startpos]);
5313
5314 if (!fastmap[(unsigned char) TRANSLATE (c)])
5315 goto advance;
5316 }
5317 }
5318
5319 /* If can't match the null string, and that's all we have left, fail. */
5320 if (range >= 0 && startpos == total_size && fastmap
5321 && !bufp->can_be_null)
5322 {
5323 #ifdef WCHAR
5324 FREE_WCS_BUFFERS ();
5325 #endif
5326 return -1;
5327 }
5328
5329 #ifdef WCHAR
5330 val = wcs_re_match_2_internal (bufp, string1, size1, string2,
5331 size2, startpos, regs, stop,
5332 wcs_string1, wcs_size1,
5333 wcs_string2, wcs_size2,
5334 mbs_offset1, mbs_offset2);
5335 #else /* BYTE */
5336 val = byte_re_match_2_internal (bufp, string1, size1, string2,
5337 size2, startpos, regs, stop);
5338 #endif /* BYTE */
5339
5340 #ifndef REGEX_MALLOC
5341 # ifdef C_ALLOCA
5342 alloca (0);
5343 # endif
5344 #endif
5345
5346 if (val >= 0)
5347 {
5348 #ifdef WCHAR
5349 FREE_WCS_BUFFERS ();
5350 #endif
5351 return startpos;
5352 }
5353
5354 if (val == -2)
5355 {
5356 #ifdef WCHAR
5357 FREE_WCS_BUFFERS ();
5358 #endif
5359 return -2;
5360 }
5361
5362 advance:
5363 if (!range)
5364 break;
5365 else if (range > 0)
5366 {
5367 range--;
5368 startpos++;
5369 }
5370 else
5371 {
5372 range++;
5373 startpos--;
5374 }
5375 }
5376 #ifdef WCHAR
5377 FREE_WCS_BUFFERS ();
5378 #endif
5379 return -1;
5380 }
5381
#ifdef WCHAR
/* This converts PTR, a pointer into one of the search wchar_t strings
   `string1' and `string2', into a multibyte string offset.  The
   offset is looked up in mbs_offset1 or mbs_offset2, indexed by the
   distance of PTR from the start of its string; see
   convert_mbs_to_wcs.  When the relevant table is NULL the lookup
   yields 0.  An offset found for `string2' has csize1 added to it.  */
# define POINTER_TO_OFFSET(ptr) \
  (FIRST_STRING_P (ptr) \
   ? ((regoff_t)(mbs_offset1 != NULL? mbs_offset1[(ptr)-string1] : 0)) \
   : ((regoff_t)((mbs_offset2 != NULL? mbs_offset2[(ptr)-string2] : 0) \
                 + csize1)))
#else /* BYTE */
/* This converts PTR, a pointer into one of the search strings `string1'
   and `string2', into an offset: the distance of PTR from the start of
   its string, with size1 added when PTR lies in `string2'.  */
# define POINTER_TO_OFFSET(ptr) \
  (FIRST_STRING_P (ptr) \
   ? ((regoff_t) ((ptr) - string1)) \
   : ((regoff_t) ((ptr) - string2 + size1)))
#endif /* WCHAR */
5400
/* Macros for dealing with the split strings in re_match_2.  They all
   refer by name to variables of the function they are expanded in
   (d, dend, string1, string2, size1, size2, end2, end_match_1,
   end_match_2).  */

/* Nonzero while the current scan limit `dend' is the match limit of
   the first string.  */
#define MATCHING_IN_FIRST_STRING (dend == end_match_1)

/* Call before fetching a character with *d.  This switches over to
   string2 if necessary.  The expansion is a bare `while' statement
   that jumps to the label `fail' once the data in string2 is used
   up, so that label must exist wherever the macro is used.  */
#define PREFETCH() \
  while (d == dend) \
    { \
      /* End of string2 => fail. */ \
      if (dend == end_match_2) \
        goto fail; \
      /* End of string1 => advance to string2. */ \
      d = string2; \
      dend = end_match_2; \
    }

/* Test if at very beginning or at very end of the virtual concatenation
   of `string1' and `string2'.  If only one string, it's `string2'.
   Note that AT_STRINGS_BEG is also true, whatever D is, when size2
   is zero.  */
#define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
#define AT_STRINGS_END(d) ((d) == end2)
5422
5423
/* Test if D points to a character which is word-constituent.  We have
   two special cases to check for: if past the end of string1, look at
   the first character in string2; and if before the beginning of
   string2, look at the last character in string1.

   D appears several times in the expansion, so the argument must not
   have side effects.  */
#ifdef WCHAR
/* Use internationalized API instead of SYNTAX: the character is
   word-constituent when iswalnum accepts it or when it is L'_'.  */
# define WORDCHAR_P(d) \
  (iswalnum ((wint_t)((d) == end1 ? *string2 \
             : (d) == string2 - 1 ? *(end1 - 1) : *(d))) != 0 \
   || ((d) == end1 ? *string2 \
       : (d) == string2 - 1 ? *(end1 - 1) : *(d)) == L'_')
#else /* BYTE */
/* The character is word-constituent when SYNTAX gives Sword for it.  */
# define WORDCHAR_P(d) \
  (SYNTAX ((d) == end1 ? *string2 \
           : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
   == Sword)
#endif /* WCHAR */
5441
/* Disabled due to a compiler bug -- see comment at case wordbound */
#if 0
/* Test if the character before D and the one at D differ with respect
   to being word-constituent.  The test also succeeds when D is at the
   very beginning or the very end of the strings.  */
#define AT_WORD_BOUNDARY(d) \
  (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
   || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
#endif
5450
/* Free everything we malloc.

   FREE_VARIABLES () is invoked before the matcher returns.  It
   expands to one of four bodies:

     MATCH_MAY_ALLOCATE, WCHAR   free the failure stack and the
                                 register vectors, then the
                                 wide-character buffers;
     MATCH_MAY_ALLOCATE, BYTE    free the failure stack and the
                                 register vectors only;
     otherwise, WCHAR            free the wide-character buffers only;
     otherwise, BYTE             do nothing.

   In both WCHAR bodies the wide-character buffers (string1, string2,
   mbs_offset1, mbs_offset2) are released only when cant_free_wcs_buf
   is zero.  */
#ifdef MATCH_MAY_ALLOCATE
# ifdef WCHAR
#  define FREE_VARIABLES() \
  do { \
    REGEX_FREE_STACK (fail_stack.stack); \
    FREE_VAR (regstart); \
    FREE_VAR (regend); \
    FREE_VAR (old_regstart); \
    FREE_VAR (old_regend); \
    FREE_VAR (best_regstart); \
    FREE_VAR (best_regend); \
    FREE_VAR (reg_info); \
    FREE_VAR (reg_dummy); \
    FREE_VAR (reg_info_dummy); \
    if (!cant_free_wcs_buf) \
      { \
        FREE_VAR (string1); \
        FREE_VAR (string2); \
        FREE_VAR (mbs_offset1); \
        FREE_VAR (mbs_offset2); \
      } \
  } while (0)
# else /* BYTE */
#  define FREE_VARIABLES() \
  do { \
    REGEX_FREE_STACK (fail_stack.stack); \
    FREE_VAR (regstart); \
    FREE_VAR (regend); \
    FREE_VAR (old_regstart); \
    FREE_VAR (old_regend); \
    FREE_VAR (best_regstart); \
    FREE_VAR (best_regend); \
    FREE_VAR (reg_info); \
    FREE_VAR (reg_dummy); \
    FREE_VAR (reg_info_dummy); \
  } while (0)
# endif /* WCHAR */
#else
# ifdef WCHAR
#  define FREE_VARIABLES() \
  do { \
    if (!cant_free_wcs_buf) \
      { \
        FREE_VAR (string1); \
        FREE_VAR (string2); \
        FREE_VAR (mbs_offset1); \
        FREE_VAR (mbs_offset2); \
      } \
  } while (0)
# else /* BYTE */
#  define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
# endif /* WCHAR */
#endif /* not MATCH_MAY_ALLOCATE */
5505
/* These values must meet several constraints.  They must not be valid
   register values; since we have a limit of 255 registers (because
   we use only one byte in the pattern for the register number), we can
   use numbers larger than 255.  They must differ by 1, because of
   NUM_FAILURE_ITEMS above.  And the value for the lowest register must
   be larger than the value for the highest register, so we do not try
   to actually save any registers when none are active.  */

/* Marker for "no highest active register": the first value that does
   not fit in BYTEWIDTH bits.  */
#define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH)
/* Marker for "no lowest active register": exactly one more than
   NO_HIGHEST_ACTIVE_REG.  */
#define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1)
5515
5516 #else /* not INSIDE_RECURSION */
5518 /* Matching routines. */
5519
5520 #ifndef emacs /* Emacs never uses this. */
/* re_match is the single-string form of re_match_2.  It matches the
   compiled pattern in BUFP against STRING (SIZE bytes long), starting
   at POS and stopping at SIZE.  STRING is handed to the internal
   matcher as the second string, with a null, zero-length first
   string.  The internal matcher's result is returned unchanged.  */

int
re_match (bufp, string, size, pos, regs)
     struct re_pattern_buffer *bufp;
     const char *string;
     int size, pos;
     struct re_registers *regs;
{
  int match_status;

# ifdef MBS_SUPPORT
  /* MB_CUR_MAX == 1 selects the byte matcher; any other value selects
     the wide-character matcher, which builds its own wchar_t buffers
     because none are passed in.  */
  if (MB_CUR_MAX == 1)
    match_status = byte_re_match_2_internal (bufp, NULL, 0, string, size,
                                             pos, regs, size);
  else
    match_status = wcs_re_match_2_internal (bufp, NULL, 0, string, size,
                                            pos, regs, size,
                                            NULL, 0, NULL, 0, NULL, NULL);
# else
  match_status = byte_re_match_2_internal (bufp, NULL, 0, string, size,
                                           pos, regs, size);
# endif

# if !defined REGEX_MALLOC && defined C_ALLOCA
  /* Force an alloca cleanup now that the matcher has returned.  */
  alloca (0);
# endif

  return match_status;
}
5547 # ifdef _LIBC
5548 weak_alias (__re_match, re_match)
5549 # endif
5550 #endif /* not emacs */
5551
5552 #endif /* not INSIDE_RECURSION */
5553
5554 #ifdef INSIDE_RECURSION
/* Forward declarations of file-local helpers used by the matcher.
   Every name, and register_info_type, goes through PREFIX ().
   NOTE(review): PREFIX presumably selects the byte_ or wcs_ flavour of
   the current pass, and _RE_ARGS presumably hides the parameter lists
   from compilers without prototypes -- confirm against their
   definitions.  */
static boolean PREFIX(group_match_null_string_p) _RE_ARGS ((UCHAR_T **p,
                                                UCHAR_T *end,
                                PREFIX(register_info_type) *reg_info));
static boolean PREFIX(alt_match_null_string_p) _RE_ARGS ((UCHAR_T *p,
                                                UCHAR_T *end,
                                PREFIX(register_info_type) *reg_info));
static boolean PREFIX(common_op_match_null_string_p) _RE_ARGS ((UCHAR_T **p,
                                                UCHAR_T *end,
                                PREFIX(register_info_type) *reg_info));
static int PREFIX(bcmp_translate) _RE_ARGS ((const CHAR_T *s1, const CHAR_T *s2,
                                             int len, char *translate));
5566 #else /* not INSIDE_RECURSION */
5567
/* re_match_2 matches the compiled pattern in BUFP against the
   (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
   and SIZE2, respectively).  We start matching at POS, and stop
   matching at STOP.

   If REGS is non-null and the `no_sub' field of BUFP is zero, we
   store offsets for the substring each group matched in REGS.  See the
   documentation for exactly how many groups we fill.

   We return -1 if no match, -2 if an internal error (such as the
   failure stack overflowing).  Otherwise, we return the length of the
   matched substring.  */

int
re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
     struct re_pattern_buffer *bufp;
     const char *string1, *string2;
     int size1, size2;
     int pos;
     struct re_registers *regs;
     int stop;
{
  int match_status;

# ifdef MBS_SUPPORT
  /* MB_CUR_MAX == 1 selects the byte matcher; any other value selects
     the wide-character matcher, which builds its own wchar_t buffers
     because none are passed in.  */
  if (MB_CUR_MAX == 1)
    match_status = byte_re_match_2_internal (bufp, string1, size1,
                                             string2, size2,
                                             pos, regs, stop);
  else
    match_status = wcs_re_match_2_internal (bufp, string1, size1,
                                            string2, size2,
                                            pos, regs, stop,
                                            NULL, 0, NULL, 0, NULL, NULL);
# else
  match_status = byte_re_match_2_internal (bufp, string1, size1,
                                           string2, size2,
                                           pos, regs, stop);
# endif

# if !defined REGEX_MALLOC && defined C_ALLOCA
  /* Force an alloca cleanup now that the matcher has returned.  */
  alloca (0);
# endif

  return match_status;
}
5608 #ifdef _LIBC
5609 weak_alias (__re_match_2, re_match_2)
5610 #endif
5611
5612 #endif /* not INSIDE_RECURSION */
5613
5614 #ifdef INSIDE_RECURSION
5615
5616 #ifdef WCHAR
/* Forward declaration.  PARAMS drops the argument list on compilers
   without prototype support.  */
static int count_mbs_length PARAMS ((int *, int));
5618
/* Translate LENGTH, a byte offset into a multibyte string, into the
   number of wchar_t characters that the first LENGTH bytes occupy.
   OFFSET_BUFFER is the offset table describing that string (see
   convert_mbs_to_wcs); it must hold at least LENGTH + 1 entries,
   since entry LENGTH is read.

   Returns the index I for which OFFSET_BUFFER[I] == LENGTH.
   Returns 0 when OFFSET_BUFFER is null.
   Returns -1 when LENGTH is negative or no entry equals LENGTH.

   The lookup is a binary search, so it relies on the entries being
   in ascending order.  */

static int
count_mbs_length(offset_buffer, length)
     int *offset_buffer;
     int length;
{
  int lo, hi, mid;

  /* A negative size is never valid.  */
  if (length < 0)
    return -1;

  if (offset_buffer == NULL)
    return 0;

  /* Without multibyte characters every entry equals its own index,
     so the answer is LENGTH itself.  */
  if (offset_buffer[length] == length)
    return length;

  /* For all i, offset_buffer[i] >= i, so the index sought cannot
     exceed LENGTH.  */
  lo = 0;
  hi = length;

  for (mid = (lo + hi) / 2; mid != lo && mid != hi; mid = (lo + hi) / 2)
    {
      int byte_offset = offset_buffer[mid];

      if (byte_offset == length)
        return mid;

      if (byte_offset > length)
        hi = mid;
      else
        lo = mid;
    }

  /* The interval collapsed without an exact hit.  */
  return -1;
}
5662 #endif /* WCHAR */
5663
5664 /* This is a separate function so that we can force an alloca cleanup
5665 afterwards. */
5666 #ifdef WCHAR
5667 static int
5668 wcs_re_match_2_internal (bufp, cstring1, csize1, cstring2, csize2, pos,
5669 regs, stop, string1, size1, string2, size2,
5670 mbs_offset1, mbs_offset2)
5671 struct re_pattern_buffer *bufp;
5672 const char *cstring1, *cstring2;
5673 int csize1, csize2;
5674 int pos;
5675 struct re_registers *regs;
5676 int stop;
5677 /* string1 == string2 == NULL means string1/2, size1/2 and
5678 mbs_offset1/2 need seting up in this function. */
5679 /* We need wchar_t* buffers correspond to cstring1, cstring2. */
5680 wchar_t *string1, *string2;
5681 /* We need the size of wchar_t buffers correspond to csize1, csize2. */
5682 int size1, size2;
5683 /* offset buffer for optimizatoin. See convert_mbs_to_wc. */
5684 int *mbs_offset1, *mbs_offset2;
5685 #else /* BYTE */
5686 static int
5687 byte_re_match_2_internal (bufp, string1, size1,string2, size2, pos,
5688 regs, stop)
5689 struct re_pattern_buffer *bufp;
5690 const char *string1, *string2;
5691 int size1, size2;
5692 int pos;
5693 struct re_registers *regs;
5694 int stop;
5695 #endif /* BYTE */
5696 {
5697 /* General temporaries. */
5698 int mcnt;
5699 UCHAR_T *p1;
5700 #ifdef WCHAR
5701 /* They hold whether each wchar_t is binary data or not. */
5702 char *is_binary = NULL;
5703 /* If true, we can't free string1/2, mbs_offset1/2. */
5704 int cant_free_wcs_buf = 1;
5705 #endif /* WCHAR */
5706
5707 /* Just past the end of the corresponding string. */
5708 const CHAR_T *end1, *end2;
5709
5710 /* Pointers into string1 and string2, just past the last characters in
5711 each to consider matching. */
5712 const CHAR_T *end_match_1, *end_match_2;
5713
5714 /* Where we are in the data, and the end of the current string. */
5715 const CHAR_T *d, *dend;
5716
5717 /* Where we are in the pattern, and the end of the pattern. */
5718 #ifdef WCHAR
5719 UCHAR_T *pattern, *p;
5720 register UCHAR_T *pend;
5721 #else /* BYTE */
5722 UCHAR_T *p = bufp->buffer;
5723 register UCHAR_T *pend = p + bufp->used;
5724 #endif /* WCHAR */
5725
5726 /* Mark the opcode just after a start_memory, so we can test for an
5727 empty subpattern when we get to the stop_memory. */
5728 UCHAR_T *just_past_start_mem = 0;
5729
5730 /* We use this to map every character in the string. */
5731 RE_TRANSLATE_TYPE translate = bufp->translate;
5732
5733 /* Failure point stack. Each place that can handle a failure further
5734 down the line pushes a failure point on this stack. It consists of
5735 restart, regend, and reg_info for all registers corresponding to
5736 the subexpressions we're currently inside, plus the number of such
5737 registers, and, finally, two char *'s. The first char * is where
5738 to resume scanning the pattern; the second one is where to resume
5739 scanning the strings. If the latter is zero, the failure point is
5740 a ``dummy''; if a failure happens and the failure point is a dummy,
5741 it gets discarded and the next next one is tried. */
5742 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
5743 PREFIX(fail_stack_type) fail_stack;
5744 #endif
5745 #ifdef DEBUG
5746 static unsigned failure_id;
5747 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
5748 #endif
5749
5750 #ifdef REL_ALLOC
5751 /* This holds the pointer to the failure stack, when
5752 it is allocated relocatably. */
5753 fail_stack_elt_t *failure_stack_ptr;
5754 #endif
5755
5756 /* We fill all the registers internally, independent of what we
5757 return, for use in backreferences. The number here includes
5758 an element for register zero. */
5759 size_t num_regs = bufp->re_nsub + 1;
5760
5761 /* The currently active registers. */
5762 active_reg_t lowest_active_reg = NO_LOWEST_ACTIVE_REG;
5763 active_reg_t highest_active_reg = NO_HIGHEST_ACTIVE_REG;
5764
5765 /* Information on the contents of registers. These are pointers into
5766 the input strings; they record just what was matched (on this
5767 attempt) by a subexpression part of the pattern, that is, the
5768 regnum-th regstart pointer points to where in the pattern we began
5769 matching and the regnum-th regend points to right after where we
5770 stopped matching the regnum-th subexpression. (The zeroth register
5771 keeps track of what the whole pattern matches.) */
5772 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
5773 const CHAR_T **regstart, **regend;
5774 #endif
5775
5776 /* If a group that's operated upon by a repetition operator fails to
5777 match anything, then the register for its start will need to be
5778 restored because it will have been set to wherever in the string we
5779 are when we last see its open-group operator. Similarly for a
5780 register's end. */
5781 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
5782 const CHAR_T **old_regstart, **old_regend;
5783 #endif
5784
5785 /* The is_active field of reg_info helps us keep track of which (possibly
5786 nested) subexpressions we are currently in. The matched_something
5787 field of reg_info[reg_num] helps us tell whether or not we have
5788 matched any of the pattern so far this time through the reg_num-th
5789 subexpression. These two fields get reset each time through any
5790 loop their register is in. */
5791 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
5792 PREFIX(register_info_type) *reg_info;
5793 #endif
5794
5795 /* The following record the register info as found in the above
5796 variables when we find a match better than any we've seen before.
5797 This happens as we backtrack through the failure points, which in
5798 turn happens only if we have not yet matched the entire string. */
5799 unsigned best_regs_set = false;
5800 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
5801 const CHAR_T **best_regstart, **best_regend;
5802 #endif
5803
5804 /* Logically, this is `best_regend[0]'. But we don't want to have to
5805 allocate space for that if we're not allocating space for anything
5806 else (see below). Also, we never need info about register 0 for
5807 any of the other register vectors, and it seems rather a kludge to
5808 treat `best_regend' differently than the rest. So we keep track of
5809 the end of the best match so far in a separate variable. We
5810 initialize this to NULL so that when we backtrack the first time
5811 and need to test it, it's not garbage. */
5812 const CHAR_T *match_end = NULL;
5813
5814 /* This helps SET_REGS_MATCHED avoid doing redundant work. */
5815 int set_regs_matched_done = 0;
5816
5817 /* Used when we pop values we don't care about. */
5818 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
5819 const CHAR_T **reg_dummy;
5820 PREFIX(register_info_type) *reg_info_dummy;
5821 #endif
5822
5823 #ifdef DEBUG
5824 /* Counts the total number of registers pushed. */
5825 unsigned num_regs_pushed = 0;
5826 #endif
5827
5828 /* Definitions for state transitions. More efficiently for gcc. */
5829 #ifdef __GNUC__
5830 # if defined HAVE_SUBTRACT_LOCAL_LABELS && defined SHARED
5831 # define NEXT \
5832 do \
5833 { \
5834 int offset; \
5835 const void *__unbounded ptr; \
5836 offset = (p == pend \
5837 ? 0 : jmptable[SWITCH_ENUM_CAST ((re_opcode_t) *p++)]); \
5838 ptr = &&end_of_pattern + offset; \
5839 goto *ptr; \
5840 } \
5841 while (0)
5842 # define REF(x) \
5843 &&label_##x - &&end_of_pattern
5844 # define JUMP_TABLE_TYPE const int
5845 # else
5846 # define NEXT \
5847 do \
5848 { \
5849 const void *__unbounded ptr; \
5850 ptr = (p == pend ? &&end_of_pattern \
5851 : jmptable[SWITCH_ENUM_CAST ((re_opcode_t) *p++)]); \
5852 goto *ptr; \
5853 } \
5854 while (0)
5855 # define REF(x) \
5856 &&label_##x
5857 # define JUMP_TABLE_TYPE const void *const
5858 # endif
5859 # define CASE(x) label_##x
5860 static JUMP_TABLE_TYPE jmptable[] =
5861 {
5862 REF (no_op),
5863 REF (succeed),
5864 REF (exactn),
5865 # ifdef MBS_SUPPORT
5866 REF (exactn_bin),
5867 # endif
5868 REF (anychar),
5869 REF (charset),
5870 REF (charset_not),
5871 REF (start_memory),
5872 REF (stop_memory),
5873 REF (duplicate),
5874 REF (begline),
5875 REF (endline),
5876 REF (begbuf),
5877 REF (endbuf),
5878 REF (jump),
5879 REF (jump_past_alt),
5880 REF (on_failure_jump),
5881 REF (on_failure_keep_string_jump),
5882 REF (pop_failure_jump),
5883 REF (maybe_pop_jump),
5884 REF (dummy_failure_jump),
5885 REF (push_dummy_failure),
5886 REF (succeed_n),
5887 REF (jump_n),
5888 REF (set_number_at),
5889 REF (wordchar),
5890 REF (notwordchar),
5891 REF (wordbeg),
5892 REF (wordend),
5893 REF (wordbound),
5894 REF (notwordbound)
5895 # ifdef emacs
5896 ,REF (before_dot),
5897 REF (at_dot),
5898 REF (after_dot),
5899 REF (syntaxspec),
5900 REF (notsyntaxspec)
5901 # endif
5902 };
5903 #else
5904 # define NEXT \
5905 break
5906 # define CASE(x) \
5907 case x
5908 #endif
5909
5910 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
5911
5912 INIT_FAIL_STACK ();
5913
5914 #ifdef MATCH_MAY_ALLOCATE
5915 /* Do not bother to initialize all the register variables if there are
5916 no groups in the pattern, as it takes a fair amount of time. If
5917 there are groups, we include space for register 0 (the whole
5918 pattern), even though we never use it, since it simplifies the
5919 array indexing. We should fix this. */
5920 if (bufp->re_nsub)
5921 {
5922 regstart = REGEX_TALLOC (num_regs, const CHAR_T *);
5923 regend = REGEX_TALLOC (num_regs, const CHAR_T *);
5924 old_regstart = REGEX_TALLOC (num_regs, const CHAR_T *);
5925 old_regend = REGEX_TALLOC (num_regs, const CHAR_T *);
5926 best_regstart = REGEX_TALLOC (num_regs, const CHAR_T *);
5927 best_regend = REGEX_TALLOC (num_regs, const CHAR_T *);
5928 reg_info = REGEX_TALLOC (num_regs, PREFIX(register_info_type));
5929 reg_dummy = REGEX_TALLOC (num_regs, const CHAR_T *);
5930 reg_info_dummy = REGEX_TALLOC (num_regs, PREFIX(register_info_type));
5931
5932 if (!(regstart && regend && old_regstart && old_regend && reg_info
5933 && best_regstart && best_regend && reg_dummy && reg_info_dummy))
5934 {
5935 FREE_VARIABLES ();
5936 return -2;
5937 }
5938 }
5939 else
5940 {
5941 /* We must initialize all our variables to NULL, so that
5942 `FREE_VARIABLES' doesn't try to free them. */
5943 regstart = regend = old_regstart = old_regend = best_regstart
5944 = best_regend = reg_dummy = NULL;
5945 reg_info = reg_info_dummy = (PREFIX(register_info_type) *) NULL;
5946 }
5947 #endif /* MATCH_MAY_ALLOCATE */
5948
5949 /* The starting position is bogus. */
5950 #ifdef WCHAR
5951 if (pos < 0 || pos > csize1 + csize2)
5952 #else /* BYTE */
5953 if (pos < 0 || pos > size1 + size2)
5954 #endif
5955 {
5956 FREE_VARIABLES ();
5957 return -1;
5958 }
5959
5960 #ifdef WCHAR
5961 /* Allocate wchar_t array for string1 and string2 and
5962 fill them with converted string. */
5963 if (string1 == NULL && string2 == NULL)
5964 {
5965 /* We need seting up buffers here. */
5966
5967 /* We must free wcs buffers in this function. */
5968 cant_free_wcs_buf = 0;
5969
5970 if (csize1 != 0)
5971 {
5972 string1 = REGEX_TALLOC (csize1 + 1, CHAR_T);
5973 mbs_offset1 = REGEX_TALLOC (csize1 + 1, int);
5974 is_binary = REGEX_TALLOC (csize1 + 1, char);
5975 if (!string1 || !mbs_offset1 || !is_binary)
5976 {
5977 FREE_VAR (string1);
5978 FREE_VAR (mbs_offset1);
5979 FREE_VAR (is_binary);
5980 return -2;
5981 }
5982 }
5983 if (csize2 != 0)
5984 {
5985 string2 = REGEX_TALLOC (csize2 + 1, CHAR_T);
5986 mbs_offset2 = REGEX_TALLOC (csize2 + 1, int);
5987 is_binary = REGEX_TALLOC (csize2 + 1, char);
5988 if (!string2 || !mbs_offset2 || !is_binary)
5989 {
5990 FREE_VAR (string1);
5991 FREE_VAR (mbs_offset1);
5992 FREE_VAR (string2);
5993 FREE_VAR (mbs_offset2);
5994 FREE_VAR (is_binary);
5995 return -2;
5996 }
5997 size2 = convert_mbs_to_wcs(string2, cstring2, csize2,
5998 mbs_offset2, is_binary);
5999 string2[size2] = L'\0'; /* for a sentinel */
6000 FREE_VAR (is_binary);
6001 }
6002 }
6003
6004 /* We need to cast pattern to (wchar_t*), because we casted this compiled
6005 pattern to (char*) in regex_compile. */
6006 p = pattern = (CHAR_T*)bufp->buffer;
6007 pend = (CHAR_T*)(bufp->buffer + bufp->used);
6008
6009 #endif /* WCHAR */
6010
6011 /* Initialize subexpression text positions to -1 to mark ones that no
6012 start_memory/stop_memory has been seen for. Also initialize the
6013 register information struct. */
6014 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++)
6015 {
6016 regstart[mcnt] = regend[mcnt]
6017 = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE;
6018
6019 REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE;
6020 IS_ACTIVE (reg_info[mcnt]) = 0;
6021 MATCHED_SOMETHING (reg_info[mcnt]) = 0;
6022 EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0;
6023 }
6024
6025 /* We move `string1' into `string2' if the latter's empty -- but not if
6026 `string1' is null. */
6027 if (size2 == 0 && string1 != NULL)
6028 {
6029 string2 = string1;
6030 size2 = size1;
6031 string1 = 0;
6032 size1 = 0;
6033 #ifdef WCHAR
6034 mbs_offset2 = mbs_offset1;
6035 csize2 = csize1;
6036 mbs_offset1 = NULL;
6037 csize1 = 0;
6038 #endif
6039 }
6040 end1 = string1 + size1;
6041 end2 = string2 + size2;
6042
6043 /* Compute where to stop matching, within the two strings. */
6044 #ifdef WCHAR
6045 if (stop <= csize1)
6046 {
6047 mcnt = count_mbs_length(mbs_offset1, stop);
6048 end_match_1 = string1 + mcnt;
6049 end_match_2 = string2;
6050 }
6051 else
6052 {
6053 if (stop > csize1 + csize2)
6054 stop = csize1 + csize2;
6055 end_match_1 = end1;
6056 mcnt = count_mbs_length(mbs_offset2, stop-csize1);
6057 end_match_2 = string2 + mcnt;
6058 }
6059 if (mcnt < 0)
6060 { /* count_mbs_length return error. */
6061 FREE_VARIABLES ();
6062 return -1;
6063 }
6064 #else
6065 if (stop <= size1)
6066 {
6067 end_match_1 = string1 + stop;
6068 end_match_2 = string2;
6069 }
6070 else
6071 {
6072 end_match_1 = end1;
6073 end_match_2 = string2 + stop - size1;
6074 }
6075 #endif /* WCHAR */
6076
6077 /* `p' scans through the pattern as `d' scans through the data.
6078 `dend' is the end of the input string that `d' points within. `d'
6079 is advanced into the following input string whenever necessary, but
6080 this happens before fetching; therefore, at the beginning of the
6081 loop, `d' can be pointing at the end of a string, but it cannot
6082 equal `string2'. */
6083 #ifdef WCHAR
6084 if (size1 > 0 && pos <= csize1)
6085 {
6086 mcnt = count_mbs_length(mbs_offset1, pos);
6087 d = string1 + mcnt;
6088 dend = end_match_1;
6089 }
6090 else
6091 {
6092 mcnt = count_mbs_length(mbs_offset2, pos-csize1);
6093 d = string2 + mcnt;
6094 dend = end_match_2;
6095 }
6096
6097 if (mcnt < 0)
6098 { /* count_mbs_length return error. */
6099 FREE_VARIABLES ();
6100 return -1;
6101 }
6102 #else
6103 if (size1 > 0 && pos <= size1)
6104 {
6105 d = string1 + pos;
6106 dend = end_match_1;
6107 }
6108 else
6109 {
6110 d = string2 + pos - size1;
6111 dend = end_match_2;
6112 }
6113 #endif /* WCHAR */
6114
6115 DEBUG_PRINT1 ("The compiled pattern is:\n");
6116 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
6117 DEBUG_PRINT1 ("The string to match is: `");
6118 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
6119 DEBUG_PRINT1 ("'\n");
6120
6121 /* This loops over pattern commands. It exits by returning from the
6122 function if the match is complete, or it drops through if the match
6123 fails at this starting point in the input data. */
6124 for (;;)
6125 {
6126 #ifdef _LIBC
6127 DEBUG_PRINT2 ("\n%p: ", p);
6128 #else
6129 DEBUG_PRINT2 ("\n0x%x: ", p);
6130 #endif
6131
6132 #ifdef __GNUC__
6133 NEXT;
6134 #else
6135 if (p == pend)
6136 #endif
6137 {
6138 #ifdef __GNUC__
6139 end_of_pattern:
6140 #endif
6141 /* End of pattern means we might have succeeded. */
6142 DEBUG_PRINT1 ("end of pattern ... ");
6143
6144 /* If we haven't matched the entire string, and we want the
6145 longest match, try backtracking. */
6146 if (d != end_match_2)
6147 {
6148 /* 1 if this match ends in the same string (string1 or string2)
6149 as the best previous match. */
6150 boolean same_str_p = (FIRST_STRING_P (match_end)
6151 == MATCHING_IN_FIRST_STRING);
6152 /* 1 if this match is the best seen so far. */
6153 boolean best_match_p;
6154
6155 /* AIX compiler got confused when this was combined
6156 with the previous declaration. */
6157 if (same_str_p)
6158 best_match_p = d > match_end;
6159 else
6160 best_match_p = !MATCHING_IN_FIRST_STRING;
6161
6162 DEBUG_PRINT1 ("backtracking.\n");
6163
6164 if (!FAIL_STACK_EMPTY ())
6165 { /* More failure points to try. */
6166
6167 /* If exceeds best match so far, save it. */
6168 if (!best_regs_set || best_match_p)
6169 {
6170 best_regs_set = true;
6171 match_end = d;
6172
6173 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
6174
6175 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++)
6176 {
6177 best_regstart[mcnt] = regstart[mcnt];
6178 best_regend[mcnt] = regend[mcnt];
6179 }
6180 }
6181 goto fail;
6182 }
6183
6184 /* If no failure points, don't restore garbage. And if
6185 last match is real best match, don't restore second
6186 best one. */
6187 else if (best_regs_set && !best_match_p)
6188 {
6189 restore_best_regs:
6190 /* Restore best match. It may happen that `dend ==
6191 end_match_1' while the restored d is in string2.
6192 For example, the pattern `x.*y.*z' against the
6193 strings `x-' and `y-z-', if the two strings are
6194 not consecutive in memory. */
6195 DEBUG_PRINT1 ("Restoring best registers.\n");
6196
6197 d = match_end;
6198 dend = ((d >= string1 && d <= end1)
6199 ? end_match_1 : end_match_2);
6200
6201 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++)
6202 {
6203 regstart[mcnt] = best_regstart[mcnt];
6204 regend[mcnt] = best_regend[mcnt];
6205 }
6206 }
6207 } /* d != end_match_2 */
6208
6209 succeed_label:
6210 DEBUG_PRINT1 ("Accepting match.\n");
6211 /* If caller wants register contents data back, do it. */
6212 if (regs && !bufp->no_sub)
6213 {
6214 /* Have the register data arrays been allocated? */
6215 if (bufp->regs_allocated == REGS_UNALLOCATED)
6216 { /* No. So allocate them with malloc. We need one
6217 extra element beyond `num_regs' for the `-1' marker
6218 GNU code uses. */
6219 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
6220 regs->start = TALLOC (regs->num_regs, regoff_t);
6221 regs->end = TALLOC (regs->num_regs, regoff_t);
6222 if (regs->start == NULL || regs->end == NULL)
6223 {
6224 FREE_VARIABLES ();
6225 return -2;
6226 }
6227 bufp->regs_allocated = REGS_REALLOCATE;
6228 }
6229 else if (bufp->regs_allocated == REGS_REALLOCATE)
6230 { /* Yes. If we need more elements than were already
6231 allocated, reallocate them. If we need fewer, just
6232 leave it alone. */
6233 if (regs->num_regs < num_regs + 1)
6234 {
6235 regs->num_regs = num_regs + 1;
6236 RETALLOC (regs->start, regs->num_regs, regoff_t);
6237 RETALLOC (regs->end, regs->num_regs, regoff_t);
6238 if (regs->start == NULL || regs->end == NULL)
6239 {
6240 FREE_VARIABLES ();
6241 return -2;
6242 }
6243 }
6244 }
6245 else
6246 {
6247 /* These braces fend off a "empty body in an else-statement"
6248 warning under GCC when assert expands to nothing. */
6249 assert (bufp->regs_allocated == REGS_FIXED);
6250 }
6251
6252 /* Convert the pointer data in `regstart' and `regend' to
6253 indices. Register zero has to be set differently,
6254 since we haven't kept track of any info for it. */
6255 if (regs->num_regs > 0)
6256 {
6257 regs->start[0] = pos;
6258 #ifdef WCHAR
6259 if (MATCHING_IN_FIRST_STRING)
6260 regs->end[0] = (mbs_offset1 != NULL ?
6261 mbs_offset1[d-string1] : 0);
6262 else
6263 regs->end[0] = csize1 + (mbs_offset2 != NULL
6264 ? mbs_offset2[d-string2] : 0);
6265 #else
6266 regs->end[0] = (MATCHING_IN_FIRST_STRING
6267 ? ((regoff_t) (d - string1))
6268 : ((regoff_t) (d - string2 + size1)));
6269 #endif /* WCHAR */
6270 }
6271
6272 /* Go through the first `min (num_regs, regs->num_regs)'
6273 registers, since that is all we initialized. */
6274 for (mcnt = 1; (unsigned) mcnt < MIN (num_regs, regs->num_regs);
6275 mcnt++)
6276 {
6277 if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt]))
6278 regs->start[mcnt] = regs->end[mcnt] = -1;
6279 else
6280 {
6281 regs->start[mcnt]
6282 = (regoff_t) POINTER_TO_OFFSET (regstart[mcnt]);
6283 regs->end[mcnt]
6284 = (regoff_t) POINTER_TO_OFFSET (regend[mcnt]);
6285 }
6286 }
6287
6288 /* If the regs structure we return has more elements than
6289 were in the pattern, set the extra elements to -1. If
6290 we (re)allocated the registers, this is the case,
6291 because we always allocate enough to have at least one
6292 -1 at the end. */
6293 for (mcnt = num_regs; (unsigned) mcnt < regs->num_regs; mcnt++)
6294 regs->start[mcnt] = regs->end[mcnt] = -1;
6295 } /* regs && !bufp->no_sub */
6296
6297 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
6298 nfailure_points_pushed, nfailure_points_popped,
6299 nfailure_points_pushed - nfailure_points_popped);
6300 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
6301
6302 #ifdef WCHAR
6303 if (MATCHING_IN_FIRST_STRING)
6304 mcnt = mbs_offset1 != NULL ? mbs_offset1[d-string1] : 0;
6305 else
6306 mcnt = (mbs_offset2 != NULL ? mbs_offset2[d-string2] : 0) +
6307 csize1;
6308 mcnt -= pos;
6309 #else
6310 mcnt = d - pos - (MATCHING_IN_FIRST_STRING
6311 ? string1 : string2 - size1);
6312 #endif /* WCHAR */
6313
6314 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
6315
6316 FREE_VARIABLES ();
6317 return mcnt;
6318 }
6319
6320 #ifndef __GNUC__
6321 /* Otherwise match next pattern command. */
6322 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
6323 {
6324 #endif
6325 /* Ignore these. Used to ignore the n of succeed_n's which
6326 currently have n == 0. */
6327 CASE (no_op):
6328 DEBUG_PRINT1 ("EXECUTING no_op.\n");
6329 NEXT;
6330
6331 CASE (succeed):
6332 DEBUG_PRINT1 ("EXECUTING succeed.\n");
6333 goto succeed_label;
6334
6335 /* Match the next n pattern characters exactly. The following
6336 byte in the pattern defines n, and the n bytes after that
6337 are the characters to match. */
6338 CASE (exactn):
6339 #ifdef MBS_SUPPORT
6340 CASE (exactn_bin):
6341 #endif
6342 mcnt = *p++;
6343 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
6344
6345 /* This is written out as an if-else so we don't waste time
6346 testing `translate' inside the loop. */
6347 if (translate)
6348 {
6349 do
6350 {
6351 PREFETCH ();
6352 #ifdef WCHAR
6353 if (*d <= 0xff)
6354 {
6355 if ((UCHAR_T) translate[(unsigned char) *d++]
6356 != (UCHAR_T) *p++)
6357 goto fail;
6358 }
6359 else
6360 {
6361 if (*d++ != (CHAR_T) *p++)
6362 goto fail;
6363 }
6364 #else
6365 if ((UCHAR_T) translate[(unsigned char) *d++]
6366 != (UCHAR_T) *p++)
6367 goto fail;
6368 #endif /* WCHAR */
6369 }
6370 while (--mcnt);
6371 }
6372 else
6373 {
6374 do
6375 {
6376 PREFETCH ();
6377 if (*d++ != (CHAR_T) *p++) goto fail;
6378 }
6379 while (--mcnt);
6380 }
6381 SET_REGS_MATCHED ();
6382 NEXT;
6383
6384
6385 /* Match any character except possibly a newline or a null. */
6386 CASE (anychar):
6387 DEBUG_PRINT1 ("EXECUTING anychar.\n");
6388
6389 PREFETCH ();
6390
6391 if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n')
6392 || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000'))
6393 goto fail;
6394
6395 SET_REGS_MATCHED ();
6396 DEBUG_PRINT2 (" Matched `%ld'.\n", (long int) *d);
6397 d++;
6398 NEXT;
6399
6400
6401 CASE (charset):
6402 CASE (charset_not):
6403 {
6404 register UCHAR_T c;
6405 #ifdef WCHAR
6406 unsigned int i, char_class_length, coll_symbol_length,
6407 equiv_class_length, ranges_length, chars_length, length;
6408 CHAR_T *workp, *workp2, *charset_top;
6409 #define WORK_BUFFER_SIZE 128
6410 CHAR_T str_buf[WORK_BUFFER_SIZE];
6411 # ifdef _LIBC
6412 uint32_t nrules;
6413 # endif /* _LIBC */
6414 #endif /* WCHAR */
6415 boolean not = (re_opcode_t) *(p - 1) == charset_not;
6416
6417 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
6418 PREFETCH ();
6419 c = TRANSLATE (*d); /* The character to match. */
6420 #ifdef WCHAR
6421 # ifdef _LIBC
6422 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
6423 # endif /* _LIBC */
6424 charset_top = p - 1;
6425 char_class_length = *p++;
6426 coll_symbol_length = *p++;
6427 equiv_class_length = *p++;
6428 ranges_length = *p++;
6429 chars_length = *p++;
6430 /* p points charset[6], so the address of the next instruction
6431 (charset[l+m+n+2o+k+p']) equals p[l+m+n+2*o+p'],
6432 where l=length of char_classes, m=length of collating_symbol,
6433 n=equivalence_class, o=length of char_range,
6434 p'=length of character. */
6435 workp = p;
6436 /* Update p to indicate the next instruction. */
6437 p += char_class_length + coll_symbol_length+ equiv_class_length +
6438 2*ranges_length + chars_length;
6439
6440 /* match with char_class? */
6441 for (i = 0; i < char_class_length ; i += CHAR_CLASS_SIZE)
6442 {
6443 wctype_t wctype;
6444 uintptr_t alignedp = ((uintptr_t)workp
6445 + __alignof__(wctype_t) - 1)
6446 & ~(uintptr_t)(__alignof__(wctype_t) - 1);
6447 wctype = *((wctype_t*)alignedp);
6448 workp += CHAR_CLASS_SIZE;
6449 if (iswctype((wint_t)c, wctype))
6450 goto char_set_matched;
6451 }
6452
6453 /* match with collating_symbol? */
6454 # ifdef _LIBC
6455 if (nrules != 0)
6456 {
6457 const unsigned char *extra = (const unsigned char *)
6458 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
6459
6460 for (workp2 = workp + coll_symbol_length ; workp < workp2 ;
6461 workp++)
6462 {
6463 int32_t *wextra;
6464 wextra = (int32_t*)(extra + *workp++);
6465 for (i = 0; i < *wextra; ++i)
6466 if (TRANSLATE(d[i]) != wextra[1 + i])
6467 break;
6468
6469 if (i == *wextra)
6470 {
6471 /* Advance d past the match, less one: d is incremented
6472 once more after char_set_matched:. */
6473 d += i - 1;
6474 goto char_set_matched;
6475 }
6476 }
6477 }
6478 else /* (nrules == 0) */
6479 # endif
6480 /* If we can't look up collation data, we use wcscoll
6481 instead. */
6482 {
6483 for (workp2 = workp + coll_symbol_length ; workp < workp2 ;)
6484 {
6485 const CHAR_T *backup_d = d, *backup_dend = dend;
6486 length = wcslen (workp);
6487
6488 /* If wcscoll(the collating symbol, whole string) > 0,
6489 no substring of the string can ever match the
6490 collating symbol. */
6491 if (wcscoll (workp, d) > 0)
6492 {
6493 workp += length + 1;
6494 continue;
6495 }
6496
6497 /* First, we compare the collating symbol with
6498 the first character of the string.
6499 If it doesn't match, we add the next character to
6500 the compare buffer in turn. */
6501 for (i = 0 ; i < WORK_BUFFER_SIZE-1 ; i++, d++)
6502 {
6503 int match;
6504 if (d == dend)
6505 {
6506 if (dend == end_match_2)
6507 break;
6508 d = string2;
6509 dend = end_match_2;
6510 }
6511
6512 /* add next character to the compare buffer. */
6513 str_buf[i] = TRANSLATE(*d);
6514 str_buf[i+1] = '\0';
6515
6516 match = wcscoll (workp, str_buf);
6517 if (match == 0)
6518 goto char_set_matched;
6519
6520 if (match < 0)
6521 /* (str_buf > workp) implies (str_buf + X > workp),
6522 because for all X (str_buf + X > str_buf).
6523 So we don't need to continue this loop. */
6524 break;
6525
6526 /* Otherwise (str_buf < workp),
6527 (str_buf+next_character) may equal (workp).
6528 So we continue this loop. */
6529 }
6530 /* not matched */
6531 d = backup_d;
6532 dend = backup_dend;
6533 workp += length + 1;
6534 }
6535 }
6536 /* match with equivalence_class? */
6537 # ifdef _LIBC
6538 if (nrules != 0)
6539 {
6540 const CHAR_T *backup_d = d, *backup_dend = dend;
6541 /* Try to match the equivalence class against
6542 those known to the collate implementation. */
6543 const int32_t *table;
6544 const int32_t *weights;
6545 const int32_t *extra;
6546 const int32_t *indirect;
6547 int32_t idx, idx2;
6548 wint_t *cp;
6549 size_t len;
6550
6551 /* This #include defines a local function! */
6552 # include <locale/weightwc.h>
6553
6554 table = (const int32_t *)
6555 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEWC);
6556 weights = (const wint_t *)
6557 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTWC);
6558 extra = (const wint_t *)
6559 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAWC);
6560 indirect = (const int32_t *)
6561 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTWC);
6562
6563 /* Write 1 collating element to str_buf, and
6564 get its index. */
6565 idx2 = 0;
6566
6567 for (i = 0 ; idx2 == 0 && i < WORK_BUFFER_SIZE - 1; i++)
6568 {
6569 cp = (wint_t*)str_buf;
6570 if (d == dend)
6571 {
6572 if (dend == end_match_2)
6573 break;
6574 d = string2;
6575 dend = end_match_2;
6576 }
6577 str_buf[i] = TRANSLATE(*(d+i));
6578 str_buf[i+1] = '\0'; /* sentinel */
6579 idx2 = findidx ((const wint_t**)&cp);
6580 }
6581
6582 /* Set d from how far cp advanced, less one: d is
6583 incremented once more after char_set_matched:. */
6584 d = backup_d + ((wchar_t*)cp - (wchar_t*)str_buf - 1);
6585 if (d >= dend)
6586 {
6587 if (dend == end_match_2)
6588 d = dend;
6589 else
6590 {
6591 d = string2;
6592 dend = end_match_2;
6593 }
6594 }
6595
6596 len = weights[idx2];
6597
6598 for (workp2 = workp + equiv_class_length ; workp < workp2 ;
6599 workp++)
6600 {
6601 idx = (int32_t)*workp;
6602 /* We already checked idx != 0 in regex_compile. */
6603
6604 if (idx2 != 0 && len == weights[idx])
6605 {
6606 int cnt = 0;
6607 while (cnt < len && (weights[idx + 1 + cnt]
6608 == weights[idx2 + 1 + cnt]))
6609 ++cnt;
6610
6611 if (cnt == len)
6612 goto char_set_matched;
6613 }
6614 }
6615 /* not matched */
6616 d = backup_d;
6617 dend = backup_dend;
6618 }
6619 else /* (nrules == 0) */
6620 # endif
6621 /* If we can't look up collation data, we use wcscoll
6622 instead. */
6623 {
6624 for (workp2 = workp + equiv_class_length ; workp < workp2 ;)
6625 {
6626 const CHAR_T *backup_d = d, *backup_dend = dend;
6627 length = wcslen (workp);
6628
6629 /* If wcscoll(the equivalence class, whole string) > 0,
6630 no substring of the string can ever match the
6631 equivalence class. */
6632 if (wcscoll (workp, d) > 0)
6633 {
6634 workp += length + 1;
6635 break;
6636 }
6637
6638 /* First, we compare the equivalence class with
6639 the first character of the string.
6640 If it doesn't match, we add the next character to
6641 the compare buffer in turn. */
6642 for (i = 0 ; i < WORK_BUFFER_SIZE - 1 ; i++, d++)
6643 {
6644 int match;
6645 if (d == dend)
6646 {
6647 if (dend == end_match_2)
6648 break;
6649 d = string2;
6650 dend = end_match_2;
6651 }
6652
6653 /* add next character to the compare buffer. */
6654 str_buf[i] = TRANSLATE(*d);
6655 str_buf[i+1] = '\0';
6656
6657 match = wcscoll (workp, str_buf);
6658
6659 if (match == 0)
6660 goto char_set_matched;
6661
6662 if (match < 0)
6663 /* (str_buf > workp) implies (str_buf + X > workp),
6664 because for all X (str_buf + X > str_buf).
6665 So we don't need to continue this loop. */
6666 break;
6667
6668 /* Otherwise (str_buf < workp),
6669 (str_buf+next_character) may equal (workp).
6670 So we continue this loop. */
6671 }
6672 /* not matched */
6673 d = backup_d;
6674 dend = backup_dend;
6675 workp += length + 1;
6676 }
6677 }
6678
6679 /* match with char_range? */
6680 # ifdef _LIBC
6681 if (nrules != 0)
6682 {
6683 uint32_t collseqval;
6684 const char *collseq = (const char *)
6685 _NL_CURRENT(LC_COLLATE, _NL_COLLATE_COLLSEQWC);
6686
6687 collseqval = collseq_table_lookup (collseq, c);
6688
6689 for (; workp < p - chars_length ;)
6690 {
6691 uint32_t start_val, end_val;
6692
6693 /* We already compute the collation sequence value
6694 of the characters (or collating symbols). */
6695 start_val = (uint32_t) *workp++; /* range_start */
6696 end_val = (uint32_t) *workp++; /* range_end */
6697
6698 if (start_val <= collseqval && collseqval <= end_val)
6699 goto char_set_matched;
6700 }
6701 }
6702 else
6703 # endif
6704 {
6705 /* We set range_start_char at str_buf[0], range_end_char
6706 at str_buf[4], and compared char at str_buf[2]. */
6707 str_buf[1] = 0;
6708 str_buf[2] = c;
6709 str_buf[3] = 0;
6710 str_buf[5] = 0;
6711 for (; workp < p - chars_length ;)
6712 {
6713 wchar_t *range_start_char, *range_end_char;
6714
6715 /* match if (range_start_char <= c <= range_end_char). */
6716
6717 /* If range_start(or end) < 0, we assume -range_start(end)
6718 is the offset of the collating symbol which is specified
6719 as the character of the range start(end). */
6720
6721 /* range_start */
6722 if (*workp < 0)
6723 range_start_char = charset_top - (*workp++);
6724 else
6725 {
6726 str_buf[0] = *workp++;
6727 range_start_char = str_buf;
6728 }
6729
6730 /* range_end */
6731 if (*workp < 0)
6732 range_end_char = charset_top - (*workp++);
6733 else
6734 {
6735 str_buf[4] = *workp++;
6736 range_end_char = str_buf + 4;
6737 }
6738
6739 if (wcscoll (range_start_char, str_buf+2) <= 0
6740 && wcscoll (str_buf+2, range_end_char) <= 0)
6741 goto char_set_matched;
6742 }
6743 }
6744
6745 /* match with char? */
6746 for (; workp < p ; workp++)
6747 if (c == *workp)
6748 goto char_set_matched;
6749
6750 not = !not;
6751
6752 char_set_matched:
6753 if (not) goto fail;
6754 #else
6755 /* Cast to `unsigned' instead of `unsigned char' in case the
6756 bit list is a full 32 bytes long. */
6757 if (c < (unsigned) (*p * BYTEWIDTH)
6758 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
6759 not = !not;
6760
6761 p += 1 + *p;
6762
6763 if (!not) goto fail;
6764 #undef WORK_BUFFER_SIZE
6765 #endif /* WCHAR */
6766 SET_REGS_MATCHED ();
6767 d++;
6768 NEXT;
6769 }
6770
6771
6772 /* The beginning of a group is represented by start_memory.
6773 The arguments are the register number in the next byte, and the
6774 number of groups inner to this one in the next. The text
6775 matched within the group is recorded (in the internal
6776 registers data structure) under the register number. */
6777 CASE (start_memory):
6778 DEBUG_PRINT3 ("EXECUTING start_memory %ld (%ld):\n",
6779 (long int) *p, (long int) p[1]);
6780
6781 /* Find out if this group can match the empty string. */
6782 p1 = p; /* To send to group_match_null_string_p. */
6783
6784 if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE)
6785 REG_MATCH_NULL_STRING_P (reg_info[*p])
6786 = PREFIX(group_match_null_string_p) (&p1, pend, reg_info);
6787
6788 /* Save the position in the string where we were the last time
6789 we were at this open-group operator in case the group is
6790 operated upon by a repetition operator, e.g., with `(a*)*b'
6791 against `ab'; then we want to ignore where we are now in
6792 the string in case this attempt to match fails. */
6793 old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
6794 ? REG_UNSET (regstart[*p]) ? d : regstart[*p]
6795 : regstart[*p];
6796 DEBUG_PRINT2 (" old_regstart: %d\n",
6797 POINTER_TO_OFFSET (old_regstart[*p]));
6798
6799 regstart[*p] = d;
6800 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
6801
6802 IS_ACTIVE (reg_info[*p]) = 1;
6803 MATCHED_SOMETHING (reg_info[*p]) = 0;
6804
6805 /* Clear this whenever we change the register activity status. */
6806 set_regs_matched_done = 0;
6807
6808 /* This is the new highest active register. */
6809 highest_active_reg = *p;
6810
6811 /* If nothing was active before, this is the new lowest active
6812 register. */
6813 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
6814 lowest_active_reg = *p;
6815
6816 /* Move past the register number and inner group count. */
6817 p += 2;
6818 just_past_start_mem = p;
6819
6820 NEXT;
6821
6822
6823 /* The stop_memory opcode represents the end of a group. Its
6824 arguments are the same as start_memory's: the register
6825 number, and the number of inner groups. */
6826 CASE (stop_memory):
6827 DEBUG_PRINT3 ("EXECUTING stop_memory %ld (%ld):\n",
6828 (long int) *p, (long int) p[1]);
6829
6830 /* We need to save the string position the last time we were at
6831 this close-group operator in case the group is operated
6832 upon by a repetition operator, e.g., with `((a*)*(b*)*)*'
6833 against `aba'; then we want to ignore where we are now in
6834 the string in case this attempt to match fails. */
6835 old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
6836 ? REG_UNSET (regend[*p]) ? d : regend[*p]
6837 : regend[*p];
6838 DEBUG_PRINT2 (" old_regend: %d\n",
6839 POINTER_TO_OFFSET (old_regend[*p]));
6840
6841 regend[*p] = d;
6842 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
6843
6844 /* This register isn't active anymore. */
6845 IS_ACTIVE (reg_info[*p]) = 0;
6846
6847 /* Clear this whenever we change the register activity status. */
6848 set_regs_matched_done = 0;
6849
6850 /* If this was the only register active, nothing is active
6851 anymore. */
6852 if (lowest_active_reg == highest_active_reg)
6853 {
6854 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
6855 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
6856 }
6857 else
6858 { /* We must scan for the new highest active register, since
6859 it isn't necessarily one less than now: consider
6860 (a(b)c(d(e)f)g). When group 3 ends, after the f), the
6861 new highest active register is 1. */
6862 UCHAR_T r = *p - 1;
6863 while (r > 0 && !IS_ACTIVE (reg_info[r]))
6864 r--;
6865
6866 /* If we end up at register zero, that means that we saved
6867 the registers as the result of an `on_failure_jump', not
6868 a `start_memory', and we jumped to past the innermost
6869 `stop_memory'. For example, in ((.)*) we save
6870 registers 1 and 2 as a result of the *, but when we pop
6871 back to the second ), we are at the stop_memory 1.
6872 Thus, nothing is active. */
6873 if (r == 0)
6874 {
6875 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
6876 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
6877 }
6878 else
6879 highest_active_reg = r;
6880 }
6881
6882 /* If just failed to match something this time around with a
6883 group that's operated on by a repetition operator, try to
6884 force exit from the ``loop'', and restore the register
6885 information for this group that we had before trying this
6886 last match. */
6887 if ((!MATCHED_SOMETHING (reg_info[*p])
6888 || just_past_start_mem == p - 1)
6889 && (p + 2) < pend)
6890 {
6891 boolean is_a_jump_n = false;
6892
6893 p1 = p + 2;
6894 mcnt = 0;
6895 switch ((re_opcode_t) *p1++)
6896 {
6897 case jump_n:
6898 is_a_jump_n = true;
6899 case pop_failure_jump:
6900 case maybe_pop_jump:
6901 case jump:
6902 case dummy_failure_jump:
6903 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6904 if (is_a_jump_n)
6905 p1 += OFFSET_ADDRESS_SIZE;
6906 break;
6907
6908 default:
6909 /* do nothing */ ;
6910 }
6911 p1 += mcnt;
6912
6913 /* If the next operation is a jump backwards in the pattern
6914 to an on_failure_jump right before the start_memory
6915 corresponding to this stop_memory, exit from the loop
6916 by forcing a failure after pushing on the stack the
6917 on_failure_jump's jump in the pattern, and d. */
6918 if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump
6919 && (re_opcode_t) p1[1+OFFSET_ADDRESS_SIZE] == start_memory
6920 && p1[2+OFFSET_ADDRESS_SIZE] == *p)
6921 {
6922 /* If this group ever matched anything, then restore
6923 what its registers were before trying this last
6924 failed match, e.g., with `(a*)*b' against `ab' for
6925 regstart[1], and, e.g., with `((a*)*(b*)*)*'
6926 against `aba' for regend[3].
6927
6928 Also restore the registers for inner groups for,
6929 e.g., `((a*)(b*))*' against `aba' (register 3 would
6930 otherwise get trashed). */
6931
6932 if (EVER_MATCHED_SOMETHING (reg_info[*p]))
6933 {
6934 unsigned r;
6935
6936 EVER_MATCHED_SOMETHING (reg_info[*p]) = 0;
6937
6938 /* Restore this and inner groups' (if any) registers. */
6939 for (r = *p; r < (unsigned) *p + (unsigned) *(p + 1);
6940 r++)
6941 {
6942 regstart[r] = old_regstart[r];
6943
6944 /* xx why this test? */
6945 if (old_regend[r] >= regstart[r])
6946 regend[r] = old_regend[r];
6947 }
6948 }
6949 p1++;
6950 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6951 PUSH_FAILURE_POINT (p1 + mcnt, d, -2);
6952
6953 goto fail;
6954 }
6955 }
6956
6957 /* Move past the register number and the inner group count. */
6958 p += 2;
6959 NEXT;
6960
6961
6962 /* \<digit> has been turned into a `duplicate' command which is
6963 followed by the numeric value of <digit> as the register number. */
6964 CASE (duplicate):
6965 {
6966 register const CHAR_T *d2, *dend2;
6967 int regno = *p++; /* Get which register to match against. */
6968 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
6969
6970 /* Can't back reference a group which we've never matched. */
6971 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
6972 goto fail;
6973
6974 /* Where in input to try to start matching. */
6975 d2 = regstart[regno];
6976
6977 /* Where to stop matching; if both the place to start and
6978 the place to stop matching are in the same string, then
6979 set to the place to stop, otherwise, for now have to use
6980 the end of the first string. */
6981
6982 dend2 = ((FIRST_STRING_P (regstart[regno])
6983 == FIRST_STRING_P (regend[regno]))
6984 ? regend[regno] : end_match_1);
6985 for (;;)
6986 {
6987 /* If necessary, advance to next segment in register
6988 contents. */
6989 while (d2 == dend2)
6990 {
6991 if (dend2 == end_match_2) break;
6992 if (dend2 == regend[regno]) break;
6993
6994 /* End of string1 => advance to string2. */
6995 d2 = string2;
6996 dend2 = regend[regno];
6997 }
6998 /* At end of register contents => success */
6999 if (d2 == dend2) break;
7000
7001 /* If necessary, advance to next segment in data. */
7002 PREFETCH ();
7003
7004 /* How many characters left in this segment to match. */
7005 mcnt = dend - d;
7006
7007 /* Want how many consecutive characters we can match in
7008 one shot, so, if necessary, adjust the count. */
7009 if (mcnt > dend2 - d2)
7010 mcnt = dend2 - d2;
7011
7012 /* Compare that many; failure if mismatch, else move
7013 past them. */
7014 if (translate
7015 ? PREFIX(bcmp_translate) (d, d2, mcnt, translate)
7016 : memcmp (d, d2, mcnt*sizeof(UCHAR_T)))
7017 goto fail;
7018 d += mcnt, d2 += mcnt;
7019
7020 /* Do this because we've matched some characters. */
7021 SET_REGS_MATCHED ();
7022 }
7023 }
7024 NEXT;
7025
7026
7027 /* begline matches the empty string at the beginning of the string
7028 (unless `not_bol' is set in `bufp'), and, if
7029 `newline_anchor' is set, after newlines. */
7030 CASE (begline):
7031 DEBUG_PRINT1 ("EXECUTING begline.\n");
7032
7033 if (AT_STRINGS_BEG (d))
7034 {
7035 if (!bufp->not_bol)
7036 {
7037 NEXT;
7038 }
7039 }
7040 else if (d[-1] == '\n' && bufp->newline_anchor)
7041 {
7042 NEXT;
7043 }
7044 /* In all other cases, we fail. */
7045 goto fail;
7046
7047
7048 /* endline is the dual of begline. */
7049 CASE (endline):
7050 DEBUG_PRINT1 ("EXECUTING endline.\n");
7051
7052 if (AT_STRINGS_END (d))
7053 {
7054 if (!bufp->not_eol)
7055 {
7056 NEXT;
7057 }
7058 }
7059
7060 /* We have to ``prefetch'' the next character. */
7061 else if ((d == end1 ? *string2 : *d) == '\n'
7062 && bufp->newline_anchor)
7063 {
7064 NEXT;
7065 }
7066 goto fail;
7067
7068
7069 /* Match at the very beginning of the data. */
7070 CASE (begbuf):
7071 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
7072 if (AT_STRINGS_BEG (d))
7073 {
7074 NEXT;
7075 }
7076 goto fail;
7077
7078
7079 /* Match at the very end of the data. */
7080 CASE (endbuf):
7081 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
7082 if (AT_STRINGS_END (d))
7083 {
7084 NEXT;
7085 }
7086 goto fail;
7087
7088
7089 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
7090 pushes NULL as the value for the string on the stack. Then
7091 `pop_failure_point' will keep the current value for the
7092 string, instead of restoring it. To see why, consider
7093 matching `foo\nbar' against `.*\n'. The .* matches the foo;
7094 then the . fails against the \n. But the next thing we want
7095 to do is match the \n against the \n; if we restored the
7096 string value, we would be back at the foo.
7097
7098 Because this is used only in specific cases, we don't need to
7099 check all the things that `on_failure_jump' does, to make
7100 sure the right things get saved on the stack. Hence we don't
7101 share its code. The only reason to push anything on the
7102 stack at all is that otherwise we would have to change
7103 `anychar's code to do something besides goto fail in this
7104 case; that seems worse than this. */
7105 CASE (on_failure_keep_string_jump):
7106 DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump");
7107
7108 EXTRACT_NUMBER_AND_INCR (mcnt, p);
7109 #ifdef _LIBC
7110 DEBUG_PRINT3 (" %d (to %p):\n", mcnt, p + mcnt);
7111 #else
7112 DEBUG_PRINT3 (" %d (to 0x%x):\n", mcnt, p + mcnt);
7113 #endif
7114
7115 PUSH_FAILURE_POINT (p + mcnt, NULL, -2);
7116 NEXT;
7117
7118
7119 /* Uses of on_failure_jump:
7120
7121 Each alternative starts with an on_failure_jump that points
7122 to the beginning of the next alternative. Each alternative
7123 except the last ends with a jump that in effect jumps past
7124 the rest of the alternatives. (They really jump to the
7125 ending jump of the following alternative, because tensioning
7126 these jumps is a hassle.)
7127
7128 Repeats start with an on_failure_jump that points past both
7129 the repetition text and either the following jump or
7130 pop_failure_jump back to this on_failure_jump. */
7131 CASE (on_failure_jump):
7132 on_failure:
7133 DEBUG_PRINT1 ("EXECUTING on_failure_jump");
7134
7135 EXTRACT_NUMBER_AND_INCR (mcnt, p);
7136 #ifdef _LIBC
7137 DEBUG_PRINT3 (" %d (to %p)", mcnt, p + mcnt);
7138 #else
7139 DEBUG_PRINT3 (" %d (to 0x%x)", mcnt, p + mcnt);
7140 #endif
7141
7142 /* If this on_failure_jump comes right before a group (i.e.,
7143 the original * applied to a group), save the information
7144 for that group and all inner ones, so that if we fail back
7145 to this point, the group's information will be correct.
7146 For example, in \(a*\)*\1, we need the preceding group,
7147 and in \(zz\(a*\)b*\)\2, we need the inner group. */
7148
7149 /* We can't use `p' to check ahead because we push
7150 a failure point to `p + mcnt' after we do this. */
7151 p1 = p;
7152
7153 /* We need to skip no_op's before we look for the
7154 start_memory in case this on_failure_jump is happening as
7155 the result of a completed succeed_n, as in \(a\)\{1,3\}b\1
7156 against aba. */
7157 while (p1 < pend && (re_opcode_t) *p1 == no_op)
7158 p1++;
7159
7160 if (p1 < pend && (re_opcode_t) *p1 == start_memory)
7161 {
7162 /* We have a new highest active register now. This will
7163 get reset at the start_memory we are about to get to,
7164 but we will have saved all the registers relevant to
7165 this repetition op, as described above. */
7166 highest_active_reg = *(p1 + 1) + *(p1 + 2);
7167 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
7168 lowest_active_reg = *(p1 + 1);
7169 }
7170
7171 DEBUG_PRINT1 (":\n");
7172 PUSH_FAILURE_POINT (p + mcnt, d, -2);
7173 NEXT;
7174
7175
7176 /* A smart repeat ends with `maybe_pop_jump'.
7177 We change it to either `pop_failure_jump' or `jump'. */
7178 CASE (maybe_pop_jump):
7179 EXTRACT_NUMBER_AND_INCR (mcnt, p);
7180 DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt);
7181 {
7182 register UCHAR_T *p2 = p;
7183
7184 /* Compare the beginning of the repeat with what in the
7185 pattern follows its end. If we can establish that there
7186 is nothing that they would both match, i.e., that we
7187 would have to backtrack because of (as in, e.g., `a*a')
7188 then we can change to pop_failure_jump, because we'll
7189 never have to backtrack.
7190
7191 This is not true in the case of alternatives: in
7192 `(a|ab)*' we do need to backtrack to the `ab' alternative
7193 (e.g., if the string was `ab'). But instead of trying to
7194 detect that here, the alternative has put on a dummy
7195 failure point which is what we will end up popping. */
7196
7197 /* Skip over open/close-group commands.
7198 If what follows this loop is a ...+ construct,
7199 look at what begins its body, since we will have to
7200 match at least one of that. */
7201 while (1)
7202 {
7203 if (p2 + 2 < pend
7204 && ((re_opcode_t) *p2 == stop_memory
7205 || (re_opcode_t) *p2 == start_memory))
7206 p2 += 3;
7207 else if (p2 + 2 + 2 * OFFSET_ADDRESS_SIZE < pend
7208 && (re_opcode_t) *p2 == dummy_failure_jump)
7209 p2 += 2 + 2 * OFFSET_ADDRESS_SIZE;
7210 else
7211 break;
7212 }
7213
7214 p1 = p + mcnt;
7215 /* p1[0] ... p1[2] are the `on_failure_jump' corresponding
7216 to the `maybe_pop_jump' of this case. Examine what
7217 follows. */
7218
7219 /* If we're at the end of the pattern, we can change. */
7220 if (p2 == pend)
7221 {
7222 /* Consider what happens when matching ":\(.*\)"
7223 against ":/". I don't really understand this code
7224 yet. */
7225 p[-(1+OFFSET_ADDRESS_SIZE)] = (UCHAR_T)
7226 pop_failure_jump;
7227 DEBUG_PRINT1
7228 (" End of pattern: change to `pop_failure_jump'.\n");
7229 }
7230
7231 else if ((re_opcode_t) *p2 == exactn
7232 #ifdef MBS_SUPPORT
7233 || (re_opcode_t) *p2 == exactn_bin
7234 #endif
7235 || (bufp->newline_anchor && (re_opcode_t) *p2 == endline))
7236 {
7237 register UCHAR_T c
7238 = *p2 == (UCHAR_T) endline ? '\n' : p2[2];
7239
7240 if (((re_opcode_t) p1[1+OFFSET_ADDRESS_SIZE] == exactn
7241 #ifdef MBS_SUPPORT
7242 || (re_opcode_t) p1[1+OFFSET_ADDRESS_SIZE] == exactn_bin
7243 #endif
7244 ) && p1[3+OFFSET_ADDRESS_SIZE] != c)
7245 {
7246 p[-(1+OFFSET_ADDRESS_SIZE)] = (UCHAR_T)
7247 pop_failure_jump;
7248 #ifdef WCHAR
7249 DEBUG_PRINT3 (" %C != %C => pop_failure_jump.\n",
7250 (wint_t) c,
7251 (wint_t) p1[3+OFFSET_ADDRESS_SIZE]);
7252 #else
7253 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
7254 (char) c,
7255 (char) p1[3+OFFSET_ADDRESS_SIZE]);
7256 #endif
7257 }
7258
7259 #ifndef WCHAR
7260 else if ((re_opcode_t) p1[3] == charset
7261 || (re_opcode_t) p1[3] == charset_not)
7262 {
7263 int not = (re_opcode_t) p1[3] == charset_not;
7264
7265 if (c < (unsigned) (p1[4] * BYTEWIDTH)
7266 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
7267 not = !not;
7268
7269 /* `not' is equal to 1 if c would match, which means
7270 that we can't change to pop_failure_jump. */
7271 if (!not)
7272 {
7273 p[-3] = (unsigned char) pop_failure_jump;
7274 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
7275 }
7276 }
7277 #endif /* not WCHAR */
7278 }
7279 #ifndef WCHAR
7280 else if ((re_opcode_t) *p2 == charset)
7281 {
7282 /* We win if the first character of the loop is not part
7283 of the charset. */
7284 if ((re_opcode_t) p1[3] == exactn
7285 && ! ((int) p2[1] * BYTEWIDTH > (int) p1[5]
7286 && (p2[2 + p1[5] / BYTEWIDTH]
7287 & (1 << (p1[5] % BYTEWIDTH)))))
7288 {
7289 p[-3] = (unsigned char) pop_failure_jump;
7290 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
7291 }
7292
7293 else if ((re_opcode_t) p1[3] == charset_not)
7294 {
7295 int idx;
7296 /* We win if the charset_not inside the loop
7297 lists every character listed in the charset after. */
7298 for (idx = 0; idx < (int) p2[1]; idx++)
7299 if (! (p2[2 + idx] == 0
7300 || (idx < (int) p1[4]
7301 && ((p2[2 + idx] & ~ p1[5 + idx]) == 0))))
7302 break;
7303
7304 if (idx == p2[1])
7305 {
7306 p[-3] = (unsigned char) pop_failure_jump;
7307 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
7308 }
7309 }
7310 else if ((re_opcode_t) p1[3] == charset)
7311 {
7312 int idx;
7313 /* We win if the charset inside the loop
7314 has no overlap with the one after the loop. */
7315 for (idx = 0;
7316 idx < (int) p2[1] && idx < (int) p1[4];
7317 idx++)
7318 if ((p2[2 + idx] & p1[5 + idx]) != 0)
7319 break;
7320
7321 if (idx == p2[1] || idx == p1[4])
7322 {
7323 p[-3] = (unsigned char) pop_failure_jump;
7324 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
7325 }
7326 }
7327 }
7328 #endif /* not WCHAR */
7329 }
7330 p -= OFFSET_ADDRESS_SIZE; /* Point at relative address again. */
7331 if ((re_opcode_t) p[-1] != pop_failure_jump)
7332 {
7333 p[-1] = (UCHAR_T) jump;
7334 DEBUG_PRINT1 (" Match => jump.\n");
7335 goto unconditional_jump;
7336 }
7337 /* Note fall through. */
7338
7339
7340 /* The end of a simple repeat has a pop_failure_jump back to
7341 its matching on_failure_jump, where the latter will push a
7342 failure point. The pop_failure_jump takes off failure
7343 points put on by this pop_failure_jump's matching
7344 on_failure_jump; we got through the pattern to here from the
7345 matching on_failure_jump, so didn't fail. */
7346 CASE (pop_failure_jump):
7347 {
7348 /* We need to pass separate storage for the lowest and
7349 highest registers, even though we don't care about the
7350 actual values. Otherwise, we will restore only one
7351 register from the stack, since lowest will == highest in
7352 `pop_failure_point'. */
7353 active_reg_t dummy_low_reg, dummy_high_reg;
7354 UCHAR_T *pdummy = NULL;
7355 const CHAR_T *sdummy = NULL;
7356
7357 DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n");
7358 POP_FAILURE_POINT (sdummy, pdummy,
7359 dummy_low_reg, dummy_high_reg,
7360 reg_dummy, reg_dummy, reg_info_dummy);
7361 }
7362 /* Note fall through. */
7363
7364 unconditional_jump:
7365 #ifdef _LIBC
7366 DEBUG_PRINT2 ("\n%p: ", p);
7367 #else
7368 DEBUG_PRINT2 ("\n0x%x: ", p);
7369 #endif
7370 /* Note fall through. */
7371
7372 /* Unconditionally jump (without popping any failure points). */
7373 CASE (jump):
7374 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
7375 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
7376 p += mcnt; /* Do the jump. */
7377 #ifdef _LIBC
7378 DEBUG_PRINT2 ("(to %p).\n", p);
7379 #else
7380 DEBUG_PRINT2 ("(to 0x%x).\n", p);
7381 #endif
7382 NEXT;
7383
7384
7385 /* We need this opcode so we can detect where alternatives end
7386 in `group_match_null_string_p' et al. */
7387 CASE (jump_past_alt):
7388 DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n");
7389 goto unconditional_jump;
7390
7391
7392 /* Normally, the on_failure_jump pushes a failure point, which
7393 then gets popped at pop_failure_jump. We will end up at
7394 pop_failure_jump, also, and with a pattern of, say, `a+', we
7395 are skipping over the on_failure_jump, so we have to push
7396 something meaningless for pop_failure_jump to pop. */
7397 CASE (dummy_failure_jump):
7398 DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n");
7399 /* It doesn't matter what we push for the string here. What
7400 the code at `fail' tests is the value for the pattern. */
7401 PUSH_FAILURE_POINT (NULL, NULL, -2);
7402 goto unconditional_jump;
7403
7404
7405 /* At the end of an alternative, we need to push a dummy failure
7406 point in case we are followed by a `pop_failure_jump', because
7407 we don't want the failure point for the alternative to be
7408 popped. For example, matching `(a|ab)*' against `aab'
7409 requires that we match the `ab' alternative. */
7410 CASE (push_dummy_failure):
7411 DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n");
7412 /* See comments just above at `dummy_failure_jump' about the
7413 two zeroes. */
7414 PUSH_FAILURE_POINT (NULL, NULL, -2);
7415 NEXT;
7416
7417 /* Have to succeed matching what follows at least n times.
7418 After that, handle like `on_failure_jump'. */
7419 CASE (succeed_n):
7420 EXTRACT_NUMBER (mcnt, p + OFFSET_ADDRESS_SIZE);
7421 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
7422
7423 assert (mcnt >= 0);
7424 /* Originally, this is how many times we HAVE to succeed. */
7425 if (mcnt > 0)
7426 {
7427 mcnt--;
7428 p += OFFSET_ADDRESS_SIZE;
7429 STORE_NUMBER_AND_INCR (p, mcnt);
7430 #ifdef _LIBC
7431 DEBUG_PRINT3 (" Setting %p to %d.\n", p - OFFSET_ADDRESS_SIZE
7432 , mcnt);
7433 #else
7434 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p - OFFSET_ADDRESS_SIZE
7435 , mcnt);
7436 #endif
7437 }
7438 else if (mcnt == 0)
7439 {
7440 #ifdef _LIBC
7441 DEBUG_PRINT2 (" Setting two bytes from %p to no_op.\n",
7442 p + OFFSET_ADDRESS_SIZE);
7443 #else
7444 DEBUG_PRINT2 (" Setting two bytes from 0x%x to no_op.\n",
7445 p + OFFSET_ADDRESS_SIZE);
7446 #endif /* _LIBC */
7447
7448 #ifdef WCHAR
7449 p[1] = (UCHAR_T) no_op;
7450 #else
7451 p[2] = (UCHAR_T) no_op;
7452 p[3] = (UCHAR_T) no_op;
7453 #endif /* WCHAR */
7454 goto on_failure;
7455 }
7456 NEXT;
7457
7458 CASE (jump_n):
7459 EXTRACT_NUMBER (mcnt, p + OFFSET_ADDRESS_SIZE);
7460 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
7461
7462 /* Originally, this is how many times we CAN jump. */
7463 if (mcnt)
7464 {
7465 mcnt--;
7466 STORE_NUMBER (p + OFFSET_ADDRESS_SIZE, mcnt);
7467
7468 #ifdef _LIBC
7469 DEBUG_PRINT3 (" Setting %p to %d.\n", p + OFFSET_ADDRESS_SIZE,
7470 mcnt);
7471 #else
7472 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p + OFFSET_ADDRESS_SIZE,
7473 mcnt);
7474 #endif /* _LIBC */
7475 goto unconditional_jump;
7476 }
7477 /* If don't have to jump any more, skip over the rest of command. */
7478 else
7479 p += 2 * OFFSET_ADDRESS_SIZE;
7480 NEXT;
7481
7482 CASE (set_number_at):
7483 {
7484 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
7485
7486 EXTRACT_NUMBER_AND_INCR (mcnt, p);
7487 p1 = p + mcnt;
7488 EXTRACT_NUMBER_AND_INCR (mcnt, p);
7489 #ifdef _LIBC
7490 DEBUG_PRINT3 (" Setting %p to %d.\n", p1, mcnt);
7491 #else
7492 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p1, mcnt);
7493 #endif
7494 STORE_NUMBER (p1, mcnt);
7495 NEXT;
7496 }
7497
7498 #if 0
7499 /* The DEC Alpha C compiler 3.x generates incorrect code for the
7500 test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of
7501 AT_WORD_BOUNDARY, so this code is disabled. Expanding the
7502 macro and introducing temporary variables works around the bug. */
7503
7504 CASE (wordbound):
7505 DEBUG_PRINT1 ("EXECUTING wordbound.\n");
7506 if (AT_WORD_BOUNDARY (d))
7507 {
7508 NEXT;
7509 }
7510 goto fail;
7511
7512 CASE (notwordbound):
7513 DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
7514 if (AT_WORD_BOUNDARY (d))
7515 goto fail;
7516 NEXT;
7517 #else
7518 CASE (wordbound):
7519 {
7520 boolean prevchar, thischar;
7521
7522 DEBUG_PRINT1 ("EXECUTING wordbound.\n");
7523 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
7524 {
7525 NEXT;
7526 }
7527
7528 prevchar = WORDCHAR_P (d - 1);
7529 thischar = WORDCHAR_P (d);
7530 if (prevchar != thischar)
7531 {
7532 NEXT;
7533 }
7534 goto fail;
7535 }
7536
7537 CASE (notwordbound):
7538 {
7539 boolean prevchar, thischar;
7540
7541 DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
7542 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
7543 goto fail;
7544
7545 prevchar = WORDCHAR_P (d - 1);
7546 thischar = WORDCHAR_P (d);
7547 if (prevchar != thischar)
7548 goto fail;
7549 NEXT;
7550 }
7551 #endif
7552
7553 CASE (wordbeg):
7554 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
7555 if (!AT_STRINGS_END (d) && WORDCHAR_P (d)
7556 && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1)))
7557 {
7558 NEXT;
7559 }
7560 goto fail;
7561
7562 CASE (wordend):
7563 DEBUG_PRINT1 ("EXECUTING wordend.\n");
7564 if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1)
7565 && (AT_STRINGS_END (d) || !WORDCHAR_P (d)))
7566 {
7567 NEXT;
7568 }
7569 goto fail;
7570
7571 #ifdef emacs
7572 CASE (before_dot):
7573 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
7574 if (PTR_CHAR_POS ((unsigned char *) d) >= point)
7575 goto fail;
7576 NEXT;
7577
7578 CASE (at_dot):
7579 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
7580 if (PTR_CHAR_POS ((unsigned char *) d) != point)
7581 goto fail;
7582 NEXT;
7583
7584 CASE (after_dot):
7585 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
7586 if (PTR_CHAR_POS ((unsigned char *) d) <= point)
7587 goto fail;
7588 NEXT;
7589
7590 CASE (syntaxspec):
7591 DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt);
7592 mcnt = *p++;
7593 goto matchsyntax;
7594
7595 CASE (wordchar):
7596 DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n");
7597 mcnt = (int) Sword;
7598 matchsyntax:
7599 PREFETCH ();
7600 /* Can't use *d++ here; SYNTAX may be an unsafe macro. */
7601 d++;
7602 if (SYNTAX (d[-1]) != (enum syntaxcode) mcnt)
7603 goto fail;
7604 SET_REGS_MATCHED ();
7605 NEXT;
7606
7607 CASE (notsyntaxspec):
7608 DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt);
7609 mcnt = *p++;
7610 goto matchnotsyntax;
7611
7612 CASE (notwordchar):
7613 DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n");
7614 mcnt = (int) Sword;
7615 matchnotsyntax:
7616 PREFETCH ();
7617 /* Can't use *d++ here; SYNTAX may be an unsafe macro. */
7618 d++;
7619 if (SYNTAX (d[-1]) == (enum syntaxcode) mcnt)
7620 goto fail;
7621 SET_REGS_MATCHED ();
7622 NEXT;
7623
7624 #else /* not emacs */
7625 CASE (wordchar):
7626 DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n");
7627 PREFETCH ();
7628 if (!WORDCHAR_P (d))
7629 goto fail;
7630 SET_REGS_MATCHED ();
7631 d++;
7632 NEXT;
7633
7634 CASE (notwordchar):
7635 DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n");
7636 PREFETCH ();
7637 if (WORDCHAR_P (d))
7638 goto fail;
7639 SET_REGS_MATCHED ();
7640 d++;
7641 NEXT;
7642 #endif /* not emacs */
7643
7644 #ifndef __GNUC__
7645 default:
7646 abort ();
7647 }
7648 continue; /* Successfully executed one pattern command; keep going. */
7649 #endif
7650
7651
7652 /* We goto here if a matching operation fails. */
7653 fail:
7654 if (!FAIL_STACK_EMPTY ())
7655 { /* A restart point is known. Restore to that state. */
7656 DEBUG_PRINT1 ("\nFAIL:\n");
7657 POP_FAILURE_POINT (d, p,
7658 lowest_active_reg, highest_active_reg,
7659 regstart, regend, reg_info);
7660
7661 /* If this failure point is a dummy, try the next one. */
7662 if (!p)
7663 goto fail;
7664
7665 /* If we failed to the end of the pattern, don't examine *p. */
7666 assert (p <= pend);
7667 if (p < pend)
7668 {
7669 boolean is_a_jump_n = false;
7670
7671 /* If failed to a backwards jump that's part of a repetition
7672 loop, need to pop this failure point and use the next one. */
7673 switch ((re_opcode_t) *p)
7674 {
7675 case jump_n:
7676 is_a_jump_n = true;
7677 case maybe_pop_jump:
7678 case pop_failure_jump:
7679 case jump:
7680 p1 = p + 1;
7681 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7682 p1 += mcnt;
7683
7684 if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n)
7685 || (!is_a_jump_n
7686 && (re_opcode_t) *p1 == on_failure_jump))
7687 goto fail;
7688 break;
7689 default:
7690 /* do nothing */ ;
7691 }
7692 }
7693
7694 if (d >= string1 && d <= end1)
7695 dend = end_match_1;
7696 }
7697 else
7698 break; /* Matching at this starting point really fails. */
7699 } /* for (;;) */
7700
7701 if (best_regs_set)
7702 goto restore_best_regs;
7703
7704 FREE_VARIABLES ();
7705
7706 return -1; /* Failure to match. */
7707 } /* re_match_2 */
7708
7709 /* Subroutine definitions for re_match_2. */
7711
7712
7713 /* We are passed P pointing to a register number after a start_memory.
7714
7715 Return true if the pattern up to the corresponding stop_memory can
7716 match the empty string, and false otherwise.
7717
7718 If we find the matching stop_memory, sets P to point to one past its number.
7719 Otherwise, sets P to an undefined byte less than or equal to END.
7720
7721 We don't handle duplicates properly (yet). */
7722
7723 static boolean
7724 PREFIX(group_match_null_string_p) (p, end, reg_info)
7725 UCHAR_T **p, *end;
7726 PREFIX(register_info_type) *reg_info;
7727 {
7728 int mcnt;
7729 /* Point to after the args to the start_memory. */
7730 UCHAR_T *p1 = *p + 2;
7731
7732 while (p1 < end)
7733 {
7734 /* Skip over opcodes that can match nothing, and return true or
7735 false, as appropriate, when we get to one that can't, or to the
7736 matching stop_memory. */
7737
7738 switch ((re_opcode_t) *p1)
7739 {
7740 /* Could be either a loop or a series of alternatives. */
7741 case on_failure_jump:
7742 p1++;
7743 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7744
7745 /* If the next operation is not a jump backwards in the
7746 pattern. */
7747
7748 if (mcnt >= 0)
7749 {
7750 /* Go through the on_failure_jumps of the alternatives,
7751 seeing if any of the alternatives cannot match nothing.
7752 The last alternative starts with only a jump,
7753 whereas the rest start with on_failure_jump and end
7754 with a jump, e.g., here is the pattern for `a|b|c':
7755
7756 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6
7757 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3
7758 /exactn/1/c
7759
7760 So, we have to first go through the first (n-1)
7761 alternatives and then deal with the last one separately. */
7762
7763
7764 /* Deal with the first (n-1) alternatives, which start
7765 with an on_failure_jump (see above) that jumps to right
7766 past a jump_past_alt. */
7767
7768 while ((re_opcode_t) p1[mcnt-(1+OFFSET_ADDRESS_SIZE)] ==
7769 jump_past_alt)
7770 {
7771 /* `mcnt' holds how many bytes long the alternative
7772 is, including the ending `jump_past_alt' and
7773 its number. */
7774
7775 if (!PREFIX(alt_match_null_string_p) (p1, p1 + mcnt -
7776 (1 + OFFSET_ADDRESS_SIZE),
7777 reg_info))
7778 return false;
7779
7780 /* Move to right after this alternative, including the
7781 jump_past_alt. */
7782 p1 += mcnt;
7783
7784 /* Break if it's the beginning of an n-th alternative
7785 that doesn't begin with an on_failure_jump. */
7786 if ((re_opcode_t) *p1 != on_failure_jump)
7787 break;
7788
7789 /* Still have to check that it's not an n-th
7790 alternative that starts with an on_failure_jump. */
7791 p1++;
7792 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7793 if ((re_opcode_t) p1[mcnt-(1+OFFSET_ADDRESS_SIZE)] !=
7794 jump_past_alt)
7795 {
7796 /* Get to the beginning of the n-th alternative. */
7797 p1 -= 1 + OFFSET_ADDRESS_SIZE;
7798 break;
7799 }
7800 }
7801
7802 /* Deal with the last alternative: go back and get number
7803 of the `jump_past_alt' just before it. `mcnt' contains
7804 the length of the alternative. */
7805 EXTRACT_NUMBER (mcnt, p1 - OFFSET_ADDRESS_SIZE);
7806
7807 if (!PREFIX(alt_match_null_string_p) (p1, p1 + mcnt, reg_info))
7808 return false;
7809
7810 p1 += mcnt; /* Get past the n-th alternative. */
7811 } /* if mcnt > 0 */
7812 break;
7813
7814
7815 case stop_memory:
7816 assert (p1[1] == **p);
7817 *p = p1 + 2;
7818 return true;
7819
7820
7821 default:
7822 if (!PREFIX(common_op_match_null_string_p) (&p1, end, reg_info))
7823 return false;
7824 }
7825 } /* while p1 < end */
7826
7827 return false;
7828 } /* group_match_null_string_p */
7829
7830
7831 /* Similar to group_match_null_string_p, but doesn't deal with alternatives:
7832 It expects P to be the first byte of a single alternative and END one
7833 byte past the last. The alternative can contain groups. */
7834
7835 static boolean
7836 PREFIX(alt_match_null_string_p) (p, end, reg_info)
7837 UCHAR_T *p, *end;
7838 PREFIX(register_info_type) *reg_info;
7839 {
7840 int mcnt;
7841 UCHAR_T *p1 = p;
7842
7843 while (p1 < end)
7844 {
7845 /* Skip over opcodes that can match nothing, and break when we get
7846 to one that can't. */
7847
7848 switch ((re_opcode_t) *p1)
7849 {
7850 /* It's a loop. */
7851 case on_failure_jump:
7852 p1++;
7853 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7854 p1 += mcnt;
7855 break;
7856
7857 default:
7858 if (!PREFIX(common_op_match_null_string_p) (&p1, end, reg_info))
7859 return false;
7860 }
7861 } /* while p1 < end */
7862
7863 return true;
7864 } /* alt_match_null_string_p */
7865
7866
7867 /* Deals with the ops common to group_match_null_string_p and
7868 alt_match_null_string_p.
7869
7870 Sets P to one after the op and its arguments, if any. */
7871
7872 static boolean
7873 PREFIX(common_op_match_null_string_p) (p, end, reg_info)
7874 UCHAR_T **p, *end;
7875 PREFIX(register_info_type) *reg_info;
7876 {
7877 int mcnt;
7878 boolean ret;
7879 int reg_no;
7880 UCHAR_T *p1 = *p;
7881
7882 switch ((re_opcode_t) *p1++)
7883 {
7884 case no_op:
7885 case begline:
7886 case endline:
7887 case begbuf:
7888 case endbuf:
7889 case wordbeg:
7890 case wordend:
7891 case wordbound:
7892 case notwordbound:
7893 #ifdef emacs
7894 case before_dot:
7895 case at_dot:
7896 case after_dot:
7897 #endif
7898 break;
7899
7900 case start_memory:
7901 reg_no = *p1;
7902 assert (reg_no > 0 && reg_no <= MAX_REGNUM);
7903 ret = PREFIX(group_match_null_string_p) (&p1, end, reg_info);
7904
7905 /* Have to set this here in case we're checking a group which
7906 contains a group and a back reference to it. */
7907
7908 if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE)
7909 REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret;
7910
7911 if (!ret)
7912 return false;
7913 break;
7914
7915 /* If this is an optimized succeed_n for zero times, make the jump. */
7916 case jump:
7917 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7918 if (mcnt >= 0)
7919 p1 += mcnt;
7920 else
7921 return false;
7922 break;
7923
7924 case succeed_n:
7925 /* Get to the number of times to succeed. */
7926 p1 += OFFSET_ADDRESS_SIZE;
7927 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7928
7929 if (mcnt == 0)
7930 {
7931 p1 -= 2 * OFFSET_ADDRESS_SIZE;
7932 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7933 p1 += mcnt;
7934 }
7935 else
7936 return false;
7937 break;
7938
7939 case duplicate:
7940 if (!REG_MATCH_NULL_STRING_P (reg_info[*p1]))
7941 return false;
7942 break;
7943
7944 case set_number_at:
7945 p1 += 2 * OFFSET_ADDRESS_SIZE;
7946
7947 default:
7948 /* All other opcodes mean we cannot match the empty string. */
7949 return false;
7950 }
7951
7952 *p = p1;
7953 return true;
7954 } /* common_op_match_null_string_p */
7955
7956
7957 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
7958 bytes; nonzero otherwise. */
7959
7960 static int
7961 PREFIX(bcmp_translate) (s1, s2, len, translate)
7962 const CHAR_T *s1, *s2;
7963 register int len;
7964 RE_TRANSLATE_TYPE translate;
7965 {
7966 register const UCHAR_T *p1 = (const UCHAR_T *) s1;
7967 register const UCHAR_T *p2 = (const UCHAR_T *) s2;
7968 while (len)
7969 {
7970 #ifdef WCHAR
7971 if (((*p1<=0xff)?translate[*p1++]:*p1++)
7972 != ((*p2<=0xff)?translate[*p2++]:*p2++))
7973 return 1;
7974 #else /* BYTE */
7975 if (translate[*p1++] != translate[*p2++]) return 1;
7976 #endif /* WCHAR */
7977 len--;
7978 }
7979 return 0;
7980 }
7981
7982
7984 #else /* not INSIDE_RECURSION */
7985
7986 /* Entry points for GNU code. */
7987
7988 /* re_compile_pattern is the GNU regular expression compiler: it
7989 compiles PATTERN (of length SIZE) and puts the result in BUFP.
7990 Returns 0 if the pattern was valid, otherwise an error string.
7991
7992 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
7993 are set in BUFP on entry.
7994
7995 We call regex_compile to do the actual compilation. */
7996
7997 const char *
7998 re_compile_pattern (pattern, length, bufp)
7999 const char *pattern;
8000 size_t length;
8001 struct re_pattern_buffer *bufp;
8002 {
8003 reg_errcode_t ret;
8004
8005 /* GNU code is written to assume at least RE_NREGS registers will be set
8006 (and at least one extra will be -1). */
8007 bufp->regs_allocated = REGS_UNALLOCATED;
8008
8009 /* And GNU code determines whether or not to get register information
8010 by passing null for the REGS argument to re_match, etc., not by
8011 setting no_sub. */
8012 bufp->no_sub = 0;
8013
8014 /* Match anchors at newline. */
8015 bufp->newline_anchor = 1;
8016
8017 # ifdef MBS_SUPPORT
8018 if (MB_CUR_MAX != 1)
8019 ret = wcs_regex_compile (pattern, length, re_syntax_options, bufp);
8020 else
8021 # endif
8022 ret = byte_regex_compile (pattern, length, re_syntax_options, bufp);
8023
8024 if (!ret)
8025 return NULL;
8026 return gettext (re_error_msgid + re_error_msgid_idx[(int) ret]);
8027 }
8028 #ifdef _LIBC
8029 weak_alias (__re_compile_pattern, re_compile_pattern)
8030 #endif
8031
8032 /* Entry points compatible with 4.2 BSD regex library. We don't define
8034 them unless specifically requested. */
8035
8036 #if defined _REGEX_RE_COMP || defined _LIBC
8037
8038 /* BSD has one and only one pattern buffer. */
8039 static struct re_pattern_buffer re_comp_buf;
8040
8041 char *
8042 #ifdef _LIBC
8043 /* Make these definitions weak in libc, so POSIX programs can redefine
8044 these names if they don't use our functions, and still use
8045 regcomp/regexec below without link errors. */
8046 weak_function
8047 #endif
8048 re_comp (s)
8049 const char *s;
8050 {
8051 reg_errcode_t ret;
8052
8053 if (!s)
8054 {
8055 if (!re_comp_buf.buffer)
8056 return gettext ("No previous regular expression");
8057 return 0;
8058 }
8059
8060 if (!re_comp_buf.buffer)
8061 {
8062 re_comp_buf.buffer = (unsigned char *) malloc (200);
8063 if (re_comp_buf.buffer == NULL)
8064 return (char *) gettext (re_error_msgid
8065 + re_error_msgid_idx[(int) REG_ESPACE]);
8066 re_comp_buf.allocated = 200;
8067
8068 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
8069 if (re_comp_buf.fastmap == NULL)
8070 return (char *) gettext (re_error_msgid
8071 + re_error_msgid_idx[(int) REG_ESPACE]);
8072 }
8073
8074 /* Since `re_exec' always passes NULL for the `regs' argument, we
8075 don't need to initialize the pattern buffer fields which affect it. */
8076
8077 /* Match anchors at newlines. */
8078 re_comp_buf.newline_anchor = 1;
8079
8080 # ifdef MBS_SUPPORT
8081 if (MB_CUR_MAX != 1)
8082 ret = wcs_regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
8083 else
8084 # endif
8085 ret = byte_regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
8086
8087 if (!ret)
8088 return NULL;
8089
8090 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
8091 return (char *) gettext (re_error_msgid + re_error_msgid_idx[(int) ret]);
8092 }
8093
8094
8095 int
8096 #ifdef _LIBC
8097 weak_function
8098 #endif
8099 re_exec (s)
8100 const char *s;
8101 {
8102 const int len = strlen (s);
8103 return
8104 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
8105 }
8106
8107 #endif /* _REGEX_RE_COMP */
8108
8109 /* POSIX.2 functions. Don't define these for Emacs. */
8111
8112 #ifndef emacs
8113
8114 /* regcomp takes a regular expression as a string and compiles it.
8115
8116 PREG is a regex_t *. We do not expect any fields to be initialized,
8117 since POSIX says we shouldn't. Thus, we set
8118
8119 `buffer' to the compiled pattern;
8120 `used' to the length of the compiled pattern;
8121 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
8122 REG_EXTENDED bit in CFLAGS is set; otherwise, to
8123 RE_SYNTAX_POSIX_BASIC;
8124 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
8125 `fastmap' to an allocated space for the fastmap;
8126 `fastmap_accurate' to zero;
8127 `re_nsub' to the number of subexpressions in PATTERN.
8128
8129 PATTERN is the address of the pattern string.
8130
8131 CFLAGS is a series of bits which affect compilation.
8132
8133 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
8134 use POSIX basic syntax.
8135
8136 If REG_NEWLINE is set, then . and [^...] don't match newline.
8137 Also, regexec will try a match beginning after every newline.
8138
8139 If REG_ICASE is set, then we considers upper- and lowercase
8140 versions of letters to be equivalent when matching.
8141
8142 If REG_NOSUB is set, then when PREG is passed to regexec, that
8143 routine will report only success or failure, and nothing about the
8144 registers.
8145
8146 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
8147 the return codes and their meanings.) */
8148
8149 int
8150 regcomp (preg, pattern, cflags)
8151 regex_t *preg;
8152 const char *pattern;
8153 int cflags;
8154 {
8155 reg_errcode_t ret;
8156 reg_syntax_t syntax
8157 = (cflags & REG_EXTENDED) ?
8158 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
8159
8160 /* regex_compile will allocate the space for the compiled pattern. */
8161 preg->buffer = 0;
8162 preg->allocated = 0;
8163 preg->used = 0;
8164
8165 /* Try to allocate space for the fastmap. */
8166 preg->fastmap = (char *) malloc (1 << BYTEWIDTH);
8167
8168 if (cflags & REG_ICASE)
8169 {
8170 unsigned i;
8171
8172 preg->translate
8173 = (RE_TRANSLATE_TYPE) malloc (CHAR_SET_SIZE
8174 * sizeof (*(RE_TRANSLATE_TYPE)0));
8175 if (preg->translate == NULL)
8176 return (int) REG_ESPACE;
8177
8178 /* Map uppercase characters to corresponding lowercase ones. */
8179 for (i = 0; i < CHAR_SET_SIZE; i++)
8180 preg->translate[i] = ISUPPER (i) ? TOLOWER (i) : i;
8181 }
8182 else
8183 preg->translate = NULL;
8184
8185 /* If REG_NEWLINE is set, newlines are treated differently. */
8186 if (cflags & REG_NEWLINE)
8187 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
8188 syntax &= ~RE_DOT_NEWLINE;
8189 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
8190 /* It also changes the matching behavior. */
8191 preg->newline_anchor = 1;
8192 }
8193 else
8194 preg->newline_anchor = 0;
8195
8196 preg->no_sub = !!(cflags & REG_NOSUB);
8197
8198 /* POSIX says a null character in the pattern terminates it, so we
8199 can use strlen here in compiling the pattern. */
8200 # ifdef MBS_SUPPORT
8201 if (MB_CUR_MAX != 1)
8202 ret = wcs_regex_compile (pattern, strlen (pattern), syntax, preg);
8203 else
8204 # endif
8205 ret = byte_regex_compile (pattern, strlen (pattern), syntax, preg);
8206
8207 /* POSIX doesn't distinguish between an unmatched open-group and an
8208 unmatched close-group: both are REG_EPAREN. */
8209 if (ret == REG_ERPAREN) ret = REG_EPAREN;
8210
8211 if (ret == REG_NOERROR && preg->fastmap)
8212 {
8213 /* Compute the fastmap now, since regexec cannot modify the pattern
8214 buffer. */
8215 if (re_compile_fastmap (preg) == -2)
8216 {
8217 /* Some error occurred while computing the fastmap, just forget
8218 about it. */
8219 free (preg->fastmap);
8220 preg->fastmap = NULL;
8221 }
8222 }
8223
8224 return (int) ret;
8225 }
8226 #ifdef _LIBC
8227 weak_alias (__regcomp, regcomp)
8228 #endif
8229
8230
8231 /* regexec searches for a given pattern, specified by PREG, in the
8232 string STRING.
8233
8234 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
8235 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
8236 least NMATCH elements, and we set them to the offsets of the
8237 corresponding matched substrings.
8238
8239 EFLAGS specifies `execution flags' which affect matching: if
8240 REG_NOTBOL is set, then ^ does not match at the beginning of the
8241 string; if REG_NOTEOL is set, then $ does not match at the end.
8242
8243 We return 0 if we find a match and REG_NOMATCH if not. */
8244
8245 int
8246 regexec (preg, string, nmatch, pmatch, eflags)
8247 const regex_t *preg;
8248 const char *string;
8249 size_t nmatch;
8250 regmatch_t pmatch[];
8251 int eflags;
8252 {
8253 int ret;
8254 struct re_registers regs;
8255 regex_t private_preg;
8256 int len = strlen (string);
8257 boolean want_reg_info = !preg->no_sub && nmatch > 0;
8258
8259 private_preg = *preg;
8260
8261 private_preg.not_bol = !!(eflags & REG_NOTBOL);
8262 private_preg.not_eol = !!(eflags & REG_NOTEOL);
8263
8264 /* The user has told us exactly how many registers to return
8265 information about, via `nmatch'. We have to pass that on to the
8266 matching routines. */
8267 private_preg.regs_allocated = REGS_FIXED;
8268
8269 if (want_reg_info)
8270 {
8271 regs.num_regs = nmatch;
8272 regs.start = TALLOC (nmatch * 2, regoff_t);
8273 if (regs.start == NULL)
8274 return (int) REG_NOMATCH;
8275 regs.end = regs.start + nmatch;
8276 }
8277
8278 /* Perform the searching operation. */
8279 ret = re_search (&private_preg, string, len,
8280 /* start: */ 0, /* range: */ len,
8281 want_reg_info ? ®s : (struct re_registers *) 0);
8282
8283 /* Copy the register information to the POSIX structure. */
8284 if (want_reg_info)
8285 {
8286 if (ret >= 0)
8287 {
8288 unsigned r;
8289
8290 for (r = 0; r < nmatch; r++)
8291 {
8292 pmatch[r].rm_so = regs.start[r];
8293 pmatch[r].rm_eo = regs.end[r];
8294 }
8295 }
8296
8297 /* If we needed the temporary register info, free the space now. */
8298 free (regs.start);
8299 }
8300
8301 /* We want zero return to mean success, unlike `re_search'. */
8302 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
8303 }
8304 #ifdef _LIBC
8305 weak_alias (__regexec, regexec)
8306 #endif
8307
8308
8309 /* Returns a message corresponding to an error code, ERRCODE, returned
8310 from either regcomp or regexec. We don't use PREG here. */
8311
8312 size_t
8313 regerror (errcode, preg, errbuf, errbuf_size)
8314 int errcode;
8315 const regex_t *preg;
8316 char *errbuf;
8317 size_t errbuf_size;
8318 {
8319 const char *msg;
8320 size_t msg_size;
8321
8322 if (errcode < 0
8323 || errcode >= (int) (sizeof (re_error_msgid_idx)
8324 / sizeof (re_error_msgid_idx[0])))
8325 /* Only error codes returned by the rest of the code should be passed
8326 to this routine. If we are given anything else, or if other regex
8327 code generates an invalid error code, then the program has a bug.
8328 Dump core so we can fix it. */
8329 abort ();
8330
8331 msg = gettext (re_error_msgid + re_error_msgid_idx[errcode]);
8332
8333 msg_size = strlen (msg) + 1; /* Includes the null. */
8334
8335 if (errbuf_size != 0)
8336 {
8337 if (msg_size > errbuf_size)
8338 {
8339 #if defined HAVE_MEMPCPY || defined _LIBC
8340 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
8341 #else
8342 memcpy (errbuf, msg, errbuf_size - 1);
8343 errbuf[errbuf_size - 1] = 0;
8344 #endif
8345 }
8346 else
8347 memcpy (errbuf, msg, msg_size);
8348 }
8349
8350 return msg_size;
8351 }
8352 #ifdef _LIBC
8353 weak_alias (__regerror, regerror)
8354 #endif
8355
8356
8357 /* Free dynamically allocated space used by PREG. */
8358
8359 void
8360 regfree (preg)
8361 regex_t *preg;
8362 {
8363 if (preg->buffer != NULL)
8364 free (preg->buffer);
8365 preg->buffer = NULL;
8366
8367 preg->allocated = 0;
8368 preg->used = 0;
8369
8370 if (preg->fastmap != NULL)
8371 free (preg->fastmap);
8372 preg->fastmap = NULL;
8373 preg->fastmap_accurate = 0;
8374
8375 if (preg->translate != NULL)
8376 free (preg->translate);
8377 preg->translate = NULL;
8378 }
8379 #ifdef _LIBC
8380 weak_alias (__regfree, regfree)
8381 #endif
8382
8383 #endif /* not emacs */
8384
8385 #endif /* not INSIDE_RECURSION */
8386
8387
8388 #undef STORE_NUMBER
8390 #undef STORE_NUMBER_AND_INCR
8391 #undef EXTRACT_NUMBER
8392 #undef EXTRACT_NUMBER_AND_INCR
8393
8394 #undef DEBUG_PRINT_COMPILED_PATTERN
8395 #undef DEBUG_PRINT_DOUBLE_STRING
8396
8397 #undef INIT_FAIL_STACK
8398 #undef RESET_FAIL_STACK
8399 #undef DOUBLE_FAIL_STACK
8400 #undef PUSH_PATTERN_OP
8401 #undef PUSH_FAILURE_POINTER
8402 #undef PUSH_FAILURE_INT
8403 #undef PUSH_FAILURE_ELT
8404 #undef POP_FAILURE_POINTER
8405 #undef POP_FAILURE_INT
8406 #undef POP_FAILURE_ELT
8407 #undef DEBUG_PUSH
8408 #undef DEBUG_POP
8409 #undef PUSH_FAILURE_POINT
8410 #undef POP_FAILURE_POINT
8411
8412 #undef REG_UNSET_VALUE
8413 #undef REG_UNSET
8414
8415 #undef PATFETCH
8416 #undef PATFETCH_RAW
8417 #undef PATUNFETCH
8418 #undef TRANSLATE
8419
8420 #undef INIT_BUF_SIZE
8421 #undef GET_BUFFER_SPACE
8422 #undef BUF_PUSH
8423 #undef BUF_PUSH_2
8424 #undef BUF_PUSH_3
8425 #undef STORE_JUMP
8426 #undef STORE_JUMP2
8427 #undef INSERT_JUMP
8428 #undef INSERT_JUMP2
8429 #undef EXTEND_BUFFER
8430 #undef GET_UNSIGNED_NUMBER
8431 #undef FREE_STACK_RETURN
8432
8433 # undef POINTER_TO_OFFSET
8434 # undef MATCHING_IN_FRST_STRING
8435 # undef PREFETCH
8436 # undef AT_STRINGS_BEG
8437 # undef AT_STRINGS_END
8438 # undef WORDCHAR_P
8439 # undef FREE_VAR
8440 # undef FREE_VARIABLES
8441 # undef NO_HIGHEST_ACTIVE_REG
8442 # undef NO_LOWEST_ACTIVE_REG
8443
8444 # undef CHAR_T
8445 # undef UCHAR_T
8446 # undef COMPILED_BUFFER_VAR
8447 # undef OFFSET_ADDRESS_SIZE
8448 # undef CHAR_CLASS_SIZE
8449 # undef PREFIX
8450 # undef ARG_PREFIX
8451 # undef PUT_CHAR
8452 # undef BYTE
8453 # undef WCHAR
8454
8455 # define DEFINED_ONCE
8456