Home | History | Annotate | Line # | Download | only in libgrep
      1  1.1  christos /* dfa.c - deterministic extended regexp routines for GNU
      2  1.1  christos    Copyright 1988, 1998, 2000, 2005 Free Software Foundation, Inc.
      3  1.1  christos 
      4  1.1  christos    This program is free software; you can redistribute it and/or modify
      5  1.1  christos    it under the terms of the GNU General Public License as published by
      6  1.1  christos    the Free Software Foundation; either version 2, or (at your option)
      7  1.1  christos    any later version.
      8  1.1  christos 
      9  1.1  christos    This program is distributed in the hope that it will be useful,
     10  1.1  christos    but WITHOUT ANY WARRANTY; without even the implied warranty of
     11  1.1  christos    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     12  1.1  christos    GNU General Public License for more details.
     13  1.1  christos 
     14  1.1  christos    You should have received a copy of the GNU General Public License
     15  1.1  christos    along with this program; if not, write to the Free Software Foundation,
     16  1.1  christos    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-13017, USA */
     17  1.1  christos 
     18  1.1  christos /* Written June, 1988 by Mike Haertel
     19  1.1  christos    Modified July, 1988 by Arthur David Olson to assist BMG speedups  */
     20  1.1  christos 
     21  1.1  christos #ifdef HAVE_CONFIG_H
     22  1.1  christos #include <config.h>
     23  1.1  christos #endif
     24  1.1  christos 
     25  1.1  christos #include <assert.h>
     26  1.1  christos #include <ctype.h>
     27  1.1  christos #include <stdio.h>
     28  1.1  christos 
     29  1.1  christos #include <sys/types.h>
     30  1.1  christos #include <stdlib.h>
     31  1.1  christos #include <string.h>
     32  1.1  christos #include <locale.h>
     33  1.1  christos 
     34  1.1  christos #if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC
     35  1.1  christos /* We can handle multibyte string.  */
     36  1.1  christos # define MBS_SUPPORT
     37  1.1  christos #endif
     38  1.1  christos 
     39  1.1  christos #ifdef MBS_SUPPORT
     40  1.1  christos # include <wchar.h>
     41  1.1  christos # include <wctype.h>
     42  1.1  christos #endif
     43  1.1  christos 
     44  1.1  christos #ifndef DEBUG	/* use the same approach as regex.c */
     45  1.1  christos #undef assert
     46  1.1  christos #define assert(e)
     47  1.1  christos #endif /* DEBUG */
     48  1.1  christos 
     49  1.1  christos #ifndef isgraph
     50  1.1  christos #define isgraph(C) (isprint(C) && !isspace(C))
     51  1.1  christos #endif
     52  1.1  christos 
     53  1.1  christos #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
     54  1.1  christos #define ISALPHA(C) isalpha(C)
     55  1.1  christos #define ISUPPER(C) isupper(C)
     56  1.1  christos #define ISLOWER(C) islower(C)
     57  1.1  christos #define ISDIGIT(C) isdigit(C)
     58  1.1  christos #define ISXDIGIT(C) isxdigit(C)
     59  1.1  christos #define ISSPACE(C) isspace(C)
     60  1.1  christos #define ISPUNCT(C) ispunct(C)
     61  1.1  christos #define ISALNUM(C) isalnum(C)
     62  1.1  christos #define ISPRINT(C) isprint(C)
     63  1.1  christos #define ISGRAPH(C) isgraph(C)
     64  1.1  christos #define ISCNTRL(C) iscntrl(C)
     65  1.1  christos #else
     66  1.1  christos #define ISALPHA(C) (isascii(C) && isalpha(C))
     67  1.1  christos #define ISUPPER(C) (isascii(C) && isupper(C))
     68  1.1  christos #define ISLOWER(C) (isascii(C) && islower(C))
     69  1.1  christos #define ISDIGIT(C) (isascii(C) && isdigit(C))
     70  1.1  christos #define ISXDIGIT(C) (isascii(C) && isxdigit(C))
     71  1.1  christos #define ISSPACE(C) (isascii(C) && isspace(C))
     72  1.1  christos #define ISPUNCT(C) (isascii(C) && ispunct(C))
     73  1.1  christos #define ISALNUM(C) (isascii(C) && isalnum(C))
     74  1.1  christos #define ISPRINT(C) (isascii(C) && isprint(C))
     75  1.1  christos #define ISGRAPH(C) (isascii(C) && isgraph(C))
     76  1.1  christos #define ISCNTRL(C) (isascii(C) && iscntrl(C))
     77  1.1  christos #endif
     78  1.1  christos 
     79  1.1  christos /* ISASCIIDIGIT differs from ISDIGIT, as follows:
     80  1.1  christos    - Its arg may be any int or unsigned int; it need not be an unsigned char.
     81  1.1  christos    - It's guaranteed to evaluate its argument exactly once.
     82  1.1  christos    - It's typically faster.
     83  1.1  christos    Posix 1003.2-1992 section 2.5.2.1 page 50 lines 1556-1558 says that
     84  1.1  christos    only '0' through '9' are digits.  Prefer ISASCIIDIGIT to ISDIGIT unless
     85  1.1  christos    it's important to use the locale's definition of `digit' even when the
     86  1.1  christos    host does not conform to Posix.  */
     87  1.1  christos #define ISASCIIDIGIT(c) ((unsigned) (c) - '0' <= 9)
     88  1.1  christos 
     89  1.1  christos #include "regex.h"
     90  1.1  christos #include "dfa.h"
     91  1.1  christos #include "hard-locale.h"
     92  1.1  christos #include "gettext.h"
     93  1.1  christos #define _(str) gettext (str)
     94  1.1  christos 
     95  1.1  christos /* HPUX, define those as macros in sys/param.h */
     96  1.1  christos #ifdef setbit
     97  1.1  christos # undef setbit
     98  1.1  christos #endif
     99  1.1  christos #ifdef clrbit
    100  1.1  christos # undef clrbit
    101  1.1  christos #endif
    102  1.1  christos 
    103  1.1  christos static void dfamust (struct dfa *dfa);
    104  1.1  christos static void regexp (int toplevel);
    105  1.1  christos 
    106  1.1  christos static ptr_t
    107  1.1  christos xcalloc (size_t n, size_t s)
    108  1.1  christos {
    109  1.1  christos   ptr_t r = calloc(n, s);
    110  1.1  christos 
    111  1.1  christos   if (!r)
    112  1.1  christos     dfaerror(_("Memory exhausted"));
    113  1.1  christos   return r;
    114  1.1  christos }
    115  1.1  christos 
    116  1.1  christos static ptr_t
    117  1.1  christos xmalloc (size_t n)
    118  1.1  christos {
    119  1.1  christos   ptr_t r = malloc(n);
    120  1.1  christos 
    121  1.1  christos   assert(n != 0);
    122  1.1  christos   if (!r)
    123  1.1  christos     dfaerror(_("Memory exhausted"));
    124  1.1  christos   return r;
    125  1.1  christos }
    126  1.1  christos 
    127  1.1  christos static ptr_t
    128  1.1  christos xrealloc (ptr_t p, size_t n)
    129  1.1  christos {
    130  1.1  christos   ptr_t r = realloc(p, n);
    131  1.1  christos 
    132  1.1  christos   assert(n != 0);
    133  1.1  christos   if (!r)
    134  1.1  christos     dfaerror(_("Memory exhausted"));
    135  1.1  christos   return r;
    136  1.1  christos }
    137  1.1  christos 
    138  1.1  christos #define CALLOC(p, t, n) ((p) = (t *) xcalloc((size_t)(n), sizeof (t)))
    139  1.1  christos #define MALLOC(p, t, n) ((p) = (t *) xmalloc((n) * sizeof (t)))
    140  1.1  christos #define REALLOC(p, t, n) ((p) = (t *) xrealloc((ptr_t) (p), (n) * sizeof (t)))
    141  1.1  christos 
    142  1.1  christos /* Reallocate an array of type t if nalloc is too small for index. */
    143  1.1  christos #define REALLOC_IF_NECESSARY(p, t, nalloc, index) \
    144  1.1  christos   if ((index) >= (nalloc))			  \
    145  1.1  christos     {						  \
    146  1.1  christos       do					  \
    147  1.1  christos 	(nalloc) *= 2;				  \
    148  1.1  christos       while ((index) >= (nalloc));		  \
    149  1.1  christos       REALLOC(p, t, nalloc);			  \
    150  1.1  christos     }
    151  1.1  christos 
    152  1.1  christos #ifdef DEBUG
    153  1.1  christos 
    154  1.1  christos static void
    155  1.1  christos prtok (token t)
    156  1.1  christos {
    157  1.1  christos   char const *s;
    158  1.1  christos 
    159  1.1  christos   if (t < 0)
    160  1.1  christos     fprintf(stderr, "END");
    161  1.1  christos   else if (t < NOTCHAR)
    162  1.1  christos     fprintf(stderr, "%c", t);
    163  1.1  christos   else
    164  1.1  christos     {
    165  1.1  christos       switch (t)
    166  1.1  christos 	{
    167  1.1  christos 	case EMPTY: s = "EMPTY"; break;
    168  1.1  christos 	case BACKREF: s = "BACKREF"; break;
    169  1.1  christos 	case BEGLINE: s = "BEGLINE"; break;
    170  1.1  christos 	case ENDLINE: s = "ENDLINE"; break;
    171  1.1  christos 	case BEGWORD: s = "BEGWORD"; break;
    172  1.1  christos 	case ENDWORD: s = "ENDWORD"; break;
    173  1.1  christos 	case LIMWORD: s = "LIMWORD"; break;
    174  1.1  christos 	case NOTLIMWORD: s = "NOTLIMWORD"; break;
    175  1.1  christos 	case QMARK: s = "QMARK"; break;
    176  1.1  christos 	case STAR: s = "STAR"; break;
    177  1.1  christos 	case PLUS: s = "PLUS"; break;
    178  1.1  christos 	case CAT: s = "CAT"; break;
    179  1.1  christos 	case OR: s = "OR"; break;
    180  1.1  christos 	case ORTOP: s = "ORTOP"; break;
    181  1.1  christos 	case LPAREN: s = "LPAREN"; break;
    182  1.1  christos 	case RPAREN: s = "RPAREN"; break;
    183  1.1  christos 	case CRANGE: s = "CRANGE"; break;
    184  1.1  christos #ifdef MBS_SUPPORT
    185  1.1  christos 	case ANYCHAR: s = "ANYCHAR"; break;
    186  1.1  christos 	case MBCSET: s = "MBCSET"; break;
    187  1.1  christos #endif /* MBS_SUPPORT */
    188  1.1  christos 	default: s = "CSET"; break;
    189  1.1  christos 	}
    190  1.1  christos       fprintf(stderr, "%s", s);
    191  1.1  christos     }
    192  1.1  christos }
    193  1.1  christos #endif /* DEBUG */
    194  1.1  christos 
    195  1.1  christos /* Stuff pertaining to charclasses. */
    196  1.1  christos 
    197  1.1  christos static int
    198  1.1  christos tstbit (unsigned b, charclass c)
    199  1.1  christos {
    200  1.1  christos   return c[b / INTBITS] & 1 << b % INTBITS;
    201  1.1  christos }
    202  1.1  christos 
    203  1.1  christos static void
    204  1.1  christos setbit (unsigned b, charclass c)
    205  1.1  christos {
    206  1.1  christos   c[b / INTBITS] |= 1 << b % INTBITS;
    207  1.1  christos }
    208  1.1  christos 
    209  1.1  christos static void
    210  1.1  christos clrbit (unsigned b, charclass c)
    211  1.1  christos {
    212  1.1  christos   c[b / INTBITS] &= ~(1 << b % INTBITS);
    213  1.1  christos }
    214  1.1  christos 
    215  1.1  christos static void
    216  1.1  christos copyset (charclass src, charclass dst)
    217  1.1  christos {
    218  1.1  christos   memcpy (dst, src, sizeof (charclass));
    219  1.1  christos }
    220  1.1  christos 
    221  1.1  christos static void
    222  1.1  christos zeroset (charclass s)
    223  1.1  christos {
    224  1.1  christos   memset (s, 0, sizeof (charclass));
    225  1.1  christos }
    226  1.1  christos 
    227  1.1  christos static void
    228  1.1  christos notset (charclass s)
    229  1.1  christos {
    230  1.1  christos   int i;
    231  1.1  christos 
    232  1.1  christos   for (i = 0; i < CHARCLASS_INTS; ++i)
    233  1.1  christos     s[i] = ~s[i];
    234  1.1  christos }
    235  1.1  christos 
    236  1.1  christos static int
    237  1.1  christos equal (charclass s1, charclass s2)
    238  1.1  christos {
    239  1.1  christos   return memcmp (s1, s2, sizeof (charclass)) == 0;
    240  1.1  christos }
    241  1.1  christos 
    242  1.1  christos /* A pointer to the current dfa is kept here during parsing. */
    243  1.1  christos static struct dfa *dfa;
    244  1.1  christos 
    245  1.1  christos /* Find the index of charclass s in dfa->charclasses, or allocate a new charclass. */
    246  1.1  christos static int
    247  1.1  christos charclass_index (charclass s)
    248  1.1  christos {
    249  1.1  christos   int i;
    250  1.1  christos 
    251  1.1  christos   for (i = 0; i < dfa->cindex; ++i)
    252  1.1  christos     if (equal(s, dfa->charclasses[i]))
    253  1.1  christos       return i;
    254  1.1  christos   REALLOC_IF_NECESSARY(dfa->charclasses, charclass, dfa->calloc, dfa->cindex);
    255  1.1  christos   ++dfa->cindex;
    256  1.1  christos   copyset(s, dfa->charclasses[i]);
    257  1.1  christos   return i;
    258  1.1  christos }
    259  1.1  christos 
    260  1.1  christos /* Syntax bits controlling the behavior of the lexical analyzer. */
    261  1.1  christos static reg_syntax_t syntax_bits, syntax_bits_set;
    262  1.1  christos 
    263  1.1  christos /* Flag for case-folding letters into sets. */
    264  1.1  christos static int case_fold;
    265  1.1  christos 
    266  1.1  christos /* End-of-line byte in data.  */
    267  1.1  christos static unsigned char eolbyte;
    268  1.1  christos 
    269  1.1  christos /* Entry point to set syntax options. */
    270  1.1  christos void
    271  1.1  christos dfasyntax (reg_syntax_t bits, int fold, unsigned char eol)
    272  1.1  christos {
    273  1.1  christos   syntax_bits_set = 1;
    274  1.1  christos   syntax_bits = bits;
    275  1.1  christos   case_fold = fold;
    276  1.1  christos   eolbyte = eol;
    277  1.1  christos }
    278  1.1  christos 
    279  1.1  christos /* Like setbit, but if case is folded, set both cases of a letter.  */
    280  1.1  christos static void
    281  1.1  christos setbit_case_fold (unsigned b, charclass c)
    282  1.1  christos {
    283  1.1  christos   setbit (b, c);
    284  1.1  christos   if (case_fold)
    285  1.1  christos     {
    286  1.1  christos       if (ISUPPER (b))
    287  1.1  christos 	setbit (tolower (b), c);
    288  1.1  christos       else if (ISLOWER (b))
    289  1.1  christos 	setbit (toupper (b), c);
    290  1.1  christos     }
    291  1.1  christos }
    292  1.1  christos 
    293  1.1  christos /* Lexical analyzer.  All the dross that deals with the obnoxious
    294  1.1  christos    GNU Regex syntax bits is located here.  The poor, suffering
    295  1.1  christos    reader is referred to the GNU Regex documentation for the
    296  1.1  christos    meaning of the @#%!@#%^!@ syntax bits. */
    297  1.1  christos 
    298  1.1  christos static char const *lexstart;	/* Pointer to beginning of input string. */
    299  1.1  christos static char const *lexptr;	/* Pointer to next input character. */
    300  1.1  christos static int lexleft;		/* Number of characters remaining. */
    301  1.1  christos static token lasttok;		/* Previous token returned; initially END. */
    302  1.1  christos static int laststart;		/* True if we're separated from beginning or (, |
    303  1.1  christos 				   only by zero-width characters. */
    304  1.1  christos static int parens;		/* Count of outstanding left parens. */
    305  1.1  christos static int minrep, maxrep;	/* Repeat counts for {m,n}. */
    306  1.1  christos static int hard_LC_COLLATE;	/* Nonzero if LC_COLLATE is hard.  */
    307  1.1  christos 
    308  1.1  christos #ifdef MBS_SUPPORT
    309  1.1  christos /* These variables are used only if (MB_CUR_MAX > 1).  */
    310  1.1  christos static mbstate_t mbs;		/* Mbstate for mbrlen().  */
    311  1.1  christos static int cur_mb_len;		/* Byte length of the current scanning
    312  1.1  christos 				   multibyte character.  */
    313  1.1  christos static int cur_mb_index;        /* Byte index of the current scanning multibyte
    314  1.1  christos                                    character.
    315  1.1  christos 
    316  1.1  christos 				   singlebyte character : cur_mb_index = 0
    317  1.1  christos 				   multibyte character
    318  1.1  christos 				       1st byte : cur_mb_index = 1
    319  1.1  christos 				       2nd byte : cur_mb_index = 2
    320  1.1  christos 				         ...
    321  1.1  christos 				       nth byte : cur_mb_index = n  */
    322  1.1  christos static unsigned char *mblen_buf;/* Correspond to the input buffer in dfaexec().
    323  1.1  christos                                   Each element store the amount of remain
    324  1.1  christos                                   byte of corresponding multibyte character
    325  1.1  christos                                   in the input string.  A element's value
    326  1.1  christos                                   is 0 if corresponding character is a
    327  1.1  christos                                   singlebyte chracter.
    328  1.1  christos                                   e.g. input : 'a', <mb(0)>, <mb(1)>, <mb(2)>
    329  1.1  christos                                    mblen_buf :   0,       3,       2,       1
    330  1.1  christos                                */
    331  1.1  christos static wchar_t *inputwcs;	/* Wide character representation of input
    332  1.1  christos 				   string in dfaexec().
    333  1.1  christos 				   The length of this array is same as
    334  1.1  christos 				   the length of input string(char array).
    335  1.1  christos 				   inputstring[i] is a single-byte char,
    336  1.1  christos 				   or 1st byte of a multibyte char.
    337  1.1  christos 				   And inputwcs[i] is the codepoint.  */
    338  1.1  christos static unsigned char const *buf_begin;/* refference to begin in dfaexec().  */
    339  1.1  christos static unsigned char const *buf_end;	/* refference to end in dfaexec().  */
    340  1.1  christos #endif /* MBS_SUPPORT  */
    341  1.1  christos 
    342  1.1  christos #ifdef MBS_SUPPORT
    343  1.1  christos /* This function update cur_mb_len, and cur_mb_index.
    344  1.1  christos    p points current lexptr, len is the remaining buffer length.  */
    345  1.1  christos static void
    346  1.1  christos update_mb_len_index (char const *p, int len)
    347  1.1  christos {
    348  1.1  christos   /* If last character is a part of a multibyte character,
    349  1.1  christos      we update cur_mb_index.  */
    350  1.1  christos   if (cur_mb_index)
    351  1.1  christos     cur_mb_index = (cur_mb_index >= cur_mb_len)? 0
    352  1.1  christos 			: cur_mb_index + 1;
    353  1.1  christos 
    354  1.1  christos   /* If last character is a single byte character, or the
    355  1.1  christos      last portion of a multibyte character, we check whether
    356  1.1  christos      next character is a multibyte character or not.  */
    357  1.1  christos   if (! cur_mb_index)
    358  1.1  christos     {
    359  1.1  christos       cur_mb_len = mbrlen(p, len, &mbs);
    360  1.1  christos       if (cur_mb_len > 1)
    361  1.1  christos 	/* It is a multibyte character.
    362  1.1  christos 	   cur_mb_len was already set by mbrlen().  */
    363  1.1  christos 	cur_mb_index = 1;
    364  1.1  christos       else if (cur_mb_len < 1)
    365  1.1  christos 	/* Invalid sequence.  We treat it as a singlebyte character.
    366  1.1  christos 	   cur_mb_index is aleady 0.  */
    367  1.1  christos 	cur_mb_len = 1;
    368  1.1  christos       /* Otherwise, cur_mb_len == 1, it is a singlebyte character.
    369  1.1  christos 	 cur_mb_index is aleady 0.  */
    370  1.1  christos     }
    371  1.1  christos }
    372  1.1  christos #endif /* MBS_SUPPORT */
    373  1.1  christos 
    374  1.1  christos #ifdef MBS_SUPPORT
    375  1.1  christos /* Note that characters become unsigned here. */
    376  1.1  christos # define FETCH(c, eoferr)			\
    377  1.1  christos   {						\
    378  1.1  christos     if (! lexleft)				\
    379  1.1  christos      {						\
    380  1.1  christos 	if (eoferr != 0)			\
    381  1.1  christos 	  dfaerror (eoferr);			\
    382  1.1  christos 	else					\
    383  1.1  christos 	  return lasttok = END;			\
    384  1.1  christos       }						\
    385  1.1  christos     if (MB_CUR_MAX > 1)				\
    386  1.1  christos       update_mb_len_index(lexptr, lexleft);	\
    387  1.1  christos     (c) = (unsigned char) *lexptr++;		\
    388  1.1  christos     --lexleft;					\
    389  1.1  christos   }
    390  1.1  christos 
    391  1.1  christos /* This function fetch a wide character, and update cur_mb_len,
    392  1.1  christos    used only if the current locale is a multibyte environment.  */
    393  1.1  christos static wchar_t
    394  1.1  christos fetch_wc (char const *eoferr)
    395  1.1  christos {
    396  1.1  christos   wchar_t wc;
    397  1.1  christos   if (! lexleft)
    398  1.1  christos     {
    399  1.1  christos       if (eoferr != 0)
    400  1.1  christos 	dfaerror (eoferr);
    401  1.1  christos       else
    402  1.1  christos 	return -1;
    403  1.1  christos     }
    404  1.1  christos 
    405  1.1  christos   cur_mb_len = mbrtowc(&wc, lexptr, lexleft, &mbs);
    406  1.1  christos   if (cur_mb_len <= 0)
    407  1.1  christos    {
    408  1.1  christos       cur_mb_len = 1;
    409  1.1  christos       wc = *lexptr;
    410  1.1  christos     }
    411  1.1  christos   lexptr += cur_mb_len;
    412  1.1  christos   lexleft -= cur_mb_len;
    413  1.1  christos   return wc;
    414  1.1  christos }
    415  1.1  christos #else
    416  1.1  christos /* Note that characters become unsigned here. */
    417  1.1  christos # define FETCH(c, eoferr)   	      \
    418  1.1  christos   {			   	      \
    419  1.1  christos     if (! lexleft)	   	      \
    420  1.1  christos       {				      \
    421  1.1  christos 	if (eoferr != 0)	      \
    422  1.1  christos 	  dfaerror (eoferr);	      \
    423  1.1  christos 	else		   	      \
    424  1.1  christos 	  return lasttok = END;	      \
    425  1.1  christos       }				      \
    426  1.1  christos     (c) = (unsigned char) *lexptr++;  \
    427  1.1  christos     --lexleft;		   	      \
    428  1.1  christos   }
    429  1.1  christos #endif /* MBS_SUPPORT */
    430  1.1  christos 
    431  1.1  christos #ifdef MBS_SUPPORT
    432  1.1  christos /* Multibyte character handling sub-routin for lex.
    433  1.1  christos    This function  parse a bracket expression and build a struct
    434  1.1  christos    mb_char_classes.  */
    435  1.1  christos static void
    436  1.1  christos parse_bracket_exp_mb ()
    437  1.1  christos {
    438  1.1  christos   wchar_t wc, wc1, wc2;
    439  1.1  christos 
    440  1.1  christos   /* Work area to build a mb_char_classes.  */
    441  1.1  christos   struct mb_char_classes *work_mbc;
    442  1.1  christos   int chars_al, range_sts_al, range_ends_al, ch_classes_al,
    443  1.1  christos     equivs_al, coll_elems_al;
    444  1.1  christos 
    445  1.1  christos   REALLOC_IF_NECESSARY(dfa->mbcsets, struct mb_char_classes,
    446  1.1  christos 		       dfa->mbcsets_alloc, dfa->nmbcsets + 1);
    447  1.1  christos   /* dfa->multibyte_prop[] hold the index of dfa->mbcsets.
    448  1.1  christos      We will update dfa->multibyte_prop in addtok(), because we can't
    449  1.1  christos      decide the index in dfa->tokens[].  */
    450  1.1  christos 
    451  1.1  christos   /* Initialize work are */
    452  1.1  christos   work_mbc = &(dfa->mbcsets[dfa->nmbcsets++]);
    453  1.1  christos 
    454  1.1  christos   chars_al = 1;
    455  1.1  christos   range_sts_al = range_ends_al = 0;
    456  1.1  christos   ch_classes_al = equivs_al = coll_elems_al = 0;
    457  1.1  christos   MALLOC(work_mbc->chars, wchar_t, chars_al);
    458  1.1  christos 
    459  1.1  christos   work_mbc->nchars = work_mbc->nranges = work_mbc->nch_classes = 0;
    460  1.1  christos   work_mbc->nequivs = work_mbc->ncoll_elems = 0;
    461  1.1  christos   work_mbc->chars = NULL;
    462  1.1  christos   work_mbc->ch_classes = NULL;
    463  1.1  christos   work_mbc->range_sts = work_mbc->range_ends = NULL;
    464  1.1  christos   work_mbc->equivs = work_mbc->coll_elems = NULL;
    465  1.1  christos 
    466  1.1  christos   wc = fetch_wc(_("Unbalanced ["));
    467  1.1  christos   if (wc == L'^')
    468  1.1  christos     {
    469  1.1  christos       wc = fetch_wc(_("Unbalanced ["));
    470  1.1  christos       work_mbc->invert = 1;
    471  1.1  christos     }
    472  1.1  christos   else
    473  1.1  christos     work_mbc->invert = 0;
    474  1.1  christos   do
    475  1.1  christos     {
    476  1.1  christos       wc1 = -1; /* mark wc1 is not initialized".  */
    477  1.1  christos 
    478  1.1  christos       /* Note that if we're looking at some other [:...:] construct,
    479  1.1  christos 	 we just treat it as a bunch of ordinary characters.  We can do
    480  1.1  christos 	 this because we assume regex has checked for syntax errors before
    481  1.1  christos 	 dfa is ever called. */
    482  1.1  christos       if (wc == L'[' && (syntax_bits & RE_CHAR_CLASSES))
    483  1.1  christos 	{
    484  1.1  christos #define BRACKET_BUFFER_SIZE 128
    485  1.1  christos 	  char str[BRACKET_BUFFER_SIZE];
    486  1.1  christos 	  wc1 = wc;
    487  1.1  christos 	  wc = fetch_wc(_("Unbalanced ["));
    488  1.1  christos 
    489  1.1  christos 	  /* If pattern contains `[[:', `[[.', or `[[='.  */
    490  1.1  christos 	  if (cur_mb_len == 1 && (wc == L':' || wc == L'.' || wc == L'='))
    491  1.1  christos 	    {
    492  1.1  christos 	      unsigned char c;
    493  1.1  christos 	      unsigned char delim = (unsigned char)wc;
    494  1.1  christos 	      int len = 0;
    495  1.1  christos 	      for (;;)
    496  1.1  christos 		{
    497  1.1  christos 		  if (! lexleft)
    498  1.1  christos 		    dfaerror (_("Unbalanced ["));
    499  1.1  christos 		  c = (unsigned char) *lexptr++;
    500  1.1  christos 		  --lexleft;
    501  1.1  christos 
    502  1.1  christos 		  if ((c == delim && *lexptr == ']') || lexleft == 0)
    503  1.1  christos 		    break;
    504  1.1  christos 		  if (len < BRACKET_BUFFER_SIZE)
    505  1.1  christos 		    str[len++] = c;
    506  1.1  christos 		  else
    507  1.1  christos 		    /* This is in any case an invalid class name.  */
    508  1.1  christos 		    str[0] = '\0';
    509  1.1  christos 		}
    510  1.1  christos 	      str[len] = '\0';
    511  1.1  christos 
    512  1.1  christos 	      if (lexleft == 0)
    513  1.1  christos 		{
    514  1.1  christos 		  REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
    515  1.1  christos 				       work_mbc->nchars + 2);
    516  1.1  christos 		  work_mbc->chars[work_mbc->nchars++] = L'[';
    517  1.1  christos 		  work_mbc->chars[work_mbc->nchars++] = delim;
    518  1.1  christos 		  break;
    519  1.1  christos 		}
    520  1.1  christos 
    521  1.1  christos 	      if (--lexleft, *lexptr++ != ']')
    522  1.1  christos 		dfaerror (_("Unbalanced ["));
    523  1.1  christos 	      if (delim == ':')
    524  1.1  christos 		/* build character class.  */
    525  1.1  christos 		{
    526  1.1  christos 		  wctype_t wt;
    527  1.1  christos 		  /* Query the character class as wctype_t.  */
    528  1.1  christos 		  wt = wctype (str);
    529  1.1  christos 
    530  1.1  christos 		  if (ch_classes_al == 0)
    531  1.1  christos 		    MALLOC(work_mbc->ch_classes, wctype_t, ++ch_classes_al);
    532  1.1  christos 		  REALLOC_IF_NECESSARY(work_mbc->ch_classes, wctype_t,
    533  1.1  christos 				       ch_classes_al,
    534  1.1  christos 				       work_mbc->nch_classes + 1);
    535  1.1  christos 		  work_mbc->ch_classes[work_mbc->nch_classes++] = wt;
    536  1.1  christos 
    537  1.1  christos  		}
    538  1.1  christos 	      else if (delim == '=' || delim == '.')
    539  1.1  christos 		{
    540  1.1  christos 		  char *elem;
    541  1.1  christos 		  MALLOC(elem, char, len + 1);
    542  1.1  christos 		  strncpy(elem, str, len + 1);
    543  1.1  christos 
    544  1.1  christos 		  if (delim == '=')
    545  1.1  christos 		    /* build equivalent class.  */
    546  1.1  christos 		    {
    547  1.1  christos 		      if (equivs_al == 0)
    548  1.1  christos 			MALLOC(work_mbc->equivs, char*, ++equivs_al);
    549  1.1  christos 		      REALLOC_IF_NECESSARY(work_mbc->equivs, char*,
    550  1.1  christos 					   equivs_al,
    551  1.1  christos 					   work_mbc->nequivs + 1);
    552  1.1  christos 		      work_mbc->equivs[work_mbc->nequivs++] = elem;
    553  1.1  christos 		    }
    554  1.1  christos 
    555  1.1  christos 		  if (delim == '.')
    556  1.1  christos 		    /* build collating element.  */
    557  1.1  christos 		    {
    558  1.1  christos 		      if (coll_elems_al == 0)
    559  1.1  christos 			MALLOC(work_mbc->coll_elems, char*, ++coll_elems_al);
    560  1.1  christos 		      REALLOC_IF_NECESSARY(work_mbc->coll_elems, char*,
    561  1.1  christos 					   coll_elems_al,
    562  1.1  christos 					   work_mbc->ncoll_elems + 1);
    563  1.1  christos 		      work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem;
    564  1.1  christos 		    }
    565  1.1  christos  		}
    566  1.1  christos 	      wc = -1;
    567  1.1  christos 	    }
    568  1.1  christos 	  else
    569  1.1  christos 	    /* We treat '[' as a normal character here.  */
    570  1.1  christos 	    {
    571  1.1  christos 	      wc2 = wc1; wc1 = wc; wc = wc2; /* swap */
    572  1.1  christos 	    }
    573  1.1  christos 	}
    574  1.1  christos       else
    575  1.1  christos 	{
    576  1.1  christos 	  if (wc == L'\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
    577  1.1  christos 	    wc = fetch_wc(("Unbalanced ["));
    578  1.1  christos 	}
    579  1.1  christos 
    580  1.1  christos       if (wc1 == -1)
    581  1.1  christos 	wc1 = fetch_wc(_("Unbalanced ["));
    582  1.1  christos 
    583  1.1  christos       if (wc1 == L'-')
    584  1.1  christos 	/* build range characters.  */
    585  1.1  christos 	{
    586  1.1  christos 	  wc2 = fetch_wc(_("Unbalanced ["));
    587  1.1  christos 	  if (wc2 == L']')
    588  1.1  christos 	    {
    589  1.1  christos 	      /* In the case [x-], the - is an ordinary hyphen,
    590  1.1  christos 		 which is left in c1, the lookahead character. */
    591  1.1  christos 	      lexptr -= cur_mb_len;
    592  1.1  christos 	      lexleft += cur_mb_len;
    593  1.1  christos 	      wc2 = wc;
    594  1.1  christos 	    }
    595  1.1  christos 	  else
    596  1.1  christos 	    {
    597  1.1  christos 	      if (wc2 == L'\\'
    598  1.1  christos 		  && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
    599  1.1  christos 		wc2 = fetch_wc(_("Unbalanced ["));
    600  1.1  christos 	      wc1 = fetch_wc(_("Unbalanced ["));
    601  1.1  christos 	    }
    602  1.1  christos 
    603  1.1  christos 	  if (range_sts_al == 0)
    604  1.1  christos 	    {
    605  1.1  christos 	      MALLOC(work_mbc->range_sts, wchar_t, ++range_sts_al);
    606  1.1  christos 	      MALLOC(work_mbc->range_ends, wchar_t, ++range_ends_al);
    607  1.1  christos 	    }
    608  1.1  christos 	  REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t,
    609  1.1  christos 			       range_sts_al, work_mbc->nranges + 1);
    610  1.1  christos 	  work_mbc->range_sts[work_mbc->nranges] = wc;
    611  1.1  christos 	  REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
    612  1.1  christos 			       range_ends_al, work_mbc->nranges + 1);
    613  1.1  christos 	  work_mbc->range_ends[work_mbc->nranges++] = wc2;
    614  1.1  christos 	}
    615  1.1  christos       else if (wc != -1)
    616  1.1  christos 	/* build normal characters.  */
    617  1.1  christos 	{
    618  1.1  christos 	  REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
    619  1.1  christos 			       work_mbc->nchars + 1);
    620  1.1  christos 	  work_mbc->chars[work_mbc->nchars++] = wc;
    621  1.1  christos 	}
    622  1.1  christos     }
    623  1.1  christos   while ((wc = wc1) != L']');
    624  1.1  christos }
    625  1.1  christos #endif /* MBS_SUPPORT */
    626  1.1  christos 
    627  1.1  christos #ifdef __STDC__
    628  1.1  christos #define FUNC(F, P) static int F(int c) { return P(c); }
    629  1.1  christos #else
    630  1.1  christos #define FUNC(F, P) static int F(c) int c; { return P(c); }
    631  1.1  christos #endif
    632  1.1  christos 
    633  1.1  christos FUNC(is_alpha, ISALPHA)
    634  1.1  christos FUNC(is_upper, ISUPPER)
    635  1.1  christos FUNC(is_lower, ISLOWER)
    636  1.1  christos FUNC(is_digit, ISDIGIT)
    637  1.1  christos FUNC(is_xdigit, ISXDIGIT)
    638  1.1  christos FUNC(is_space, ISSPACE)
    639  1.1  christos FUNC(is_punct, ISPUNCT)
    640  1.1  christos FUNC(is_alnum, ISALNUM)
    641  1.1  christos FUNC(is_print, ISPRINT)
    642  1.1  christos FUNC(is_graph, ISGRAPH)
    643  1.1  christos FUNC(is_cntrl, ISCNTRL)
    644  1.1  christos 
    645  1.1  christos static int
    646  1.1  christos is_blank (int c)
    647  1.1  christos {
    648  1.1  christos    return (c == ' ' || c == '\t');
    649  1.1  christos }
    650  1.1  christos 
    651  1.1  christos /* The following list maps the names of the Posix named character classes
    652  1.1  christos    to predicate functions that determine whether a given character is in
    653  1.1  christos    the class.  The leading [ has already been eaten by the lexical analyzer. */
    654  1.1  christos static struct {
    655  1.1  christos   const char *name;
    656  1.1  christos   int (*pred) (int);
    657  1.1  christos } const prednames[] = {
    658  1.1  christos   { ":alpha:]", is_alpha },
    659  1.1  christos   { ":upper:]", is_upper },
    660  1.1  christos   { ":lower:]", is_lower },
    661  1.1  christos   { ":digit:]", is_digit },
    662  1.1  christos   { ":xdigit:]", is_xdigit },
    663  1.1  christos   { ":space:]", is_space },
    664  1.1  christos   { ":punct:]", is_punct },
    665  1.1  christos   { ":alnum:]", is_alnum },
    666  1.1  christos   { ":print:]", is_print },
    667  1.1  christos   { ":graph:]", is_graph },
    668  1.1  christos   { ":cntrl:]", is_cntrl },
    669  1.1  christos   { ":blank:]", is_blank },
    670  1.1  christos   { 0 }
    671  1.1  christos };
    672  1.1  christos 
    673  1.1  christos /* Return non-zero if C is a `word-constituent' byte; zero otherwise.  */
    674  1.1  christos #define IS_WORD_CONSTITUENT(C) (ISALNUM(C) || (C) == '_')
    675  1.1  christos 
    676  1.1  christos static int
    677  1.1  christos looking_at (char const *s)
    678  1.1  christos {
    679  1.1  christos   size_t len;
    680  1.1  christos 
    681  1.1  christos   len = strlen(s);
    682  1.1  christos   if (lexleft < len)
    683  1.1  christos     return 0;
    684  1.1  christos   return strncmp(s, lexptr, len) == 0;
    685  1.1  christos }
    686  1.1  christos 
    687  1.1  christos static token
    688  1.1  christos lex (void)
    689  1.1  christos {
    690  1.1  christos   unsigned c, c1, c2;
    691  1.1  christos   int backslash = 0, invert;
    692  1.1  christos   charclass ccl;
    693  1.1  christos   int i;
    694  1.1  christos 
    695  1.1  christos   /* Basic plan: We fetch a character.  If it's a backslash,
    696  1.1  christos      we set the backslash flag and go through the loop again.
    697  1.1  christos      On the plus side, this avoids having a duplicate of the
    698  1.1  christos      main switch inside the backslash case.  On the minus side,
    699  1.1  christos      it means that just about every case begins with
    700  1.1  christos      "if (backslash) ...".  */
    701  1.1  christos   for (i = 0; i < 2; ++i)
    702  1.1  christos     {
    703  1.1  christos       FETCH(c, 0);
    704  1.1  christos #ifdef MBS_SUPPORT
    705  1.1  christos       if (MB_CUR_MAX > 1 && cur_mb_index)
    706  1.1  christos 	/* If this is a part of a multi-byte character, we must treat
    707  1.1  christos 	   this byte data as a normal character.
    708  1.1  christos 	   e.g. In case of SJIS encoding, some character contains '\',
    709  1.1  christos 	        but they must not be backslash.  */
    710  1.1  christos 	goto normal_char;
    711  1.1  christos #endif /* MBS_SUPPORT  */
    712  1.1  christos       switch (c)
    713  1.1  christos 	{
    714  1.1  christos 	case '\\':
    715  1.1  christos 	  if (backslash)
    716  1.1  christos 	    goto normal_char;
    717  1.1  christos 	  if (lexleft == 0)
    718  1.1  christos 	    dfaerror(_("Unfinished \\ escape"));
    719  1.1  christos 	  backslash = 1;
    720  1.1  christos 	  break;
    721  1.1  christos 
    722  1.1  christos 	case '^':
    723  1.1  christos 	  if (backslash)
    724  1.1  christos 	    goto normal_char;
    725  1.1  christos 	  if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
    726  1.1  christos 	      || lasttok == END
    727  1.1  christos 	      || lasttok == LPAREN
    728  1.1  christos 	      || lasttok == OR)
    729  1.1  christos 	    return lasttok = BEGLINE;
    730  1.1  christos 	  goto normal_char;
    731  1.1  christos 
    732  1.1  christos 	case '$':
    733  1.1  christos 	  if (backslash)
    734  1.1  christos 	    goto normal_char;
    735  1.1  christos 	  if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
    736  1.1  christos 	      || lexleft == 0
    737  1.1  christos 	      || (syntax_bits & RE_NO_BK_PARENS
    738  1.1  christos 		  ? lexleft > 0 && *lexptr == ')'
    739  1.1  christos 		  : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == ')')
    740  1.1  christos 	      || (syntax_bits & RE_NO_BK_VBAR
    741  1.1  christos 		  ? lexleft > 0 && *lexptr == '|'
    742  1.1  christos 		  : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == '|')
    743  1.1  christos 	      || ((syntax_bits & RE_NEWLINE_ALT)
    744  1.1  christos 	          && lexleft > 0 && *lexptr == '\n'))
    745  1.1  christos 	    return lasttok = ENDLINE;
    746  1.1  christos 	  goto normal_char;
    747  1.1  christos 
    748  1.1  christos 	case '1':
    749  1.1  christos 	case '2':
    750  1.1  christos 	case '3':
    751  1.1  christos 	case '4':
    752  1.1  christos 	case '5':
    753  1.1  christos 	case '6':
    754  1.1  christos 	case '7':
    755  1.1  christos 	case '8':
    756  1.1  christos 	case '9':
    757  1.1  christos 	  if (backslash && !(syntax_bits & RE_NO_BK_REFS))
    758  1.1  christos 	    {
    759  1.1  christos 	      laststart = 0;
    760  1.1  christos 	      return lasttok = BACKREF;
    761  1.1  christos 	    }
    762  1.1  christos 	  goto normal_char;
    763  1.1  christos 
    764  1.1  christos 	case '`':
    765  1.1  christos 	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
    766  1.1  christos 	    return lasttok = BEGLINE;	/* FIXME: should be beginning of string */
    767  1.1  christos 	  goto normal_char;
    768  1.1  christos 
    769  1.1  christos 	case '\'':
    770  1.1  christos 	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
    771  1.1  christos 	    return lasttok = ENDLINE;	/* FIXME: should be end of string */
    772  1.1  christos 	  goto normal_char;
    773  1.1  christos 
    774  1.1  christos 	case '<':
    775  1.1  christos 	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
    776  1.1  christos 	    return lasttok = BEGWORD;
    777  1.1  christos 	  goto normal_char;
    778  1.1  christos 
    779  1.1  christos 	case '>':
    780  1.1  christos 	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
    781  1.1  christos 	    return lasttok = ENDWORD;
    782  1.1  christos 	  goto normal_char;
    783  1.1  christos 
    784  1.1  christos 	case 'b':
    785  1.1  christos 	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
    786  1.1  christos 	    return lasttok = LIMWORD;
    787  1.1  christos 	  goto normal_char;
    788  1.1  christos 
    789  1.1  christos 	case 'B':
    790  1.1  christos 	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
    791  1.1  christos 	    return lasttok = NOTLIMWORD;
    792  1.1  christos 	  goto normal_char;
    793  1.1  christos 
    794  1.1  christos 	case '?':
    795  1.1  christos 	  if (syntax_bits & RE_LIMITED_OPS)
    796  1.1  christos 	    goto normal_char;
    797  1.1  christos 	  if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
    798  1.1  christos 	    goto normal_char;
    799  1.1  christos 	  if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
    800  1.1  christos 	    goto normal_char;
    801  1.1  christos 	  return lasttok = QMARK;
    802  1.1  christos 
    803  1.1  christos 	case '*':
    804  1.1  christos 	  if (backslash)
    805  1.1  christos 	    goto normal_char;
    806  1.1  christos 	  if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
    807  1.1  christos 	    goto normal_char;
    808  1.1  christos 	  return lasttok = STAR;
    809  1.1  christos 
    810  1.1  christos 	case '+':
    811  1.1  christos 	  if (syntax_bits & RE_LIMITED_OPS)
    812  1.1  christos 	    goto normal_char;
    813  1.1  christos 	  if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
    814  1.1  christos 	    goto normal_char;
    815  1.1  christos 	  if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
    816  1.1  christos 	    goto normal_char;
    817  1.1  christos 	  return lasttok = PLUS;
    818  1.1  christos 
    819  1.1  christos 	case '{':
    820  1.1  christos 	  if (!(syntax_bits & RE_INTERVALS))
    821  1.1  christos 	    goto normal_char;
    822  1.1  christos 	  if (backslash != ((syntax_bits & RE_NO_BK_BRACES) == 0))
    823  1.1  christos 	    goto normal_char;
    824  1.1  christos 	  if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
    825  1.1  christos 	    goto normal_char;
    826  1.1  christos 
    827  1.1  christos 	  if (syntax_bits & RE_NO_BK_BRACES)
    828  1.1  christos 	    {
    829  1.1  christos 	      /* Scan ahead for a valid interval; if it's not valid,
    830  1.1  christos 		 treat it as a literal '{'.  */
    831  1.1  christos 	      int lo = -1, hi = -1;
    832  1.1  christos 	      char const *p = lexptr;
    833  1.1  christos 	      char const *lim = p + lexleft;
    834  1.1  christos 	      for (;  p != lim && ISASCIIDIGIT (*p);  p++)
    835  1.1  christos 		lo = (lo < 0 ? 0 : lo * 10) + *p - '0';
    836  1.1  christos 	      if (p != lim && *p == ',')
    837  1.1  christos 		while (++p != lim && ISASCIIDIGIT (*p))
    838  1.1  christos 		  hi = (hi < 0 ? 0 : hi * 10) + *p - '0';
    839  1.1  christos 	      else
    840  1.1  christos 		hi = lo;
    841  1.1  christos 	      if (p == lim || *p != '}'
    842  1.1  christos 		  || lo < 0 || RE_DUP_MAX < hi || (0 <= hi && hi < lo))
    843  1.1  christos 		goto normal_char;
    844  1.1  christos 	    }
    845  1.1  christos 
    846  1.1  christos 	  minrep = 0;
    847  1.1  christos 	  /* Cases:
    848  1.1  christos 	     {M} - exact count
    849  1.1  christos 	     {M,} - minimum count, maximum is infinity
    850  1.1  christos 	     {M,N} - M through N */
    851  1.1  christos 	  FETCH(c, _("unfinished repeat count"));
    852  1.1  christos 	  if (ISASCIIDIGIT (c))
    853  1.1  christos 	    {
    854  1.1  christos 	      minrep = c - '0';
    855  1.1  christos 	      for (;;)
    856  1.1  christos 		{
    857  1.1  christos 		  FETCH(c, _("unfinished repeat count"));
    858  1.1  christos 		  if (! ISASCIIDIGIT (c))
    859  1.1  christos 		    break;
    860  1.1  christos 		  minrep = 10 * minrep + c - '0';
    861  1.1  christos 		}
    862  1.1  christos 	    }
    863  1.1  christos 	  else
    864  1.1  christos 	    dfaerror(_("malformed repeat count"));
    865  1.1  christos 	  if (c == ',')
    866  1.1  christos 	    {
    867  1.1  christos 	      FETCH (c, _("unfinished repeat count"));
    868  1.1  christos 	      if (! ISASCIIDIGIT (c))
    869  1.1  christos 		maxrep = -1;
    870  1.1  christos 	      else
    871  1.1  christos 		{
    872  1.1  christos 		  maxrep = c - '0';
    873  1.1  christos 		  for (;;)
    874  1.1  christos 		    {
    875  1.1  christos 		      FETCH (c, _("unfinished repeat count"));
    876  1.1  christos 		      if (! ISASCIIDIGIT (c))
    877  1.1  christos 			break;
    878  1.1  christos 		      maxrep = 10 * maxrep + c - '0';
    879  1.1  christos 		    }
    880  1.1  christos 		  if (0 <= maxrep && maxrep < minrep)
    881  1.1  christos 		    dfaerror (_("malformed repeat count"));
    882  1.1  christos 		}
    883  1.1  christos 	    }
    884  1.1  christos 	  else
    885  1.1  christos 	    maxrep = minrep;
    886  1.1  christos 	  if (!(syntax_bits & RE_NO_BK_BRACES))
    887  1.1  christos 	    {
    888  1.1  christos 	      if (c != '\\')
    889  1.1  christos 		dfaerror(_("malformed repeat count"));
    890  1.1  christos 	      FETCH(c, _("unfinished repeat count"));
    891  1.1  christos 	    }
    892  1.1  christos 	  if (c != '}')
    893  1.1  christos 	    dfaerror(_("malformed repeat count"));
    894  1.1  christos 	  laststart = 0;
    895  1.1  christos 	  return lasttok = REPMN;
    896  1.1  christos 
    897  1.1  christos 	case '|':
    898  1.1  christos 	  if (syntax_bits & RE_LIMITED_OPS)
    899  1.1  christos 	    goto normal_char;
    900  1.1  christos 	  if (backslash != ((syntax_bits & RE_NO_BK_VBAR) == 0))
    901  1.1  christos 	    goto normal_char;
    902  1.1  christos 	  laststart = 1;
    903  1.1  christos 	  return lasttok = OR;
    904  1.1  christos 
    905  1.1  christos 	case '\n':
    906  1.1  christos 	  if (syntax_bits & RE_LIMITED_OPS
    907  1.1  christos 	      || backslash
    908  1.1  christos 	      || !(syntax_bits & RE_NEWLINE_ALT))
    909  1.1  christos 	    goto normal_char;
    910  1.1  christos 	  laststart = 1;
    911  1.1  christos 	  return lasttok = OR;
    912  1.1  christos 
    913  1.1  christos 	case '(':
    914  1.1  christos 	  if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
    915  1.1  christos 	    goto normal_char;
    916  1.1  christos 	  ++parens;
    917  1.1  christos 	  laststart = 1;
    918  1.1  christos 	  return lasttok = LPAREN;
    919  1.1  christos 
    920  1.1  christos 	case ')':
    921  1.1  christos 	  if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
    922  1.1  christos 	    goto normal_char;
    923  1.1  christos 	  if (parens == 0 && syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
    924  1.1  christos 	    goto normal_char;
    925  1.1  christos 	  --parens;
    926  1.1  christos 	  laststart = 0;
    927  1.1  christos 	  return lasttok = RPAREN;
    928  1.1  christos 
    929  1.1  christos 	case '.':
    930  1.1  christos 	  if (backslash)
    931  1.1  christos 	    goto normal_char;
    932  1.1  christos #ifdef MBS_SUPPORT
    933  1.1  christos 	  if (MB_CUR_MAX > 1)
    934  1.1  christos 	    {
    935  1.1  christos 	      /* In multibyte environment period must match with a single
    936  1.1  christos 		 character not a byte.  So we use ANYCHAR.  */
    937  1.1  christos 	      laststart = 0;
    938  1.1  christos 	      return lasttok = ANYCHAR;
    939  1.1  christos 	    }
    940  1.1  christos #endif /* MBS_SUPPORT */
    941  1.1  christos 	  zeroset(ccl);
    942  1.1  christos 	  notset(ccl);
    943  1.1  christos 	  if (!(syntax_bits & RE_DOT_NEWLINE))
    944  1.1  christos 	    clrbit(eolbyte, ccl);
    945  1.1  christos 	  if (syntax_bits & RE_DOT_NOT_NULL)
    946  1.1  christos 	    clrbit('\0', ccl);
    947  1.1  christos 	  laststart = 0;
    948  1.1  christos 	  return lasttok = CSET + charclass_index(ccl);
    949  1.1  christos 
    950  1.1  christos 	case 'w':
    951  1.1  christos 	case 'W':
    952  1.1  christos 	  if (!backslash || (syntax_bits & RE_NO_GNU_OPS))
    953  1.1  christos 	    goto normal_char;
    954  1.1  christos 	  zeroset(ccl);
    955  1.1  christos 	  for (c2 = 0; c2 < NOTCHAR; ++c2)
    956  1.1  christos 	    if (IS_WORD_CONSTITUENT(c2))
    957  1.1  christos 	      setbit(c2, ccl);
    958  1.1  christos 	  if (c == 'W')
    959  1.1  christos 	    notset(ccl);
    960  1.1  christos 	  laststart = 0;
    961  1.1  christos 	  return lasttok = CSET + charclass_index(ccl);
    962  1.1  christos 
    963  1.1  christos 	case '[':
    964  1.1  christos 	  if (backslash)
    965  1.1  christos 	    goto normal_char;
    966  1.1  christos 	  laststart = 0;
    967  1.1  christos #ifdef MBS_SUPPORT
    968  1.1  christos 	  if (MB_CUR_MAX > 1)
    969  1.1  christos 	    {
    970  1.1  christos 	      /* In multibyte environment a bracket expression may contain
    971  1.1  christos 		 multibyte characters, which must be treated as characters
    972  1.1  christos 		 (not bytes).  So we parse it by parse_bracket_exp_mb().  */
    973  1.1  christos 	      parse_bracket_exp_mb();
    974  1.1  christos 	      return lasttok = MBCSET;
    975  1.1  christos 	    }
    976  1.1  christos #endif
    977  1.1  christos 	  zeroset(ccl);
    978  1.1  christos 	  FETCH(c, _("Unbalanced ["));
    979  1.1  christos 	  if (c == '^')
    980  1.1  christos 	    {
    981  1.1  christos 	      FETCH(c, _("Unbalanced ["));
    982  1.1  christos 	      invert = 1;
    983  1.1  christos 	    }
    984  1.1  christos 	  else
    985  1.1  christos 	    invert = 0;
    986  1.1  christos 	  do
    987  1.1  christos 	    {
    988  1.1  christos 	      /* Nobody ever said this had to be fast. :-)
    989  1.1  christos 		 Note that if we're looking at some other [:...:]
    990  1.1  christos 		 construct, we just treat it as a bunch of ordinary
    991  1.1  christos 		 characters.  We can do this because we assume
    992  1.1  christos 		 regex has checked for syntax errors before
    993  1.1  christos 		 dfa is ever called. */
    994  1.1  christos 	      if (c == '[' && (syntax_bits & RE_CHAR_CLASSES))
    995  1.1  christos 		for (c1 = 0; prednames[c1].name; ++c1)
    996  1.1  christos 		  if (looking_at(prednames[c1].name))
    997  1.1  christos 		    {
    998  1.1  christos 		      int (*pred) (int) = prednames[c1].pred;
    999  1.1  christos 
   1000  1.1  christos 		      for (c2 = 0; c2 < NOTCHAR; ++c2)
   1001  1.1  christos 			if ((*pred)(c2))
   1002  1.1  christos 			  setbit_case_fold (c2, ccl);
   1003  1.1  christos 		      lexptr += strlen(prednames[c1].name);
   1004  1.1  christos 		      lexleft -= strlen(prednames[c1].name);
   1005  1.1  christos 		      FETCH(c1, _("Unbalanced ["));
   1006  1.1  christos 		      goto skip;
   1007  1.1  christos 		    }
   1008  1.1  christos 	      if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
   1009  1.1  christos 		FETCH(c, _("Unbalanced ["));
   1010  1.1  christos 	      FETCH(c1, _("Unbalanced ["));
   1011  1.1  christos 	      if (c1 == '-')
   1012  1.1  christos 		{
   1013  1.1  christos 		  FETCH(c2, _("Unbalanced ["));
   1014  1.1  christos 		  if (c2 == ']')
   1015  1.1  christos 		    {
   1016  1.1  christos 		      /* In the case [x-], the - is an ordinary hyphen,
   1017  1.1  christos 			 which is left in c1, the lookahead character. */
   1018  1.1  christos 		      --lexptr;
   1019  1.1  christos 		      ++lexleft;
   1020  1.1  christos 		    }
   1021  1.1  christos 		  else
   1022  1.1  christos 		    {
   1023  1.1  christos 		      if (c2 == '\\'
   1024  1.1  christos 			  && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
   1025  1.1  christos 			FETCH(c2, _("Unbalanced ["));
   1026  1.1  christos 		      FETCH(c1, _("Unbalanced ["));
   1027  1.1  christos 		      if (!hard_LC_COLLATE) {
   1028  1.1  christos 		        for (; c <= c2; c++)
   1029  1.1  christos 			  setbit_case_fold (c, ccl);
   1030  1.1  christos 		      } else {
   1031  1.1  christos 			/* POSIX locales are painful - leave the decision to libc */
   1032  1.1  christos 			regex_t re;
   1033  1.1  christos 			char expr[6]; /* = { '[', c, '-', c2, ']', '\0' }; */
   1034  1.1  christos 
   1035  1.1  christos 			expr[0] = '['; expr[1] = c; expr[2] = '-';
   1036  1.1  christos 			expr[3] = c2; expr[4] = ']'; expr[5] = '\0';
   1037  1.1  christos 			if (regcomp (&re, expr, case_fold ? REG_ICASE : 0) == REG_NOERROR) {
   1038  1.1  christos 			  for (c = 0; c < NOTCHAR; ++c) {
   1039  1.1  christos 			    regmatch_t mat;
   1040  1.1  christos 			    char buf[2]; /* = { c, '\0' }; */
   1041  1.1  christos 
   1042  1.1  christos 			    buf[0] = c; buf[1] = '\0';
   1043  1.1  christos 			    if (regexec (&re, buf, 1, &mat, 0) == REG_NOERROR
   1044  1.1  christos                                && mat.rm_so == 0 && mat.rm_eo == 1)
   1045  1.1  christos                               setbit_case_fold (c, ccl);
   1046  1.1  christos 			  }
   1047  1.1  christos 			  regfree (&re);
   1048  1.1  christos 			}
   1049  1.1  christos 		      }
   1050  1.1  christos 		      continue;
   1051  1.1  christos 		    }
   1052  1.1  christos 		}
   1053  1.1  christos 
   1054  1.1  christos 	      setbit_case_fold (c, ccl);
   1055  1.1  christos 
   1056  1.1  christos 	    skip:
   1057  1.1  christos 	      ;
   1058  1.1  christos 	    }
   1059  1.1  christos 	  while ((c = c1) != ']');
   1060  1.1  christos 	  if (invert)
   1061  1.1  christos 	    {
   1062  1.1  christos 	      notset(ccl);
   1063  1.1  christos 	      if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
   1064  1.1  christos 		clrbit(eolbyte, ccl);
   1065  1.1  christos 	    }
   1066  1.1  christos 	  return lasttok = CSET + charclass_index(ccl);
   1067  1.1  christos 
   1068  1.1  christos 	default:
   1069  1.1  christos 	normal_char:
   1070  1.1  christos 	  laststart = 0;
   1071  1.1  christos 	  if (case_fold && ISALPHA(c))
   1072  1.1  christos 	    {
   1073  1.1  christos 	      zeroset(ccl);
   1074  1.1  christos 	      setbit_case_fold (c, ccl);
   1075  1.1  christos 	      return lasttok = CSET + charclass_index(ccl);
   1076  1.1  christos 	    }
   1077  1.1  christos 	  return c;
   1078  1.1  christos 	}
   1079  1.1  christos     }
   1080  1.1  christos 
   1081  1.1  christos   /* The above loop should consume at most a backslash
   1082  1.1  christos      and some other character. */
   1083  1.1  christos   abort();
   1084  1.1  christos   return END;	/* keeps pedantic compilers happy. */
   1085  1.1  christos }
   1086  1.1  christos 
   1087  1.1  christos /* Recursive descent parser for regular expressions. */
   1088  1.1  christos 
   1089  1.1  christos static token tok;		/* Lookahead token. */
   1090  1.1  christos static int depth;		/* Current depth of a hypothetical stack
   1091  1.1  christos 				   holding deferred productions.  This is
   1092  1.1  christos 				   used to determine the depth that will be
   1093  1.1  christos 				   required of the real stack later on in
   1094  1.1  christos 				   dfaanalyze(). */
   1095  1.1  christos 
   1096  1.1  christos /* Add the given token to the parse tree, maintaining the depth count and
   1097  1.1  christos    updating the maximum depth if necessary. */
   1098  1.1  christos static void
   1099  1.1  christos addtok (token t)
   1100  1.1  christos {
   1101  1.1  christos #ifdef MBS_SUPPORT
   1102  1.1  christos   if (MB_CUR_MAX > 1)
   1103  1.1  christos     {
   1104  1.1  christos       REALLOC_IF_NECESSARY(dfa->multibyte_prop, int, dfa->nmultibyte_prop,
   1105  1.1  christos 			   dfa->tindex);
   1106  1.1  christos       /* Set dfa->multibyte_prop.  See struct dfa in dfa.h.  */
   1107  1.1  christos       if (t == MBCSET)
   1108  1.1  christos 	dfa->multibyte_prop[dfa->tindex] = ((dfa->nmbcsets - 1) << 2) + 3;
   1109  1.1  christos       else if (t < NOTCHAR)
   1110  1.1  christos 	dfa->multibyte_prop[dfa->tindex]
   1111  1.1  christos 	  = (cur_mb_len == 1)? 3 /* single-byte char */
   1112  1.1  christos 	  : (((cur_mb_index == 1)? 1 : 0) /* 1st-byte of multibyte char */
   1113  1.1  christos 	     + ((cur_mb_index == cur_mb_len)? 2 : 0)); /* last-byte */
   1114  1.1  christos       else
   1115  1.1  christos 	/* It may be unnecesssary, but it is safer to treat other
   1116  1.1  christos 	   symbols as singlebyte characters.  */
   1117  1.1  christos 	dfa->multibyte_prop[dfa->tindex] = 3;
   1118  1.1  christos     }
   1119  1.1  christos #endif
   1120  1.1  christos 
   1121  1.1  christos   REALLOC_IF_NECESSARY(dfa->tokens, token, dfa->talloc, dfa->tindex);
   1122  1.1  christos   dfa->tokens[dfa->tindex++] = t;
   1123  1.1  christos 
   1124  1.1  christos   switch (t)
   1125  1.1  christos     {
   1126  1.1  christos     case QMARK:
   1127  1.1  christos     case STAR:
   1128  1.1  christos     case PLUS:
   1129  1.1  christos       break;
   1130  1.1  christos 
   1131  1.1  christos     case CAT:
   1132  1.1  christos     case OR:
   1133  1.1  christos     case ORTOP:
   1134  1.1  christos       --depth;
   1135  1.1  christos       break;
   1136  1.1  christos 
   1137  1.1  christos     default:
   1138  1.1  christos       ++dfa->nleaves;
   1139  1.1  christos     case EMPTY:
   1140  1.1  christos       ++depth;
   1141  1.1  christos       break;
   1142  1.1  christos     }
   1143  1.1  christos   if (depth > dfa->depth)
   1144  1.1  christos     dfa->depth = depth;
   1145  1.1  christos }
   1146  1.1  christos 
   1147  1.1  christos /* The grammar understood by the parser is as follows.
   1148  1.1  christos 
   1149  1.1  christos    regexp:
   1150  1.1  christos      regexp OR branch
   1151  1.1  christos      branch
   1152  1.1  christos 
   1153  1.1  christos    branch:
   1154  1.1  christos      branch closure
   1155  1.1  christos      closure
   1156  1.1  christos 
   1157  1.1  christos    closure:
   1158  1.1  christos      closure QMARK
   1159  1.1  christos      closure STAR
   1160  1.1  christos      closure PLUS
   1161  1.1  christos      closure REPMN
   1162  1.1  christos      atom
   1163  1.1  christos 
   1164  1.1  christos    atom:
   1165  1.1  christos      <normal character>
   1166  1.1  christos      <multibyte character>
   1167  1.1  christos      ANYCHAR
   1168  1.1  christos      MBCSET
   1169  1.1  christos      CSET
   1170  1.1  christos      BACKREF
   1171  1.1  christos      BEGLINE
   1172  1.1  christos      ENDLINE
   1173  1.1  christos      BEGWORD
   1174  1.1  christos      ENDWORD
   1175  1.1  christos      LIMWORD
   1176  1.1  christos      NOTLIMWORD
   1177  1.1  christos      CRANGE
   1178  1.1  christos      LPAREN regexp RPAREN
   1179  1.1  christos      <empty>
   1180  1.1  christos 
   1181  1.1  christos    The parser builds a parse tree in postfix form in an array of tokens. */
   1182  1.1  christos 
   1183  1.1  christos static void
   1184  1.1  christos atom (void)
   1185  1.1  christos {
   1186  1.1  christos   if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
   1187  1.1  christos       || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD
   1188  1.1  christos #ifdef MBS_SUPPORT
   1189  1.1  christos       || tok == ANYCHAR || tok == MBCSET /* MB_CUR_MAX > 1 */
   1190  1.1  christos #endif /* MBS_SUPPORT */
   1191  1.1  christos       || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD)
   1192  1.1  christos     {
   1193  1.1  christos       addtok(tok);
   1194  1.1  christos       tok = lex();
   1195  1.1  christos #ifdef MBS_SUPPORT
   1196  1.1  christos       /* We treat a multibyte character as a single atom, so that DFA
   1197  1.1  christos 	 can treat a multibyte character as a single expression.
   1198  1.1  christos 
   1199  1.1  christos          e.g. We construct following tree from "<mb1><mb2>".
   1200  1.1  christos               <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
   1201  1.1  christos               <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT>
   1202  1.1  christos       */
   1203  1.1  christos       if (MB_CUR_MAX > 1)
   1204  1.1  christos 	{
   1205  1.1  christos 	  while (cur_mb_index > 1 && tok >= 0 && tok < NOTCHAR)
   1206  1.1  christos 	    {
   1207  1.1  christos 	      addtok(tok);
   1208  1.1  christos 	      addtok(CAT);
   1209  1.1  christos 	      tok = lex();
   1210  1.1  christos 	    }
   1211  1.1  christos 	}
   1212  1.1  christos #endif /* MBS_SUPPORT  */
   1213  1.1  christos     }
   1214  1.1  christos   else if (tok == CRANGE)
   1215  1.1  christos     {
   1216  1.1  christos       /* A character range like "[a-z]" in a locale other than "C" or
   1217  1.1  christos 	 "POSIX".  This range might any sequence of one or more
   1218  1.1  christos 	 characters.  Unfortunately the POSIX locale primitives give
   1219  1.1  christos 	 us no practical way to find what character sequences might be
   1220  1.1  christos 	 matched.  Treat this approximately like "(.\1)" -- i.e. match
   1221  1.1  christos 	 one character, and then punt to the full matcher.  */
   1222  1.1  christos       charclass ccl;
   1223  1.1  christos       zeroset (ccl);
   1224  1.1  christos       notset (ccl);
   1225  1.1  christos       addtok (CSET + charclass_index (ccl));
   1226  1.1  christos       addtok (BACKREF);
   1227  1.1  christos       addtok (CAT);
   1228  1.1  christos       tok = lex ();
   1229  1.1  christos     }
   1230  1.1  christos   else if (tok == LPAREN)
   1231  1.1  christos     {
   1232  1.1  christos       tok = lex();
   1233  1.1  christos       regexp(0);
   1234  1.1  christos       if (tok != RPAREN)
   1235  1.1  christos 	dfaerror(_("Unbalanced ("));
   1236  1.1  christos       tok = lex();
   1237  1.1  christos     }
   1238  1.1  christos   else
   1239  1.1  christos     addtok(EMPTY);
   1240  1.1  christos }
   1241  1.1  christos 
   1242  1.1  christos /* Return the number of tokens in the given subexpression. */
   1243  1.1  christos static int
   1244  1.1  christos nsubtoks (int tindex)
   1245  1.1  christos {
   1246  1.1  christos   int ntoks1;
   1247  1.1  christos 
   1248  1.1  christos   switch (dfa->tokens[tindex - 1])
   1249  1.1  christos     {
   1250  1.1  christos     default:
   1251  1.1  christos       return 1;
   1252  1.1  christos     case QMARK:
   1253  1.1  christos     case STAR:
   1254  1.1  christos     case PLUS:
   1255  1.1  christos       return 1 + nsubtoks(tindex - 1);
   1256  1.1  christos     case CAT:
   1257  1.1  christos     case OR:
   1258  1.1  christos     case ORTOP:
   1259  1.1  christos       ntoks1 = nsubtoks(tindex - 1);
   1260  1.1  christos       return 1 + ntoks1 + nsubtoks(tindex - 1 - ntoks1);
   1261  1.1  christos     }
   1262  1.1  christos }
   1263  1.1  christos 
   1264  1.1  christos /* Copy the given subexpression to the top of the tree. */
   1265  1.1  christos static void
   1266  1.1  christos copytoks (int tindex, int ntokens)
   1267  1.1  christos {
   1268  1.1  christos   int i;
   1269  1.1  christos 
   1270  1.1  christos   for (i = 0; i < ntokens; ++i)
   1271  1.1  christos     addtok(dfa->tokens[tindex + i]);
   1272  1.1  christos }
   1273  1.1  christos 
   1274  1.1  christos static void
   1275  1.1  christos closure (void)
   1276  1.1  christos {
   1277  1.1  christos   int tindex, ntokens, i;
   1278  1.1  christos 
   1279  1.1  christos   atom();
   1280  1.1  christos   while (tok == QMARK || tok == STAR || tok == PLUS || tok == REPMN)
   1281  1.1  christos     if (tok == REPMN)
   1282  1.1  christos       {
   1283  1.1  christos 	ntokens = nsubtoks(dfa->tindex);
   1284  1.1  christos 	tindex = dfa->tindex - ntokens;
   1285  1.1  christos 	if (maxrep < 0)
   1286  1.1  christos 	  addtok(PLUS);
   1287  1.1  christos 	if (minrep == 0)
   1288  1.1  christos 	  addtok(QMARK);
   1289  1.1  christos 	for (i = 1; i < minrep; ++i)
   1290  1.1  christos 	  {
   1291  1.1  christos 	    copytoks(tindex, ntokens);
   1292  1.1  christos 	    addtok(CAT);
   1293  1.1  christos 	  }
   1294  1.1  christos 	for (; i < maxrep; ++i)
   1295  1.1  christos 	  {
   1296  1.1  christos 	    copytoks(tindex, ntokens);
   1297  1.1  christos 	    addtok(QMARK);
   1298  1.1  christos 	    addtok(CAT);
   1299  1.1  christos 	  }
   1300  1.1  christos 	tok = lex();
   1301  1.1  christos       }
   1302  1.1  christos     else
   1303  1.1  christos       {
   1304  1.1  christos 	addtok(tok);
   1305  1.1  christos 	tok = lex();
   1306  1.1  christos       }
   1307  1.1  christos }
   1308  1.1  christos 
   1309  1.1  christos static void
   1310  1.1  christos branch (void)
   1311  1.1  christos {
   1312  1.1  christos   closure();
   1313  1.1  christos   while (tok != RPAREN && tok != OR && tok >= 0)
   1314  1.1  christos     {
   1315  1.1  christos       closure();
   1316  1.1  christos       addtok(CAT);
   1317  1.1  christos     }
   1318  1.1  christos }
   1319  1.1  christos 
   1320  1.1  christos static void
   1321  1.1  christos regexp (int toplevel)
   1322  1.1  christos {
   1323  1.1  christos   branch();
   1324  1.1  christos   while (tok == OR)
   1325  1.1  christos     {
   1326  1.1  christos       tok = lex();
   1327  1.1  christos       branch();
   1328  1.1  christos       if (toplevel)
   1329  1.1  christos 	addtok(ORTOP);
   1330  1.1  christos       else
   1331  1.1  christos 	addtok(OR);
   1332  1.1  christos     }
   1333  1.1  christos }
   1334  1.1  christos 
   1335  1.1  christos /* Main entry point for the parser.  S is a string to be parsed, len is the
   1336  1.1  christos    length of the string, so s can include NUL characters.  D is a pointer to
   1337  1.1  christos    the struct dfa to parse into. */
   1338  1.1  christos void
   1339  1.1  christos dfaparse (char const *s, size_t len, struct dfa *d)
   1340  1.1  christos {
   1341  1.1  christos   dfa = d;
   1342  1.1  christos   lexstart = lexptr = s;
   1343  1.1  christos   lexleft = len;
   1344  1.1  christos   lasttok = END;
   1345  1.1  christos   laststart = 1;
   1346  1.1  christos   parens = 0;
   1347  1.1  christos #if ENABLE_NLS
   1348  1.1  christos   hard_LC_COLLATE = hard_locale (LC_COLLATE);
   1349  1.1  christos #endif
   1350  1.1  christos #ifdef MBS_SUPPORT
   1351  1.1  christos   if (MB_CUR_MAX > 1)
   1352  1.1  christos     {
   1353  1.1  christos       cur_mb_index = 0;
   1354  1.1  christos       cur_mb_len = 0;
   1355  1.1  christos       memset(&mbs, 0, sizeof(mbstate_t));
   1356  1.1  christos     }
   1357  1.1  christos #endif /* MBS_SUPPORT  */
   1358  1.1  christos 
   1359  1.1  christos   if (! syntax_bits_set)
   1360  1.1  christos     dfaerror(_("No syntax specified"));
   1361  1.1  christos 
   1362  1.1  christos   tok = lex();
   1363  1.1  christos   depth = d->depth;
   1364  1.1  christos 
   1365  1.1  christos   regexp(1);
   1366  1.1  christos 
   1367  1.1  christos   if (tok != END)
   1368  1.1  christos     dfaerror(_("Unbalanced )"));
   1369  1.1  christos 
   1370  1.1  christos   addtok(END - d->nregexps);
   1371  1.1  christos   addtok(CAT);
   1372  1.1  christos 
   1373  1.1  christos   if (d->nregexps)
   1374  1.1  christos     addtok(ORTOP);
   1375  1.1  christos 
   1376  1.1  christos   ++d->nregexps;
   1377  1.1  christos }
   1378  1.1  christos 
   1379  1.1  christos /* Some primitives for operating on sets of positions. */
   1380  1.1  christos 
   1381  1.1  christos /* Copy one set to another; the destination must be large enough. */
   1382  1.1  christos static void
   1383  1.1  christos copy (position_set const *src, position_set *dst)
   1384  1.1  christos {
   1385  1.1  christos   int i;
   1386  1.1  christos 
   1387  1.1  christos   for (i = 0; i < src->nelem; ++i)
   1388  1.1  christos     dst->elems[i] = src->elems[i];
   1389  1.1  christos   dst->nelem = src->nelem;
   1390  1.1  christos }
   1391  1.1  christos 
   1392  1.1  christos /* Insert a position in a set.  Position sets are maintained in sorted
   1393  1.1  christos    order according to index.  If position already exists in the set with
   1394  1.1  christos    the same index then their constraints are logically or'd together.
   1395  1.1  christos    S->elems must point to an array large enough to hold the resulting set. */
   1396  1.1  christos static void
   1397  1.1  christos insert (position p, position_set *s)
   1398  1.1  christos {
   1399  1.1  christos   int i;
   1400  1.1  christos   position t1, t2;
   1401  1.1  christos 
   1402  1.1  christos   for (i = 0; i < s->nelem && p.index < s->elems[i].index; ++i)
   1403  1.1  christos     continue;
   1404  1.1  christos   if (i < s->nelem && p.index == s->elems[i].index)
   1405  1.1  christos     s->elems[i].constraint |= p.constraint;
   1406  1.1  christos   else
   1407  1.1  christos     {
   1408  1.1  christos       t1 = p;
   1409  1.1  christos       ++s->nelem;
   1410  1.1  christos       while (i < s->nelem)
   1411  1.1  christos 	{
   1412  1.1  christos 	  t2 = s->elems[i];
   1413  1.1  christos 	  s->elems[i++] = t1;
   1414  1.1  christos 	  t1 = t2;
   1415  1.1  christos 	}
   1416  1.1  christos     }
   1417  1.1  christos }
   1418  1.1  christos 
   1419  1.1  christos /* Merge two sets of positions into a third.  The result is exactly as if
   1420  1.1  christos    the positions of both sets were inserted into an initially empty set. */
   1421  1.1  christos static void
   1422  1.1  christos merge (position_set const *s1, position_set const *s2, position_set *m)
   1423  1.1  christos {
   1424  1.1  christos   int i = 0, j = 0;
   1425  1.1  christos 
   1426  1.1  christos   m->nelem = 0;
   1427  1.1  christos   while (i < s1->nelem && j < s2->nelem)
   1428  1.1  christos     if (s1->elems[i].index > s2->elems[j].index)
   1429  1.1  christos       m->elems[m->nelem++] = s1->elems[i++];
   1430  1.1  christos     else if (s1->elems[i].index < s2->elems[j].index)
   1431  1.1  christos       m->elems[m->nelem++] = s2->elems[j++];
   1432  1.1  christos     else
   1433  1.1  christos       {
   1434  1.1  christos 	m->elems[m->nelem] = s1->elems[i++];
   1435  1.1  christos 	m->elems[m->nelem++].constraint |= s2->elems[j++].constraint;
   1436  1.1  christos       }
   1437  1.1  christos   while (i < s1->nelem)
   1438  1.1  christos     m->elems[m->nelem++] = s1->elems[i++];
   1439  1.1  christos   while (j < s2->nelem)
   1440  1.1  christos     m->elems[m->nelem++] = s2->elems[j++];
   1441  1.1  christos }
   1442  1.1  christos 
   1443  1.1  christos /* Delete a position from a set. */
   1444  1.1  christos static void
   1445  1.1  christos delete (position p, position_set *s)
   1446  1.1  christos {
   1447  1.1  christos   int i;
   1448  1.1  christos 
   1449  1.1  christos   for (i = 0; i < s->nelem; ++i)
   1450  1.1  christos     if (p.index == s->elems[i].index)
   1451  1.1  christos       break;
   1452  1.1  christos   if (i < s->nelem)
   1453  1.1  christos     for (--s->nelem; i < s->nelem; ++i)
   1454  1.1  christos       s->elems[i] = s->elems[i + 1];
   1455  1.1  christos }
   1456  1.1  christos 
   1457  1.1  christos /* Find the index of the state corresponding to the given position set with
   1458  1.1  christos    the given preceding context, or create a new state if there is no such
   1459  1.1  christos    state.  Newline and letter tell whether we got here on a newline or
   1460  1.1  christos    letter, respectively. */
   1461  1.1  christos static int
   1462  1.1  christos state_index (struct dfa *d, position_set const *s, int newline, int letter)
   1463  1.1  christos {
   1464  1.1  christos   int hash = 0;
   1465  1.1  christos   int constraint;
   1466  1.1  christos   int i, j;
   1467  1.1  christos 
   1468  1.1  christos   newline = newline ? 1 : 0;
   1469  1.1  christos   letter = letter ? 1 : 0;
   1470  1.1  christos 
   1471  1.1  christos   for (i = 0; i < s->nelem; ++i)
   1472  1.1  christos     hash ^= s->elems[i].index + s->elems[i].constraint;
   1473  1.1  christos 
   1474  1.1  christos   /* Try to find a state that exactly matches the proposed one. */
   1475  1.1  christos   for (i = 0; i < d->sindex; ++i)
   1476  1.1  christos     {
   1477  1.1  christos       if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem
   1478  1.1  christos 	  || newline != d->states[i].newline || letter != d->states[i].letter)
   1479  1.1  christos 	continue;
   1480  1.1  christos       for (j = 0; j < s->nelem; ++j)
   1481  1.1  christos 	if (s->elems[j].constraint
   1482  1.1  christos 	    != d->states[i].elems.elems[j].constraint
   1483  1.1  christos 	    || s->elems[j].index != d->states[i].elems.elems[j].index)
   1484  1.1  christos 	  break;
   1485  1.1  christos       if (j == s->nelem)
   1486  1.1  christos 	return i;
   1487  1.1  christos     }
   1488  1.1  christos 
   1489  1.1  christos   /* We'll have to create a new state. */
   1490  1.1  christos   REALLOC_IF_NECESSARY(d->states, dfa_state, d->salloc, d->sindex);
   1491  1.1  christos   d->states[i].hash = hash;
   1492  1.1  christos   MALLOC(d->states[i].elems.elems, position, s->nelem);
   1493  1.1  christos   copy(s, &d->states[i].elems);
   1494  1.1  christos   d->states[i].newline = newline;
   1495  1.1  christos   d->states[i].letter = letter;
   1496  1.1  christos   d->states[i].backref = 0;
   1497  1.1  christos   d->states[i].constraint = 0;
   1498  1.1  christos   d->states[i].first_end = 0;
   1499  1.1  christos #ifdef MBS_SUPPORT
   1500  1.1  christos   if (MB_CUR_MAX > 1)
   1501  1.1  christos     d->states[i].mbps.nelem = 0;
   1502  1.1  christos #endif
   1503  1.1  christos   for (j = 0; j < s->nelem; ++j)
   1504  1.1  christos     if (d->tokens[s->elems[j].index] < 0)
   1505  1.1  christos       {
   1506  1.1  christos 	constraint = s->elems[j].constraint;
   1507  1.1  christos 	if (SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 0)
   1508  1.1  christos 	    || SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 1)
   1509  1.1  christos 	    || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 0)
   1510  1.1  christos 	    || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 1))
   1511  1.1  christos 	  d->states[i].constraint |= constraint;
   1512  1.1  christos 	if (! d->states[i].first_end)
   1513  1.1  christos 	  d->states[i].first_end = d->tokens[s->elems[j].index];
   1514  1.1  christos       }
   1515  1.1  christos     else if (d->tokens[s->elems[j].index] == BACKREF)
   1516  1.1  christos       {
   1517  1.1  christos 	d->states[i].constraint = NO_CONSTRAINT;
   1518  1.1  christos 	d->states[i].backref = 1;
   1519  1.1  christos       }
   1520  1.1  christos 
   1521  1.1  christos   ++d->sindex;
   1522  1.1  christos 
   1523  1.1  christos   return i;
   1524  1.1  christos }
   1525  1.1  christos 
   1526  1.1  christos /* Find the epsilon closure of a set of positions.  If any position of the set
   1527  1.1  christos    contains a symbol that matches the empty string in some context, replace
   1528  1.1  christos    that position with the elements of its follow labeled with an appropriate
   1529  1.1  christos    constraint.  Repeat exhaustively until no funny positions are left.
   1530  1.1  christos    S->elems must be large enough to hold the result. */
   1531  1.1  christos static void
   1532  1.1  christos epsclosure (position_set *s, struct dfa const *d)
   1533  1.1  christos {
   1534  1.1  christos   int i, j;
   1535  1.1  christos   int *visited;
   1536  1.1  christos   position p, old;
   1537  1.1  christos 
   1538  1.1  christos   MALLOC(visited, int, d->tindex);
   1539  1.1  christos   for (i = 0; i < d->tindex; ++i)
   1540  1.1  christos     visited[i] = 0;
   1541  1.1  christos 
   1542  1.1  christos   for (i = 0; i < s->nelem; ++i)
   1543  1.1  christos     if (d->tokens[s->elems[i].index] >= NOTCHAR
   1544  1.1  christos 	&& d->tokens[s->elems[i].index] != BACKREF
   1545  1.1  christos #ifdef MBS_SUPPORT
   1546  1.1  christos 	&& d->tokens[s->elems[i].index] != ANYCHAR
   1547  1.1  christos 	&& d->tokens[s->elems[i].index] != MBCSET
   1548  1.1  christos #endif
   1549  1.1  christos 	&& d->tokens[s->elems[i].index] < CSET)
   1550  1.1  christos       {
   1551  1.1  christos 	old = s->elems[i];
   1552  1.1  christos 	p.constraint = old.constraint;
   1553  1.1  christos 	delete(s->elems[i], s);
   1554  1.1  christos 	if (visited[old.index])
   1555  1.1  christos 	  {
   1556  1.1  christos 	    --i;
   1557  1.1  christos 	    continue;
   1558  1.1  christos 	  }
   1559  1.1  christos 	visited[old.index] = 1;
   1560  1.1  christos 	switch (d->tokens[old.index])
   1561  1.1  christos 	  {
   1562  1.1  christos 	  case BEGLINE:
   1563  1.1  christos 	    p.constraint &= BEGLINE_CONSTRAINT;
   1564  1.1  christos 	    break;
   1565  1.1  christos 	  case ENDLINE:
   1566  1.1  christos 	    p.constraint &= ENDLINE_CONSTRAINT;
   1567  1.1  christos 	    break;
   1568  1.1  christos 	  case BEGWORD:
   1569  1.1  christos 	    p.constraint &= BEGWORD_CONSTRAINT;
   1570  1.1  christos 	    break;
   1571  1.1  christos 	  case ENDWORD:
   1572  1.1  christos 	    p.constraint &= ENDWORD_CONSTRAINT;
   1573  1.1  christos 	    break;
   1574  1.1  christos 	  case LIMWORD:
   1575  1.1  christos 	    p.constraint &= LIMWORD_CONSTRAINT;
   1576  1.1  christos 	    break;
   1577  1.1  christos 	  case NOTLIMWORD:
   1578  1.1  christos 	    p.constraint &= NOTLIMWORD_CONSTRAINT;
   1579  1.1  christos 	    break;
   1580  1.1  christos 	  default:
   1581  1.1  christos 	    break;
   1582  1.1  christos 	  }
   1583  1.1  christos 	for (j = 0; j < d->follows[old.index].nelem; ++j)
   1584  1.1  christos 	  {
   1585  1.1  christos 	    p.index = d->follows[old.index].elems[j].index;
   1586  1.1  christos 	    insert(p, s);
   1587  1.1  christos 	  }
   1588  1.1  christos 	/* Force rescan to start at the beginning. */
   1589  1.1  christos 	i = -1;
   1590  1.1  christos       }
   1591  1.1  christos 
   1592  1.1  christos   free(visited);
   1593  1.1  christos }
   1594  1.1  christos 
   1595  1.1  christos /* Perform bottom-up analysis on the parse tree, computing various functions.
   1596  1.1  christos    Note that at this point, we're pretending constructs like \< are real
   1597  1.1  christos    characters rather than constraints on what can follow them.
   1598  1.1  christos 
   1599  1.1  christos    Nullable:  A node is nullable if it is at the root of a regexp that can
   1600  1.1  christos    match the empty string.
   1601  1.1  christos    *  EMPTY leaves are nullable.
   1602  1.1  christos    * No other leaf is nullable.
   1603  1.1  christos    * A QMARK or STAR node is nullable.
   1604  1.1  christos    * A PLUS node is nullable if its argument is nullable.
   1605  1.1  christos    * A CAT node is nullable if both its arguments are nullable.
   1606  1.1  christos    * An OR node is nullable if either argument is nullable.
   1607  1.1  christos 
   1608  1.1  christos    Firstpos:  The firstpos of a node is the set of positions (nonempty leaves)
   1609  1.1  christos    that could correspond to the first character of a string matching the
   1610  1.1  christos    regexp rooted at the given node.
   1611  1.1  christos    * EMPTY leaves have empty firstpos.
   1612  1.1  christos    * The firstpos of a nonempty leaf is that leaf itself.
   1613  1.1  christos    * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
   1614  1.1  christos      argument.
   1615  1.1  christos    * The firstpos of a CAT node is the firstpos of the left argument, union
   1616  1.1  christos      the firstpos of the right if the left argument is nullable.
   1617  1.1  christos    * The firstpos of an OR node is the union of firstpos of each argument.
   1618  1.1  christos 
   1619  1.1  christos    Lastpos:  The lastpos of a node is the set of positions that could
   1620  1.1  christos    correspond to the last character of a string matching the regexp at
   1621  1.1  christos    the given node.
   1622  1.1  christos    * EMPTY leaves have empty lastpos.
   1623  1.1  christos    * The lastpos of a nonempty leaf is that leaf itself.
   1624  1.1  christos    * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
   1625  1.1  christos      argument.
   1626  1.1  christos    * The lastpos of a CAT node is the lastpos of its right argument, union
   1627  1.1  christos      the lastpos of the left if the right argument is nullable.
   1628  1.1  christos    * The lastpos of an OR node is the union of the lastpos of each argument.
   1629  1.1  christos 
   1630  1.1  christos    Follow:  The follow of a position is the set of positions that could
   1631  1.1  christos    correspond to the character following a character matching the node in
   1632  1.1  christos    a string matching the regexp.  At this point we consider special symbols
   1633  1.1  christos    that match the empty string in some context to be just normal characters.
   1634  1.1  christos    Later, if we find that a special symbol is in a follow set, we will
   1635  1.1  christos    replace it with the elements of its follow, labeled with an appropriate
   1636  1.1  christos    constraint.
   1637  1.1  christos    * Every node in the firstpos of the argument of a STAR or PLUS node is in
   1638  1.1  christos      the follow of every node in the lastpos.
   1639  1.1  christos    * Every node in the firstpos of the second argument of a CAT node is in
   1640  1.1  christos      the follow of every node in the lastpos of the first argument.
   1641  1.1  christos 
   1642  1.1  christos    Because of the postfix representation of the parse tree, the depth-first
   1643  1.1  christos    analysis is conveniently done by a linear scan with the aid of a stack.
   1644  1.1  christos    Sets are stored as arrays of the elements, obeying a stack-like allocation
   1645  1.1  christos    scheme; the number of elements in each set deeper in the stack can be
   1646  1.1  christos    used to determine the address of a particular set's array. */
   1647  1.1  christos void
   1648  1.1  christos dfaanalyze (struct dfa *d, int searchflag)
   1649  1.1  christos {
   1650  1.1  christos   int *nullable;		/* Nullable stack. */
   1651  1.1  christos   int *nfirstpos;		/* Element count stack for firstpos sets. */
   1652  1.1  christos   position *firstpos;		/* Array where firstpos elements are stored. */
   1653  1.1  christos   int *nlastpos;		/* Element count stack for lastpos sets. */
   1654  1.1  christos   position *lastpos;		/* Array where lastpos elements are stored. */
   1655  1.1  christos   int *nalloc;			/* Sizes of arrays allocated to follow sets. */
   1656  1.1  christos   position_set tmp;		/* Temporary set for merging sets. */
   1657  1.1  christos   position_set merged;		/* Result of merging sets. */
   1658  1.1  christos   int wants_newline;		/* True if some position wants newline info. */
   1659  1.1  christos   int *o_nullable;
   1660  1.1  christos   int *o_nfirst, *o_nlast;
   1661  1.1  christos   position *o_firstpos, *o_lastpos;
   1662  1.1  christos   int i, j;
   1663  1.1  christos   position *pos;
   1664  1.1  christos 
   1665  1.1  christos #ifdef DEBUG
   1666  1.1  christos   fprintf(stderr, "dfaanalyze:\n");
   1667  1.1  christos   for (i = 0; i < d->tindex; ++i)
   1668  1.1  christos     {
   1669  1.1  christos       fprintf(stderr, " %d:", i);
   1670  1.1  christos       prtok(d->tokens[i]);
   1671  1.1  christos     }
   1672  1.1  christos   putc('\n', stderr);
   1673  1.1  christos #endif
   1674  1.1  christos 
   1675  1.1  christos   d->searchflag = searchflag;
   1676  1.1  christos 
   1677  1.1  christos   MALLOC(nullable, int, d->depth);
   1678  1.1  christos   o_nullable = nullable;
   1679  1.1  christos   MALLOC(nfirstpos, int, d->depth);
   1680  1.1  christos   o_nfirst = nfirstpos;
   1681  1.1  christos   MALLOC(firstpos, position, d->nleaves);
   1682  1.1  christos   o_firstpos = firstpos, firstpos += d->nleaves;
   1683  1.1  christos   MALLOC(nlastpos, int, d->depth);
   1684  1.1  christos   o_nlast = nlastpos;
   1685  1.1  christos   MALLOC(lastpos, position, d->nleaves);
   1686  1.1  christos   o_lastpos = lastpos, lastpos += d->nleaves;
   1687  1.1  christos   MALLOC(nalloc, int, d->tindex);
   1688  1.1  christos   for (i = 0; i < d->tindex; ++i)
   1689  1.1  christos     nalloc[i] = 0;
   1690  1.1  christos   MALLOC(merged.elems, position, d->nleaves);
   1691  1.1  christos 
   1692  1.1  christos   CALLOC(d->follows, position_set, d->tindex);
   1693  1.1  christos 
   1694  1.1  christos   for (i = 0; i < d->tindex; ++i)
   1695  1.1  christos #ifdef DEBUG
   1696  1.1  christos     {				/* Nonsyntactic #ifdef goo... */
   1697  1.1  christos #endif
   1698  1.1  christos     switch (d->tokens[i])
   1699  1.1  christos       {
   1700  1.1  christos       case EMPTY:
   1701  1.1  christos 	/* The empty set is nullable. */
   1702  1.1  christos 	*nullable++ = 1;
   1703  1.1  christos 
   1704  1.1  christos 	/* The firstpos and lastpos of the empty leaf are both empty. */
   1705  1.1  christos 	*nfirstpos++ = *nlastpos++ = 0;
   1706  1.1  christos 	break;
   1707  1.1  christos 
   1708  1.1  christos       case STAR:
   1709  1.1  christos       case PLUS:
   1710  1.1  christos 	/* Every element in the firstpos of the argument is in the follow
   1711  1.1  christos 	   of every element in the lastpos. */
   1712  1.1  christos 	tmp.nelem = nfirstpos[-1];
   1713  1.1  christos 	tmp.elems = firstpos;
   1714  1.1  christos 	pos = lastpos;
   1715  1.1  christos 	for (j = 0; j < nlastpos[-1]; ++j)
   1716  1.1  christos 	  {
   1717  1.1  christos 	    merge(&tmp, &d->follows[pos[j].index], &merged);
   1718  1.1  christos 	    REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
   1719  1.1  christos 				 nalloc[pos[j].index], merged.nelem - 1);
   1720  1.1  christos 	    copy(&merged, &d->follows[pos[j].index]);
   1721  1.1  christos 	  }
   1722  1.1  christos 
   1723  1.1  christos       case QMARK:
   1724  1.1  christos 	/* A QMARK or STAR node is automatically nullable. */
   1725  1.1  christos 	if (d->tokens[i] != PLUS)
   1726  1.1  christos 	  nullable[-1] = 1;
   1727  1.1  christos 	break;
   1728  1.1  christos 
   1729  1.1  christos       case CAT:
   1730  1.1  christos 	/* Every element in the firstpos of the second argument is in the
   1731  1.1  christos 	   follow of every element in the lastpos of the first argument. */
   1732  1.1  christos 	tmp.nelem = nfirstpos[-1];
   1733  1.1  christos 	tmp.elems = firstpos;
   1734  1.1  christos 	pos = lastpos + nlastpos[-1];
   1735  1.1  christos 	for (j = 0; j < nlastpos[-2]; ++j)
   1736  1.1  christos 	  {
   1737  1.1  christos 	    merge(&tmp, &d->follows[pos[j].index], &merged);
   1738  1.1  christos 	    REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
   1739  1.1  christos 				 nalloc[pos[j].index], merged.nelem - 1);
   1740  1.1  christos 	    copy(&merged, &d->follows[pos[j].index]);
   1741  1.1  christos 	  }
   1742  1.1  christos 
   1743  1.1  christos 	/* The firstpos of a CAT node is the firstpos of the first argument,
   1744  1.1  christos 	   union that of the second argument if the first is nullable. */
   1745  1.1  christos 	if (nullable[-2])
   1746  1.1  christos 	  nfirstpos[-2] += nfirstpos[-1];
   1747  1.1  christos 	else
   1748  1.1  christos 	  firstpos += nfirstpos[-1];
   1749  1.1  christos 	--nfirstpos;
   1750  1.1  christos 
   1751  1.1  christos 	/* The lastpos of a CAT node is the lastpos of the second argument,
   1752  1.1  christos 	   union that of the first argument if the second is nullable. */
   1753  1.1  christos 	if (nullable[-1])
   1754  1.1  christos 	  nlastpos[-2] += nlastpos[-1];
   1755  1.1  christos 	else
   1756  1.1  christos 	  {
   1757  1.1  christos 	    pos = lastpos + nlastpos[-2];
   1758  1.1  christos 	    for (j = nlastpos[-1] - 1; j >= 0; --j)
   1759  1.1  christos 	      pos[j] = lastpos[j];
   1760  1.1  christos 	    lastpos += nlastpos[-2];
   1761  1.1  christos 	    nlastpos[-2] = nlastpos[-1];
   1762  1.1  christos 	  }
   1763  1.1  christos 	--nlastpos;
   1764  1.1  christos 
   1765  1.1  christos 	/* A CAT node is nullable if both arguments are nullable. */
   1766  1.1  christos 	nullable[-2] = nullable[-1] && nullable[-2];
   1767  1.1  christos 	--nullable;
   1768  1.1  christos 	break;
   1769  1.1  christos 
   1770  1.1  christos       case OR:
   1771  1.1  christos       case ORTOP:
   1772  1.1  christos 	/* The firstpos is the union of the firstpos of each argument. */
   1773  1.1  christos 	nfirstpos[-2] += nfirstpos[-1];
   1774  1.1  christos 	--nfirstpos;
   1775  1.1  christos 
   1776  1.1  christos 	/* The lastpos is the union of the lastpos of each argument. */
   1777  1.1  christos 	nlastpos[-2] += nlastpos[-1];
   1778  1.1  christos 	--nlastpos;
   1779  1.1  christos 
   1780  1.1  christos 	/* An OR node is nullable if either argument is nullable. */
   1781  1.1  christos 	nullable[-2] = nullable[-1] || nullable[-2];
   1782  1.1  christos 	--nullable;
   1783  1.1  christos 	break;
   1784  1.1  christos 
   1785  1.1  christos       default:
   1786  1.1  christos 	/* Anything else is a nonempty position.  (Note that special
   1787  1.1  christos 	   constructs like \< are treated as nonempty strings here;
   1788  1.1  christos 	   an "epsilon closure" effectively makes them nullable later.
   1789  1.1  christos 	   Backreferences have to get a real position so we can detect
   1790  1.1  christos 	   transitions on them later.  But they are nullable. */
   1791  1.1  christos 	*nullable++ = d->tokens[i] == BACKREF;
   1792  1.1  christos 
   1793  1.1  christos 	/* This position is in its own firstpos and lastpos. */
   1794  1.1  christos 	*nfirstpos++ = *nlastpos++ = 1;
   1795  1.1  christos 	--firstpos, --lastpos;
   1796  1.1  christos 	firstpos->index = lastpos->index = i;
   1797  1.1  christos 	firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
   1798  1.1  christos 
   1799  1.1  christos 	/* Allocate the follow set for this position. */
   1800  1.1  christos 	nalloc[i] = 1;
   1801  1.1  christos 	MALLOC(d->follows[i].elems, position, nalloc[i]);
   1802  1.1  christos 	break;
   1803  1.1  christos       }
   1804  1.1  christos #ifdef DEBUG
   1805  1.1  christos     /* ... balance the above nonsyntactic #ifdef goo... */
   1806  1.1  christos       fprintf(stderr, "node %d:", i);
   1807  1.1  christos       prtok(d->tokens[i]);
   1808  1.1  christos       putc('\n', stderr);
   1809  1.1  christos       fprintf(stderr, nullable[-1] ? " nullable: yes\n" : " nullable: no\n");
   1810  1.1  christos       fprintf(stderr, " firstpos:");
   1811  1.1  christos       for (j = nfirstpos[-1] - 1; j >= 0; --j)
   1812  1.1  christos 	{
   1813  1.1  christos 	  fprintf(stderr, " %d:", firstpos[j].index);
   1814  1.1  christos 	  prtok(d->tokens[firstpos[j].index]);
   1815  1.1  christos 	}
   1816  1.1  christos       fprintf(stderr, "\n lastpos:");
   1817  1.1  christos       for (j = nlastpos[-1] - 1; j >= 0; --j)
   1818  1.1  christos 	{
   1819  1.1  christos 	  fprintf(stderr, " %d:", lastpos[j].index);
   1820  1.1  christos 	  prtok(d->tokens[lastpos[j].index]);
   1821  1.1  christos 	}
   1822  1.1  christos       putc('\n', stderr);
   1823  1.1  christos     }
   1824  1.1  christos #endif
   1825  1.1  christos 
   1826  1.1  christos   /* For each follow set that is the follow set of a real position, replace
   1827  1.1  christos      it with its epsilon closure. */
   1828  1.1  christos   for (i = 0; i < d->tindex; ++i)
   1829  1.1  christos     if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF
   1830  1.1  christos #ifdef MBS_SUPPORT
   1831  1.1  christos         || d->tokens[i] == ANYCHAR
   1832  1.1  christos         || d->tokens[i] == MBCSET
   1833  1.1  christos #endif
   1834  1.1  christos 	|| d->tokens[i] >= CSET)
   1835  1.1  christos       {
   1836  1.1  christos #ifdef DEBUG
   1837  1.1  christos 	fprintf(stderr, "follows(%d:", i);
   1838  1.1  christos 	prtok(d->tokens[i]);
   1839  1.1  christos 	fprintf(stderr, "):");
   1840  1.1  christos 	for (j = d->follows[i].nelem - 1; j >= 0; --j)
   1841  1.1  christos 	  {
   1842  1.1  christos 	    fprintf(stderr, " %d:", d->follows[i].elems[j].index);
   1843  1.1  christos 	    prtok(d->tokens[d->follows[i].elems[j].index]);
   1844  1.1  christos 	  }
   1845  1.1  christos 	putc('\n', stderr);
   1846  1.1  christos #endif
   1847  1.1  christos 	copy(&d->follows[i], &merged);
   1848  1.1  christos 	epsclosure(&merged, d);
   1849  1.1  christos 	if (d->follows[i].nelem < merged.nelem)
   1850  1.1  christos 	  REALLOC(d->follows[i].elems, position, merged.nelem);
   1851  1.1  christos 	copy(&merged, &d->follows[i]);
   1852  1.1  christos       }
   1853  1.1  christos 
   1854  1.1  christos   /* Get the epsilon closure of the firstpos of the regexp.  The result will
   1855  1.1  christos      be the set of positions of state 0. */
   1856  1.1  christos   merged.nelem = 0;
   1857  1.1  christos   for (i = 0; i < nfirstpos[-1]; ++i)
   1858  1.1  christos     insert(firstpos[i], &merged);
   1859  1.1  christos   epsclosure(&merged, d);
   1860  1.1  christos 
   1861  1.1  christos   /* Check if any of the positions of state 0 will want newline context. */
   1862  1.1  christos   wants_newline = 0;
   1863  1.1  christos   for (i = 0; i < merged.nelem; ++i)
   1864  1.1  christos     if (PREV_NEWLINE_DEPENDENT(merged.elems[i].constraint))
   1865  1.1  christos       wants_newline = 1;
   1866  1.1  christos 
   1867  1.1  christos   /* Build the initial state. */
   1868  1.1  christos   d->salloc = 1;
   1869  1.1  christos   d->sindex = 0;
   1870  1.1  christos   MALLOC(d->states, dfa_state, d->salloc);
   1871  1.1  christos   state_index(d, &merged, wants_newline, 0);
   1872  1.1  christos 
   1873  1.1  christos   free(o_nullable);
   1874  1.1  christos   free(o_nfirst);
   1875  1.1  christos   free(o_firstpos);
   1876  1.1  christos   free(o_nlast);
   1877  1.1  christos   free(o_lastpos);
   1878  1.1  christos   free(nalloc);
   1879  1.1  christos   free(merged.elems);
   1880  1.1  christos }
   1881  1.1  christos 
   1882  1.1  christos /* Find, for each character, the transition out of state s of d, and store
   1883  1.1  christos    it in the appropriate slot of trans.
   1884  1.1  christos 
   1885  1.1  christos    We divide the positions of s into groups (positions can appear in more
   1886  1.1  christos    than one group).  Each group is labeled with a set of characters that
   1887  1.1  christos    every position in the group matches (taking into account, if necessary,
   1888  1.1  christos    preceding context information of s).  For each group, find the union
   1889  1.1  christos    of the its elements' follows.  This set is the set of positions of the
   1890  1.1  christos    new state.  For each character in the group's label, set the transition
   1891  1.1  christos    on this character to be to a state corresponding to the set's positions,
   1892  1.1  christos    and its associated backward context information, if necessary.
   1893  1.1  christos 
   1894  1.1  christos    If we are building a searching matcher, we include the positions of state
   1895  1.1  christos    0 in every state.
   1896  1.1  christos 
   1897  1.1  christos    The collection of groups is constructed by building an equivalence-class
   1898  1.1  christos    partition of the positions of s.
   1899  1.1  christos 
   1900  1.1  christos    For each position, find the set of characters C that it matches.  Eliminate
   1901  1.1  christos    any characters from C that fail on grounds of backward context.
   1902  1.1  christos 
   1903  1.1  christos    Search through the groups, looking for a group whose label L has nonempty
   1904  1.1  christos    intersection with C.  If L - C is nonempty, create a new group labeled
   1905  1.1  christos    L - C and having the same positions as the current group, and set L to
   1906  1.1  christos    the intersection of L and C.  Insert the position in this group, set
   1907  1.1  christos    C = C - L, and resume scanning.
   1908  1.1  christos 
   1909  1.1  christos    If after comparing with every group there are characters remaining in C,
   1910  1.1  christos    create a new group labeled with the characters of C and insert this
   1911  1.1  christos    position in that group. */
   1912  1.1  christos void
   1913  1.1  christos dfastate (int s, struct dfa *d, int trans[])
   1914  1.1  christos {
   1915  1.1  christos   position_set grps[NOTCHAR];	/* As many as will ever be needed. */
   1916  1.1  christos   charclass labels[NOTCHAR];	/* Labels corresponding to the groups. */
   1917  1.1  christos   int ngrps = 0;		/* Number of groups actually used. */
   1918  1.1  christos   position pos;			/* Current position being considered. */
   1919  1.1  christos   charclass matches;		/* Set of matching characters. */
   1920  1.1  christos   int matchesf;			/* True if matches is nonempty. */
   1921  1.1  christos   charclass intersect;		/* Intersection with some label set. */
   1922  1.1  christos   int intersectf;		/* True if intersect is nonempty. */
   1923  1.1  christos   charclass leftovers;		/* Stuff in the label that didn't match. */
   1924  1.1  christos   int leftoversf;		/* True if leftovers is nonempty. */
   1925  1.1  christos   static charclass letters;	/* Set of characters considered letters. */
   1926  1.1  christos   static charclass newline;	/* Set of characters that aren't newline. */
   1927  1.1  christos   position_set follows;		/* Union of the follows of some group. */
   1928  1.1  christos   position_set tmp;		/* Temporary space for merging sets. */
   1929  1.1  christos   int state;			/* New state. */
   1930  1.1  christos   int wants_newline;		/* New state wants to know newline context. */
   1931  1.1  christos   int state_newline;		/* New state on a newline transition. */
   1932  1.1  christos   int wants_letter;		/* New state wants to know letter context. */
   1933  1.1  christos   int state_letter;		/* New state on a letter transition. */
   1934  1.1  christos   static int initialized;	/* Flag for static initialization. */
   1935  1.1  christos #ifdef MBS_SUPPORT
   1936  1.1  christos   int next_isnt_1st_byte = 0;	/* Flag If we can't add state0.  */
   1937  1.1  christos #endif
   1938  1.1  christos   int i, j, k;
   1939  1.1  christos 
   1940  1.1  christos   /* Initialize the set of letters, if necessary. */
   1941  1.1  christos   if (! initialized)
   1942  1.1  christos     {
   1943  1.1  christos       initialized = 1;
   1944  1.1  christos       for (i = 0; i < NOTCHAR; ++i)
   1945  1.1  christos 	if (IS_WORD_CONSTITUENT(i))
   1946  1.1  christos 	  setbit(i, letters);
   1947  1.1  christos       setbit(eolbyte, newline);
   1948  1.1  christos     }
   1949  1.1  christos 
   1950  1.1  christos   zeroset(matches);
   1951  1.1  christos 
   1952  1.1  christos   for (i = 0; i < d->states[s].elems.nelem; ++i)
   1953  1.1  christos     {
   1954  1.1  christos       pos = d->states[s].elems.elems[i];
   1955  1.1  christos       if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
   1956  1.1  christos 	setbit(d->tokens[pos.index], matches);
   1957  1.1  christos       else if (d->tokens[pos.index] >= CSET)
   1958  1.1  christos 	copyset(d->charclasses[d->tokens[pos.index] - CSET], matches);
   1959  1.1  christos #ifdef MBS_SUPPORT
   1960  1.1  christos       else if (d->tokens[pos.index] == ANYCHAR
   1961  1.1  christos                || d->tokens[pos.index] == MBCSET)
   1962  1.1  christos       /* MB_CUR_MAX > 1  */
   1963  1.1  christos 	{
   1964  1.1  christos 	  /* ANYCHAR and MBCSET must match with a single character, so we
   1965  1.1  christos 	     must put it to d->states[s].mbps, which contains the positions
   1966  1.1  christos 	     which can match with a single character not a byte.  */
   1967  1.1  christos 	  if (d->states[s].mbps.nelem == 0)
   1968  1.1  christos 	    {
   1969  1.1  christos 	      MALLOC(d->states[s].mbps.elems, position,
   1970  1.1  christos 		     d->states[s].elems.nelem);
   1971  1.1  christos 	    }
   1972  1.1  christos 	  insert(pos, &(d->states[s].mbps));
   1973  1.1  christos 	  continue;
   1974  1.1  christos 	}
   1975  1.1  christos #endif /* MBS_SUPPORT */
   1976  1.1  christos       else
   1977  1.1  christos 	continue;
   1978  1.1  christos 
   1979  1.1  christos       /* Some characters may need to be eliminated from matches because
   1980  1.1  christos 	 they fail in the current context. */
   1981  1.1  christos       if (pos.constraint != 0xFF)
   1982  1.1  christos 	{
   1983  1.1  christos 	  if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
   1984  1.1  christos 					 d->states[s].newline, 1))
   1985  1.1  christos 	    clrbit(eolbyte, matches);
   1986  1.1  christos 	  if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
   1987  1.1  christos 					 d->states[s].newline, 0))
   1988  1.1  christos 	    for (j = 0; j < CHARCLASS_INTS; ++j)
   1989  1.1  christos 	      matches[j] &= newline[j];
   1990  1.1  christos 	  if (! MATCHES_LETTER_CONTEXT(pos.constraint,
   1991  1.1  christos 					d->states[s].letter, 1))
   1992  1.1  christos 	    for (j = 0; j < CHARCLASS_INTS; ++j)
   1993  1.1  christos 	      matches[j] &= ~letters[j];
   1994  1.1  christos 	  if (! MATCHES_LETTER_CONTEXT(pos.constraint,
   1995  1.1  christos 					d->states[s].letter, 0))
   1996  1.1  christos 	    for (j = 0; j < CHARCLASS_INTS; ++j)
   1997  1.1  christos 	      matches[j] &= letters[j];
   1998  1.1  christos 
   1999  1.1  christos 	  /* If there are no characters left, there's no point in going on. */
   2000  1.1  christos 	  for (j = 0; j < CHARCLASS_INTS && !matches[j]; ++j)
   2001  1.1  christos 	    continue;
   2002  1.1  christos 	  if (j == CHARCLASS_INTS)
   2003  1.1  christos 	    continue;
   2004  1.1  christos 	}
   2005  1.1  christos 
   2006  1.1  christos       for (j = 0; j < ngrps; ++j)
   2007  1.1  christos 	{
   2008  1.1  christos 	  /* If matches contains a single character only, and the current
   2009  1.1  christos 	     group's label doesn't contain that character, go on to the
   2010  1.1  christos 	     next group. */
   2011  1.1  christos 	  if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR
   2012  1.1  christos 	      && !tstbit(d->tokens[pos.index], labels[j]))
   2013  1.1  christos 	    continue;
   2014  1.1  christos 
   2015  1.1  christos 	  /* Check if this group's label has a nonempty intersection with
   2016  1.1  christos 	     matches. */
   2017  1.1  christos 	  intersectf = 0;
   2018  1.1  christos 	  for (k = 0; k < CHARCLASS_INTS; ++k)
   2019  1.1  christos 	    (intersect[k] = matches[k] & labels[j][k]) ? (intersectf = 1) : 0;
   2020  1.1  christos 	  if (! intersectf)
   2021  1.1  christos 	    continue;
   2022  1.1  christos 
   2023  1.1  christos 	  /* It does; now find the set differences both ways. */
   2024  1.1  christos 	  leftoversf = matchesf = 0;
   2025  1.1  christos 	  for (k = 0; k < CHARCLASS_INTS; ++k)
   2026  1.1  christos 	    {
   2027  1.1  christos 	      /* Even an optimizing compiler can't know this for sure. */
   2028  1.1  christos 	      int match = matches[k], label = labels[j][k];
   2029  1.1  christos 
   2030  1.1  christos 	      (leftovers[k] = ~match & label) ? (leftoversf = 1) : 0;
   2031  1.1  christos 	      (matches[k] = match & ~label) ? (matchesf = 1) : 0;
   2032  1.1  christos 	    }
   2033  1.1  christos 
   2034  1.1  christos 	  /* If there were leftovers, create a new group labeled with them. */
   2035  1.1  christos 	  if (leftoversf)
   2036  1.1  christos 	    {
   2037  1.1  christos 	      copyset(leftovers, labels[ngrps]);
   2038  1.1  christos 	      copyset(intersect, labels[j]);
   2039  1.1  christos 	      MALLOC(grps[ngrps].elems, position, d->nleaves);
   2040  1.1  christos 	      copy(&grps[j], &grps[ngrps]);
   2041  1.1  christos 	      ++ngrps;
   2042  1.1  christos 	    }
   2043  1.1  christos 
   2044  1.1  christos 	  /* Put the position in the current group.  Note that there is no
   2045  1.1  christos 	     reason to call insert() here. */
   2046  1.1  christos 	  grps[j].elems[grps[j].nelem++] = pos;
   2047  1.1  christos 
   2048  1.1  christos 	  /* If every character matching the current position has been
   2049  1.1  christos 	     accounted for, we're done. */
   2050  1.1  christos 	  if (! matchesf)
   2051  1.1  christos 	    break;
   2052  1.1  christos 	}
   2053  1.1  christos 
   2054  1.1  christos       /* If we've passed the last group, and there are still characters
   2055  1.1  christos 	 unaccounted for, then we'll have to create a new group. */
   2056  1.1  christos       if (j == ngrps)
   2057  1.1  christos 	{
   2058  1.1  christos 	  copyset(matches, labels[ngrps]);
   2059  1.1  christos 	  zeroset(matches);
   2060  1.1  christos 	  MALLOC(grps[ngrps].elems, position, d->nleaves);
   2061  1.1  christos 	  grps[ngrps].nelem = 1;
   2062  1.1  christos 	  grps[ngrps].elems[0] = pos;
   2063  1.1  christos 	  ++ngrps;
   2064  1.1  christos 	}
   2065  1.1  christos     }
   2066  1.1  christos 
   2067  1.1  christos   MALLOC(follows.elems, position, d->nleaves);
   2068  1.1  christos   MALLOC(tmp.elems, position, d->nleaves);
   2069  1.1  christos 
   2070  1.1  christos   /* If we are a searching matcher, the default transition is to a state
   2071  1.1  christos      containing the positions of state 0, otherwise the default transition
   2072  1.1  christos      is to fail miserably. */
   2073  1.1  christos   if (d->searchflag)
   2074  1.1  christos     {
   2075  1.1  christos       wants_newline = 0;
   2076  1.1  christos       wants_letter = 0;
   2077  1.1  christos       for (i = 0; i < d->states[0].elems.nelem; ++i)
   2078  1.1  christos 	{
   2079  1.1  christos 	  if (PREV_NEWLINE_DEPENDENT(d->states[0].elems.elems[i].constraint))
   2080  1.1  christos 	    wants_newline = 1;
   2081  1.1  christos 	  if (PREV_LETTER_DEPENDENT(d->states[0].elems.elems[i].constraint))
   2082  1.1  christos 	    wants_letter = 1;
   2083  1.1  christos 	}
   2084  1.1  christos       copy(&d->states[0].elems, &follows);
   2085  1.1  christos       state = state_index(d, &follows, 0, 0);
   2086  1.1  christos       if (wants_newline)
   2087  1.1  christos 	state_newline = state_index(d, &follows, 1, 0);
   2088  1.1  christos       else
   2089  1.1  christos 	state_newline = state;
   2090  1.1  christos       if (wants_letter)
   2091  1.1  christos 	state_letter = state_index(d, &follows, 0, 1);
   2092  1.1  christos       else
   2093  1.1  christos 	state_letter = state;
   2094  1.1  christos       for (i = 0; i < NOTCHAR; ++i)
   2095  1.1  christos 	trans[i] = (IS_WORD_CONSTITUENT(i)) ? state_letter : state;
   2096  1.1  christos       trans[eolbyte] = state_newline;
   2097  1.1  christos     }
   2098  1.1  christos   else
   2099  1.1  christos     for (i = 0; i < NOTCHAR; ++i)
   2100  1.1  christos       trans[i] = -1;
   2101  1.1  christos 
   2102  1.1  christos   for (i = 0; i < ngrps; ++i)
   2103  1.1  christos     {
   2104  1.1  christos       follows.nelem = 0;
   2105  1.1  christos 
   2106  1.1  christos       /* Find the union of the follows of the positions of the group.
   2107  1.1  christos 	 This is a hideously inefficient loop.  Fix it someday. */
   2108  1.1  christos       for (j = 0; j < grps[i].nelem; ++j)
   2109  1.1  christos 	for (k = 0; k < d->follows[grps[i].elems[j].index].nelem; ++k)
   2110  1.1  christos 	  insert(d->follows[grps[i].elems[j].index].elems[k], &follows);
   2111  1.1  christos 
   2112  1.1  christos #ifdef MBS_SUPPORT
   2113  1.1  christos       if (MB_CUR_MAX > 1)
   2114  1.1  christos 	{
   2115  1.1  christos 	  /* If a token in follows.elems is not 1st byte of a multibyte
   2116  1.1  christos 	     character, or the states of follows must accept the bytes
   2117  1.1  christos 	     which are not 1st byte of the multibyte character.
   2118  1.1  christos 	     Then, if a state of follows encounter a byte, it must not be
   2119  1.1  christos 	     a 1st byte of a multibyte character nor singlebyte character.
   2120  1.1  christos 	     We cansel to add state[0].follows to next state, because
   2121  1.1  christos 	     state[0] must accept 1st-byte
   2122  1.1  christos 
   2123  1.1  christos 	     For example, we assume <sb a> is a certain singlebyte
   2124  1.1  christos 	     character, <mb A> is a certain multibyte character, and the
   2125  1.1  christos 	     codepoint of <sb a> equals the 2nd byte of the codepoint of
   2126  1.1  christos 	     <mb A>.
   2127  1.1  christos 	     When state[0] accepts <sb a>, state[i] transit to state[i+1]
   2128  1.1  christos 	     by accepting accepts 1st byte of <mb A>, and state[i+1]
   2129  1.1  christos 	     accepts 2nd byte of <mb A>, if state[i+1] encounter the
   2130  1.1  christos 	     codepoint of <sb a>, it must not be <sb a> but 2nd byte of
   2131  1.1  christos 	     <mb A>, so we can not add state[0].  */
   2132  1.1  christos 
   2133  1.1  christos 	  next_isnt_1st_byte = 0;
   2134  1.1  christos 	  for (j = 0; j < follows.nelem; ++j)
   2135  1.1  christos 	    {
   2136  1.1  christos 	      if (!(d->multibyte_prop[follows.elems[j].index] & 1))
   2137  1.1  christos 		{
   2138  1.1  christos 		  next_isnt_1st_byte = 1;
   2139  1.1  christos 		  break;
   2140  1.1  christos 		}
   2141  1.1  christos 	    }
   2142  1.1  christos 	}
   2143  1.1  christos #endif
   2144  1.1  christos 
   2145  1.1  christos       /* If we are building a searching matcher, throw in the positions
   2146  1.1  christos 	 of state 0 as well. */
   2147  1.1  christos #ifdef MBS_SUPPORT
   2148  1.1  christos       if (d->searchflag && (MB_CUR_MAX == 1 || !next_isnt_1st_byte))
   2149  1.1  christos #else
   2150  1.1  christos       if (d->searchflag)
   2151  1.1  christos #endif
   2152  1.1  christos 	for (j = 0; j < d->states[0].elems.nelem; ++j)
   2153  1.1  christos 	  insert(d->states[0].elems.elems[j], &follows);
   2154  1.1  christos 
   2155  1.1  christos       /* Find out if the new state will want any context information. */
   2156  1.1  christos       wants_newline = 0;
   2157  1.1  christos       if (tstbit(eolbyte, labels[i]))
   2158  1.1  christos 	for (j = 0; j < follows.nelem; ++j)
   2159  1.1  christos 	  if (PREV_NEWLINE_DEPENDENT(follows.elems[j].constraint))
   2160  1.1  christos 	    wants_newline = 1;
   2161  1.1  christos 
   2162  1.1  christos       wants_letter = 0;
   2163  1.1  christos       for (j = 0; j < CHARCLASS_INTS; ++j)
   2164  1.1  christos 	if (labels[i][j] & letters[j])
   2165  1.1  christos 	  break;
   2166  1.1  christos       if (j < CHARCLASS_INTS)
   2167  1.1  christos 	for (j = 0; j < follows.nelem; ++j)
   2168  1.1  christos 	  if (PREV_LETTER_DEPENDENT(follows.elems[j].constraint))
   2169  1.1  christos 	    wants_letter = 1;
   2170  1.1  christos 
   2171  1.1  christos       /* Find the state(s) corresponding to the union of the follows. */
   2172  1.1  christos       state = state_index(d, &follows, 0, 0);
   2173  1.1  christos       if (wants_newline)
   2174  1.1  christos 	state_newline = state_index(d, &follows, 1, 0);
   2175  1.1  christos       else
   2176  1.1  christos 	state_newline = state;
   2177  1.1  christos       if (wants_letter)
   2178  1.1  christos 	state_letter = state_index(d, &follows, 0, 1);
   2179  1.1  christos       else
   2180  1.1  christos 	state_letter = state;
   2181  1.1  christos 
   2182  1.1  christos       /* Set the transitions for each character in the current label. */
   2183  1.1  christos       for (j = 0; j < CHARCLASS_INTS; ++j)
   2184  1.1  christos 	for (k = 0; k < INTBITS; ++k)
   2185  1.1  christos 	  if (labels[i][j] & 1 << k)
   2186  1.1  christos 	    {
   2187  1.1  christos 	      int c = j * INTBITS + k;
   2188  1.1  christos 
   2189  1.1  christos 	      if (c == eolbyte)
   2190  1.1  christos 		trans[c] = state_newline;
   2191  1.1  christos 	      else if (IS_WORD_CONSTITUENT(c))
   2192  1.1  christos 		trans[c] = state_letter;
   2193  1.1  christos 	      else if (c < NOTCHAR)
   2194  1.1  christos 		trans[c] = state;
   2195  1.1  christos 	    }
   2196  1.1  christos     }
   2197  1.1  christos 
   2198  1.1  christos   for (i = 0; i < ngrps; ++i)
   2199  1.1  christos     free(grps[i].elems);
   2200  1.1  christos   free(follows.elems);
   2201  1.1  christos   free(tmp.elems);
   2202  1.1  christos }
   2203  1.1  christos 
   2204  1.1  christos /* Some routines for manipulating a compiled dfa's transition tables.
   2205  1.1  christos    Each state may or may not have a transition table; if it does, and it
   2206  1.1  christos    is a non-accepting state, then d->trans[state] points to its table.
   2207  1.1  christos    If it is an accepting state then d->fails[state] points to its table.
   2208  1.1  christos    If it has no table at all, then d->trans[state] is NULL.
   2209  1.1  christos    TODO: Improve this comment, get rid of the unnecessary redundancy. */
   2210  1.1  christos 
   2211  1.1  christos static void
   2212  1.1  christos build_state (int s, struct dfa *d)
   2213  1.1  christos {
   2214  1.1  christos   int *trans;			/* The new transition table. */
   2215  1.1  christos   int i;
   2216  1.1  christos 
   2217  1.1  christos   /* Set an upper limit on the number of transition tables that will ever
   2218  1.1  christos      exist at once.  1024 is arbitrary.  The idea is that the frequently
   2219  1.1  christos      used transition tables will be quickly rebuilt, whereas the ones that
   2220  1.1  christos      were only needed once or twice will be cleared away. */
   2221  1.1  christos   if (d->trcount >= 1024)
   2222  1.1  christos     {
   2223  1.1  christos       for (i = 0; i < d->tralloc; ++i)
   2224  1.1  christos 	if (d->trans[i])
   2225  1.1  christos 	  {
   2226  1.1  christos 	    free((ptr_t) d->trans[i]);
   2227  1.1  christos 	    d->trans[i] = NULL;
   2228  1.1  christos 	  }
   2229  1.1  christos 	else if (d->fails[i])
   2230  1.1  christos 	  {
   2231  1.1  christos 	    free((ptr_t) d->fails[i]);
   2232  1.1  christos 	    d->fails[i] = NULL;
   2233  1.1  christos 	  }
   2234  1.1  christos       d->trcount = 0;
   2235  1.1  christos     }
   2236  1.1  christos 
   2237  1.1  christos   ++d->trcount;
   2238  1.1  christos 
   2239  1.1  christos   /* Set up the success bits for this state. */
   2240  1.1  christos   d->success[s] = 0;
   2241  1.1  christos   if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 1, d->states[s].letter, 0,
   2242  1.1  christos       s, *d))
   2243  1.1  christos     d->success[s] |= 4;
   2244  1.1  christos   if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 1,
   2245  1.1  christos       s, *d))
   2246  1.1  christos     d->success[s] |= 2;
   2247  1.1  christos   if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 0,
   2248  1.1  christos       s, *d))
   2249  1.1  christos     d->success[s] |= 1;
   2250  1.1  christos 
   2251  1.1  christos   MALLOC(trans, int, NOTCHAR);
   2252  1.1  christos   dfastate(s, d, trans);
   2253  1.1  christos 
   2254  1.1  christos   /* Now go through the new transition table, and make sure that the trans
   2255  1.1  christos      and fail arrays are allocated large enough to hold a pointer for the
   2256  1.1  christos      largest state mentioned in the table. */
   2257  1.1  christos   for (i = 0; i < NOTCHAR; ++i)
   2258  1.1  christos     if (trans[i] >= d->tralloc)
   2259  1.1  christos       {
   2260  1.1  christos 	int oldalloc = d->tralloc;
   2261  1.1  christos 
   2262  1.1  christos 	while (trans[i] >= d->tralloc)
   2263  1.1  christos 	  d->tralloc *= 2;
   2264  1.1  christos 	REALLOC(d->realtrans, int *, d->tralloc + 1);
   2265  1.1  christos 	d->trans = d->realtrans + 1;
   2266  1.1  christos 	REALLOC(d->fails, int *, d->tralloc);
   2267  1.1  christos 	REALLOC(d->success, int, d->tralloc);
   2268  1.1  christos 	while (oldalloc < d->tralloc)
   2269  1.1  christos 	  {
   2270  1.1  christos 	    d->trans[oldalloc] = NULL;
   2271  1.1  christos 	    d->fails[oldalloc++] = NULL;
   2272  1.1  christos 	  }
   2273  1.1  christos       }
   2274  1.1  christos 
   2275  1.1  christos   /* Newline is a sentinel.  */
   2276  1.1  christos   trans[eolbyte] = -1;
   2277  1.1  christos 
   2278  1.1  christos   if (ACCEPTING(s, *d))
   2279  1.1  christos     d->fails[s] = trans;
   2280  1.1  christos   else
   2281  1.1  christos     d->trans[s] = trans;
   2282  1.1  christos }
   2283  1.1  christos 
   2284  1.1  christos static void
   2285  1.1  christos build_state_zero (struct dfa *d)
   2286  1.1  christos {
   2287  1.1  christos   d->tralloc = 1;
   2288  1.1  christos   d->trcount = 0;
   2289  1.1  christos   CALLOC(d->realtrans, int *, d->tralloc + 1);
   2290  1.1  christos   d->trans = d->realtrans + 1;
   2291  1.1  christos   CALLOC(d->fails, int *, d->tralloc);
   2292  1.1  christos   MALLOC(d->success, int, d->tralloc);
   2293  1.1  christos   build_state(0, d);
   2294  1.1  christos }
   2295  1.1  christos 
   2296  1.1  christos #ifdef MBS_SUPPORT
   2297  1.1  christos /* Multibyte character handling sub-routins for dfaexec.  */
   2298  1.1  christos 
   2299  1.1  christos /* Initial state may encounter the byte which is not a singlebyte character
   2300  1.1  christos    nor 1st byte of a multibyte character.  But it is incorrect for initial
   2301  1.1  christos    state to accept such a byte.
   2302  1.1  christos    For example, in sjis encoding the regular expression like "\\" accepts
   2303  1.1  christos    the codepoint 0x5c, but should not accept the 2nd byte of the codepoint
   2304  1.1  christos    0x815c. Then Initial state must skip the bytes which are not a singlebyte
   2305  1.1  christos    character nor 1st byte of a multibyte character.  */
   2306  1.1  christos #define SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p)		\
   2307  1.1  christos   if (s == 0)						\
   2308  1.1  christos     {							\
   2309  1.1  christos       while (inputwcs[p - buf_begin] == 0		\
   2310  1.1  christos             && mblen_buf[p - buf_begin] > 0		\
   2311  1.1  christos 	    && p < buf_end)				\
   2312  1.1  christos         ++p;						\
   2313  1.1  christos       if (p >= end)					\
   2314  1.1  christos 	{						\
   2315  1.1  christos           free(mblen_buf);				\
   2316  1.1  christos           free(inputwcs);				\
   2317  1.1  christos 	  return (size_t) -1;				\
   2318  1.1  christos 	}						\
   2319  1.1  christos     }
   2320  1.1  christos 
   2321  1.1  christos static void
   2322  1.1  christos realloc_trans_if_necessary(struct dfa *d, int new_state)
   2323  1.1  christos {
   2324  1.1  christos   /* Make sure that the trans and fail arrays are allocated large enough
   2325  1.1  christos      to hold a pointer for the new state. */
   2326  1.1  christos   if (new_state >= d->tralloc)
   2327  1.1  christos     {
   2328  1.1  christos       int oldalloc = d->tralloc;
   2329  1.1  christos 
   2330  1.1  christos       while (new_state >= d->tralloc)
   2331  1.1  christos 	d->tralloc *= 2;
   2332  1.1  christos       REALLOC(d->realtrans, int *, d->tralloc + 1);
   2333  1.1  christos       d->trans = d->realtrans + 1;
   2334  1.1  christos       REALLOC(d->fails, int *, d->tralloc);
   2335  1.1  christos       REALLOC(d->success, int, d->tralloc);
   2336  1.1  christos       while (oldalloc < d->tralloc)
   2337  1.1  christos 	{
   2338  1.1  christos 	  d->trans[oldalloc] = NULL;
   2339  1.1  christos 	  d->fails[oldalloc++] = NULL;
   2340  1.1  christos 	}
   2341  1.1  christos     }
   2342  1.1  christos }
   2343  1.1  christos 
   2344  1.1  christos /* Return values of transit_state_singlebyte(), and
   2345  1.1  christos    transit_state_consume_1char.  */
   2346  1.1  christos typedef enum
   2347  1.1  christos {
   2348  1.1  christos   TRANSIT_STATE_IN_PROGRESS,	/* State transition has not finished.  */
   2349  1.1  christos   TRANSIT_STATE_DONE,		/* State transition has finished.  */
   2350  1.1  christos   TRANSIT_STATE_END_BUFFER	/* Reach the end of the buffer.  */
   2351  1.1  christos } status_transit_state;
   2352  1.1  christos 
   2353  1.1  christos /* Consume a single byte and transit state from 's' to '*next_state'.
   2354  1.1  christos    This function is almost same as the state transition routin in dfaexec().
   2355  1.1  christos    But state transition is done just once, otherwise matching succeed or
   2356  1.1  christos    reach the end of the buffer.  */
   2357  1.1  christos static status_transit_state
   2358  1.1  christos transit_state_singlebyte (struct dfa *d, int s, unsigned char const *p,
   2359  1.1  christos 				  int *next_state)
   2360  1.1  christos {
   2361  1.1  christos   int *t;
   2362  1.1  christos   int works = s;
   2363  1.1  christos 
   2364  1.1  christos   status_transit_state rval = TRANSIT_STATE_IN_PROGRESS;
   2365  1.1  christos 
   2366  1.1  christos   while (rval == TRANSIT_STATE_IN_PROGRESS)
   2367  1.1  christos     {
   2368  1.1  christos       if ((t = d->trans[works]) != NULL)
   2369  1.1  christos 	{
   2370  1.1  christos 	  works = t[*p];
   2371  1.1  christos 	  rval = TRANSIT_STATE_DONE;
   2372  1.1  christos 	  if (works < 0)
   2373  1.1  christos 	    works = 0;
   2374  1.1  christos 	}
   2375  1.1  christos       else if (works < 0)
   2376  1.1  christos 	{
   2377  1.1  christos 	  if (p == buf_end)
   2378  1.1  christos 	    /* At the moment, it must not happen.  */
   2379  1.1  christos 	    return TRANSIT_STATE_END_BUFFER;
   2380  1.1  christos 	  works = 0;
   2381  1.1  christos 	}
   2382  1.1  christos       else if (d->fails[works])
   2383  1.1  christos 	{
   2384  1.1  christos 	  works = d->fails[works][*p];
   2385  1.1  christos 	  rval = TRANSIT_STATE_DONE;
   2386  1.1  christos 	}
   2387  1.1  christos       else
   2388  1.1  christos 	{
   2389  1.1  christos 	  build_state(works, d);
   2390  1.1  christos 	}
   2391  1.1  christos     }
   2392  1.1  christos   *next_state = works;
   2393  1.1  christos   return rval;
   2394  1.1  christos }
   2395  1.1  christos 
   2396  1.1  christos /* Check whether period can match or not in the current context.  If it can,
   2397  1.1  christos    return the amount of the bytes with which period can match, otherwise
   2398  1.1  christos    return 0.
   2399  1.1  christos    `pos' is the position of the period.  `index' is the index from the
   2400  1.1  christos    buf_begin, and it is the current position in the buffer.  */
   2401  1.1  christos static int
   2402  1.1  christos match_anychar (struct dfa *d, int s, position pos, int index)
   2403  1.1  christos {
   2404  1.1  christos   int newline = 0;
   2405  1.1  christos   int letter = 0;
   2406  1.1  christos   wchar_t wc;
   2407  1.1  christos   int mbclen;
   2408  1.1  christos 
   2409  1.1  christos   wc = inputwcs[index];
   2410  1.1  christos   mbclen = (mblen_buf[index] == 0)? 1 : mblen_buf[index];
   2411  1.1  christos 
   2412  1.1  christos   /* Check context.  */
   2413  1.1  christos   if (wc == (wchar_t)eolbyte)
   2414  1.1  christos     {
   2415  1.1  christos       if (!(syntax_bits & RE_DOT_NEWLINE))
   2416  1.1  christos 	return 0;
   2417  1.1  christos       newline = 1;
   2418  1.1  christos     }
   2419  1.1  christos   else if (wc == (wchar_t)'\0')
   2420  1.1  christos     {
   2421  1.1  christos       if (syntax_bits & RE_DOT_NOT_NULL)
   2422  1.1  christos 	return 0;
   2423  1.1  christos       newline = 1;
   2424  1.1  christos     }
   2425  1.1  christos 
   2426  1.1  christos   if (iswalnum(wc) || wc == L'_')
   2427  1.1  christos     letter = 1;
   2428  1.1  christos 
   2429  1.1  christos   if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].newline,
   2430  1.1  christos 			   newline, d->states[s].letter, letter))
   2431  1.1  christos     return 0;
   2432  1.1  christos 
   2433  1.1  christos   return mbclen;
   2434  1.1  christos }
   2435  1.1  christos 
   2436  1.1  christos /* Check whether bracket expression can match or not in the current context.
   2437  1.1  christos    If it can, return the amount of the bytes with which expression can match,
   2438  1.1  christos    otherwise return 0.
   2439  1.1  christos    `pos' is the position of the bracket expression.  `index' is the index
   2440  1.1  christos    from the buf_begin, and it is the current position in the buffer.  */
   2441  1.1  christos int
   2442  1.1  christos match_mb_charset (struct dfa *d, int s, position pos, int index)
   2443  1.1  christos {
   2444  1.1  christos   int i;
   2445  1.1  christos   int match;		/* Flag which represent that matching succeed.  */
   2446  1.1  christos   int match_len;	/* Length of the character (or collating element)
   2447  1.1  christos 			   with which this operator match.  */
   2448  1.1  christos   int op_len;		/* Length of the operator.  */
   2449  1.1  christos   char buffer[128];
   2450  1.1  christos   wchar_t wcbuf[6];
   2451  1.1  christos 
   2452  1.1  christos   /* Pointer to the structure to which we are currently reffering.  */
   2453  1.1  christos   struct mb_char_classes *work_mbc;
   2454  1.1  christos 
   2455  1.1  christos   int newline = 0;
   2456  1.1  christos   int letter = 0;
   2457  1.1  christos   wchar_t wc;		/* Current reffering character.  */
   2458  1.1  christos 
   2459  1.1  christos   wc = inputwcs[index];
   2460  1.1  christos 
   2461  1.1  christos   /* Check context.  */
   2462  1.1  christos   if (wc == (wchar_t)eolbyte)
   2463  1.1  christos     {
   2464  1.1  christos       if (!(syntax_bits & RE_DOT_NEWLINE))
   2465  1.1  christos 	return 0;
   2466  1.1  christos       newline = 1;
   2467  1.1  christos     }
   2468  1.1  christos   else if (wc == (wchar_t)'\0')
   2469  1.1  christos     {
   2470  1.1  christos       if (syntax_bits & RE_DOT_NOT_NULL)
   2471  1.1  christos 	return 0;
   2472  1.1  christos       newline = 1;
   2473  1.1  christos     }
   2474  1.1  christos   if (iswalnum(wc) || wc == L'_')
   2475  1.1  christos     letter = 1;
   2476  1.1  christos   if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].newline,
   2477  1.1  christos 			   newline, d->states[s].letter, letter))
   2478  1.1  christos     return 0;
   2479  1.1  christos 
   2480  1.1  christos   /* Assign the current reffering operator to work_mbc.  */
   2481  1.1  christos   work_mbc = &(d->mbcsets[(d->multibyte_prop[pos.index]) >> 2]);
   2482  1.1  christos   match = !work_mbc->invert;
   2483  1.1  christos   match_len = (mblen_buf[index] == 0)? 1 : mblen_buf[index];
   2484  1.1  christos 
   2485  1.1  christos   /* match with a character class?  */
   2486  1.1  christos   for (i = 0; i<work_mbc->nch_classes; i++)
   2487  1.1  christos     {
   2488  1.1  christos       if (iswctype((wint_t)wc, work_mbc->ch_classes[i]))
   2489  1.1  christos 	goto charset_matched;
   2490  1.1  christos     }
   2491  1.1  christos 
   2492  1.1  christos   strncpy(buffer, buf_begin + index, match_len);
   2493  1.1  christos   buffer[match_len] = '\0';
   2494  1.1  christos 
   2495  1.1  christos   /* match with an equivalent class?  */
   2496  1.1  christos   for (i = 0; i<work_mbc->nequivs; i++)
   2497  1.1  christos     {
   2498  1.1  christos       op_len = strlen(work_mbc->equivs[i]);
   2499  1.1  christos       strncpy(buffer, buf_begin + index, op_len);
   2500  1.1  christos       buffer[op_len] = '\0';
   2501  1.1  christos       if (strcoll(work_mbc->equivs[i], buffer) == 0)
   2502  1.1  christos 	{
   2503  1.1  christos 	  match_len = op_len;
   2504  1.1  christos 	  goto charset_matched;
   2505  1.1  christos 	}
   2506  1.1  christos     }
   2507  1.1  christos 
   2508  1.1  christos   /* match with a collating element?  */
   2509  1.1  christos   for (i = 0; i<work_mbc->ncoll_elems; i++)
   2510  1.1  christos     {
   2511  1.1  christos       op_len = strlen(work_mbc->coll_elems[i]);
   2512  1.1  christos       strncpy(buffer, buf_begin + index, op_len);
   2513  1.1  christos       buffer[op_len] = '\0';
   2514  1.1  christos 
   2515  1.1  christos       if (strcoll(work_mbc->coll_elems[i], buffer) == 0)
   2516  1.1  christos 	{
   2517  1.1  christos 	  match_len = op_len;
   2518  1.1  christos 	  goto charset_matched;
   2519  1.1  christos 	}
   2520  1.1  christos     }
   2521  1.1  christos 
   2522  1.1  christos   wcbuf[0] = wc;
   2523  1.1  christos   wcbuf[1] = wcbuf[3] = wcbuf[5] = '\0';
   2524  1.1  christos 
   2525  1.1  christos   /* match with a range?  */
   2526  1.1  christos   for (i = 0; i<work_mbc->nranges; i++)
   2527  1.1  christos     {
   2528  1.1  christos       wcbuf[2] = work_mbc->range_sts[i];
   2529  1.1  christos       wcbuf[4] = work_mbc->range_ends[i];
   2530  1.1  christos 
   2531  1.1  christos       if (wcscoll(wcbuf, wcbuf+2) >= 0 &&
   2532  1.1  christos 	  wcscoll(wcbuf+4, wcbuf) >= 0)
   2533  1.1  christos 	goto charset_matched;
   2534  1.1  christos     }
   2535  1.1  christos 
   2536  1.1  christos   /* match with a character?  */
   2537  1.1  christos   for (i = 0; i<work_mbc->nchars; i++)
   2538  1.1  christos     {
   2539  1.1  christos       if (wc == work_mbc->chars[i])
   2540  1.1  christos 	goto charset_matched;
   2541  1.1  christos     }
   2542  1.1  christos 
   2543  1.1  christos   match = !match;
   2544  1.1  christos 
   2545  1.1  christos  charset_matched:
   2546  1.1  christos   return match ? match_len : 0;
   2547  1.1  christos }
   2548  1.1  christos 
   2549  1.1  christos /* Check each of `d->states[s].mbps.elem' can match or not. Then return the
   2550  1.1  christos    array which corresponds to `d->states[s].mbps.elem' and each element of
   2551  1.1  christos    the array contains the amount of the bytes with which the element can
   2552  1.1  christos    match.
   2553  1.1  christos    `index' is the index from the buf_begin, and it is the current position
   2554  1.1  christos    in the buffer.
   2555  1.1  christos    Caller MUST free the array which this function return.  */
   2556  1.1  christos static int*
   2557  1.1  christos check_matching_with_multibyte_ops (struct dfa *d, int s, int index)
   2558  1.1  christos {
   2559  1.1  christos   int i;
   2560  1.1  christos   int* rarray;
   2561  1.1  christos 
   2562  1.1  christos   MALLOC(rarray, int, d->states[s].mbps.nelem);
   2563  1.1  christos   for (i = 0; i < d->states[s].mbps.nelem; ++i)
   2564  1.1  christos     {
   2565  1.1  christos       position pos = d->states[s].mbps.elems[i];
   2566  1.1  christos       switch(d->tokens[pos.index])
   2567  1.1  christos 	{
   2568  1.1  christos 	case ANYCHAR:
   2569  1.1  christos 	  rarray[i] = match_anychar(d, s, pos, index);
   2570  1.1  christos 	  break;
   2571  1.1  christos 	case MBCSET:
   2572  1.1  christos 	  rarray[i] = match_mb_charset(d, s, pos, index);
   2573  1.1  christos 	  break;
   2574  1.1  christos 	default:
   2575  1.1  christos 	  break; /* can not happen.  */
   2576  1.1  christos 	}
   2577  1.1  christos     }
   2578  1.1  christos   return rarray;
   2579  1.1  christos }
   2580  1.1  christos 
   2581  1.1  christos /* Consume a single character and enumerate all of the positions which can
   2582  1.1  christos    be next position from the state `s'.
   2583  1.1  christos    `match_lens' is the input. It can be NULL, but it can also be the output
   2584  1.1  christos    of check_matching_with_multibyte_ops() for optimization.
   2585  1.1  christos    `mbclen' and `pps' are the output.  `mbclen' is the length of the
   2586  1.1  christos    character consumed, and `pps' is the set this function enumerate.  */
   2587  1.1  christos static status_transit_state
   2588  1.1  christos transit_state_consume_1char (struct dfa *d, int s, unsigned char const **pp,
   2589  1.1  christos 			     int *match_lens, int *mbclen, position_set *pps)
   2590  1.1  christos {
   2591  1.1  christos   int i, j;
   2592  1.1  christos   int s1, s2;
   2593  1.1  christos   int* work_mbls;
   2594  1.1  christos   status_transit_state rs = TRANSIT_STATE_DONE;
   2595  1.1  christos 
   2596  1.1  christos   /* Calculate the length of the (single/multi byte) character
   2597  1.1  christos      to which p points.  */
   2598  1.1  christos   *mbclen = (mblen_buf[*pp - buf_begin] == 0)? 1
   2599  1.1  christos     : mblen_buf[*pp - buf_begin];
   2600  1.1  christos 
   2601  1.1  christos   /* Calculate the state which can be reached from the state `s' by
   2602  1.1  christos      consuming `*mbclen' single bytes from the buffer.  */
   2603  1.1  christos   s1 = s;
   2604  1.1  christos   for (i = 0; i < *mbclen; i++)
   2605  1.1  christos     {
   2606  1.1  christos       s2 = s1;
   2607  1.1  christos       rs = transit_state_singlebyte(d, s2, (*pp)++, &s1);
   2608  1.1  christos     }
   2609  1.1  christos   /* Copy the positions contained by `s1' to the set `pps'.  */
   2610  1.1  christos   copy(&(d->states[s1].elems), pps);
   2611  1.1  christos 
   2612  1.1  christos   /* Check (inputed)match_lens, and initialize if it is NULL.  */
   2613  1.1  christos   if (match_lens == NULL && d->states[s].mbps.nelem != 0)
   2614  1.1  christos     work_mbls = check_matching_with_multibyte_ops(d, s, *pp - buf_begin);
   2615  1.1  christos   else
   2616  1.1  christos     work_mbls = match_lens;
   2617  1.1  christos 
   2618  1.1  christos   /* Add all of the positions which can be reached from `s' by consuming
   2619  1.1  christos      a single character.  */
   2620  1.1  christos   for (i = 0; i < d->states[s].mbps.nelem ; i++)
   2621  1.1  christos    {
   2622  1.1  christos       if (work_mbls[i] == *mbclen)
   2623  1.1  christos 	for (j = 0; j < d->follows[d->states[s].mbps.elems[i].index].nelem;
   2624  1.1  christos 	     j++)
   2625  1.1  christos 	  insert(d->follows[d->states[s].mbps.elems[i].index].elems[j],
   2626  1.1  christos 		 pps);
   2627  1.1  christos     }
   2628  1.1  christos 
   2629  1.1  christos   if (match_lens == NULL && work_mbls != NULL)
   2630  1.1  christos     free(work_mbls);
   2631  1.1  christos   return rs;
   2632  1.1  christos }
   2633  1.1  christos 
   2634  1.1  christos /* Transit state from s, then return new state and update the pointer of the
   2635  1.1  christos    buffer.  This function is for some operator which can match with a multi-
   2636  1.1  christos    byte character or a collating element(which may be multi characters).  */
   2637  1.1  christos static int
   2638  1.1  christos transit_state (struct dfa *d, int s, unsigned char const **pp)
   2639  1.1  christos {
   2640  1.1  christos   int s1;
   2641  1.1  christos   int mbclen;		/* The length of current input multibyte character. */
   2642  1.1  christos   int maxlen = 0;
   2643  1.1  christos   int i, j;
   2644  1.1  christos   int *match_lens = NULL;
   2645  1.1  christos   int nelem = d->states[s].mbps.nelem; /* Just a alias.  */
   2646  1.1  christos   position_set follows;
   2647  1.1  christos   unsigned char const *p1 = *pp;
   2648  1.1  christos   status_transit_state rs;
   2649  1.1  christos   wchar_t wc;
   2650  1.1  christos 
   2651  1.1  christos   if (nelem > 0)
   2652  1.1  christos     /* This state has (a) multibyte operator(s).
   2653  1.1  christos        We check whether each of them can match or not.  */
   2654  1.1  christos     {
   2655  1.1  christos       /* Note: caller must free the return value of this function.  */
   2656  1.1  christos       match_lens = check_matching_with_multibyte_ops(d, s, *pp - buf_begin);
   2657  1.1  christos 
   2658  1.1  christos       for (i = 0; i < nelem; i++)
   2659  1.1  christos 	/* Search the operator which match the longest string,
   2660  1.1  christos 	   in this state.  */
   2661  1.1  christos 	{
   2662  1.1  christos 	  if (match_lens[i] > maxlen)
   2663  1.1  christos 	    maxlen = match_lens[i];
   2664  1.1  christos 	}
   2665  1.1  christos     }
   2666  1.1  christos 
   2667  1.1  christos   if (nelem == 0 || maxlen == 0)
   2668  1.1  christos     /* This state has no multibyte operator which can match.
   2669  1.1  christos        We need to  check only one singlebyte character.  */
   2670  1.1  christos     {
   2671  1.1  christos       status_transit_state rs;
   2672  1.1  christos       rs = transit_state_singlebyte(d, s, *pp, &s1);
   2673  1.1  christos 
   2674  1.1  christos       /* We must update the pointer if state transition succeeded.  */
   2675  1.1  christos       if (rs == TRANSIT_STATE_DONE)
   2676  1.1  christos 	++*pp;
   2677  1.1  christos 
   2678  1.1  christos       if (match_lens != NULL)
   2679  1.1  christos 	free(match_lens);
   2680  1.1  christos       return s1;
   2681  1.1  christos     }
   2682  1.1  christos 
   2683  1.1  christos   /* This state has some operators which can match a multibyte character.  */
   2684  1.1  christos   follows.nelem = 0;
   2685  1.1  christos   MALLOC(follows.elems, position, d->nleaves);
   2686  1.1  christos 
   2687  1.1  christos   /* `maxlen' may be longer than the length of a character, because it may
   2688  1.1  christos      not be a character but a (multi character) collating element.
   2689  1.1  christos      We enumerate all of the positions which `s' can reach by consuming
   2690  1.1  christos      `maxlen' bytes.  */
   2691  1.1  christos   rs = transit_state_consume_1char(d, s, pp, match_lens, &mbclen, &follows);
   2692  1.1  christos 
   2693  1.1  christos   wc = inputwcs[*pp - mbclen - buf_begin];
   2694  1.1  christos   s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc));
   2695  1.1  christos   realloc_trans_if_necessary(d, s1);
   2696  1.1  christos 
   2697  1.1  christos   while (*pp - p1 < maxlen)
   2698  1.1  christos     {
   2699  1.1  christos       follows.nelem = 0;
   2700  1.1  christos       rs = transit_state_consume_1char(d, s1, pp, NULL, &mbclen, &follows);
   2701  1.1  christos 
   2702  1.1  christos       for (i = 0; i < nelem ; i++)
   2703  1.1  christos 	{
   2704  1.1  christos 	  if (match_lens[i] == *pp - p1)
   2705  1.1  christos 	    for (j = 0;
   2706  1.1  christos 		 j < d->follows[d->states[s1].mbps.elems[i].index].nelem; j++)
   2707  1.1  christos 	      insert(d->follows[d->states[s1].mbps.elems[i].index].elems[j],
   2708  1.1  christos 		     &follows);
   2709  1.1  christos 	}
   2710  1.1  christos 
   2711  1.1  christos       wc = inputwcs[*pp - mbclen - buf_begin];
   2712  1.1  christos       s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc));
   2713  1.1  christos       realloc_trans_if_necessary(d, s1);
   2714  1.1  christos     }
   2715  1.1  christos   free(match_lens);
   2716  1.1  christos   free(follows.elems);
   2717  1.1  christos   return s1;
   2718  1.1  christos }
   2719  1.1  christos 
   2720  1.1  christos #endif
   2721  1.1  christos 
   2722  1.1  christos /* Search through a buffer looking for a match to the given struct dfa.
   2723  1.1  christos    Find the first occurrence of a string matching the regexp in the buffer,
   2724  1.1  christos    and the shortest possible version thereof.  Return the offset of the first
   2725  1.1  christos    character after the match, or (size_t) -1 if none is found.  BEGIN points to
   2726  1.1  christos    the beginning of the buffer, and SIZE is the size of the buffer.  If SIZE
   2727  1.1  christos    is nonzero, BEGIN[SIZE - 1] must be a newline.  BACKREF points to a place
   2728  1.1  christos    where we're supposed to store a 1 if backreferencing happened and the
   2729  1.1  christos    match needs to be verified by a backtracking matcher.  Otherwise
   2730  1.1  christos    we store a 0 in *backref. */
   2731  1.1  christos size_t
   2732  1.1  christos dfaexec (struct dfa *d, char const *begin, size_t size, int *backref)
   2733  1.1  christos {
   2734  1.1  christos   register int s;	/* Current state. */
   2735  1.1  christos   register unsigned char const *p; /* Current input character. */
   2736  1.1  christos   register unsigned char const *end; /* One past the last input character.  */
   2737  1.1  christos   register int **trans, *t;	/* Copy of d->trans so it can be optimized
   2738  1.1  christos 				   into a register. */
   2739  1.1  christos   register unsigned char eol = eolbyte;	/* Likewise for eolbyte.  */
   2740  1.1  christos   static int sbit[NOTCHAR];	/* Table for anding with d->success. */
   2741  1.1  christos   static int sbit_init;
   2742  1.1  christos 
   2743  1.1  christos   if (! sbit_init)
   2744  1.1  christos     {
   2745  1.1  christos       int i;
   2746  1.1  christos 
   2747  1.1  christos       sbit_init = 1;
   2748  1.1  christos       for (i = 0; i < NOTCHAR; ++i)
   2749  1.1  christos 	sbit[i] = (IS_WORD_CONSTITUENT(i)) ? 2 : 1;
   2750  1.1  christos       sbit[eol] = 4;
   2751  1.1  christos     }
   2752  1.1  christos 
   2753  1.1  christos   if (! d->tralloc)
   2754  1.1  christos     build_state_zero(d);
   2755  1.1  christos 
   2756  1.1  christos   s = 0;
   2757  1.1  christos   p = (unsigned char const *) begin;
   2758  1.1  christos   end = p + size;
   2759  1.1  christos   trans = d->trans;
   2760  1.1  christos 
   2761  1.1  christos #ifdef MBS_SUPPORT
   2762  1.1  christos   if (MB_CUR_MAX > 1)
   2763  1.1  christos     {
   2764  1.1  christos       int remain_bytes, i;
   2765  1.1  christos       buf_begin = begin;
   2766  1.1  christos       buf_end = end;
   2767  1.1  christos 
   2768  1.1  christos       /* initialize mblen_buf, and inputwcs.  */
   2769  1.1  christos       MALLOC(mblen_buf, unsigned char, end - (unsigned char const *)begin + 2);
   2770  1.1  christos       MALLOC(inputwcs, wchar_t, end - (unsigned char const *)begin + 2);
   2771  1.1  christos       memset(&mbs, 0, sizeof(mbstate_t));
   2772  1.1  christos       remain_bytes = 0;
   2773  1.1  christos       for (i = 0; i < end - (unsigned char const *)begin + 1; i++)
   2774  1.1  christos 	{
   2775  1.1  christos 	  if (remain_bytes == 0)
   2776  1.1  christos 	    {
   2777  1.1  christos 	      remain_bytes
   2778  1.1  christos 		= mbrtowc(inputwcs + i, begin + i,
   2779  1.1  christos 			  end - (unsigned char const *)begin - i + 1, &mbs);
   2780  1.1  christos 	      if (remain_bytes <= 1)
   2781  1.1  christos 		{
   2782  1.1  christos 		  remain_bytes = 0;
   2783  1.1  christos 		  inputwcs[i] = (wchar_t)begin[i];
   2784  1.1  christos 		  mblen_buf[i] = 0;
   2785  1.1  christos 		}
   2786  1.1  christos 	      else
   2787  1.1  christos 		{
   2788  1.1  christos 		  mblen_buf[i] = remain_bytes;
   2789  1.1  christos 		  remain_bytes--;
   2790  1.1  christos 		}
   2791  1.1  christos 	    }
   2792  1.1  christos 	  else
   2793  1.1  christos 	    {
   2794  1.1  christos 	      mblen_buf[i] = remain_bytes;
   2795  1.1  christos 	      inputwcs[i] = 0;
   2796  1.1  christos 	      remain_bytes--;
   2797  1.1  christos 	    }
   2798  1.1  christos 	}
   2799  1.1  christos       mblen_buf[i] = 0;
   2800  1.1  christos       inputwcs[i] = 0; /* sentinel */
   2801  1.1  christos     }
   2802  1.1  christos #endif /* MBS_SUPPORT */
   2803  1.1  christos 
   2804  1.1  christos   for (;;)
   2805  1.1  christos     {
   2806  1.1  christos #ifdef MBS_SUPPORT
   2807  1.1  christos       if (MB_CUR_MAX > 1)
   2808  1.1  christos 	while ((t = trans[s]))
   2809  1.1  christos 	  {
   2810  1.1  christos 	    if (p == end)
   2811  1.1  christos 	      {
   2812  1.1  christos 		free(mblen_buf);
   2813  1.1  christos 		free(inputwcs);
   2814  1.1  christos 		return (size_t) -1;
   2815  1.1  christos 	      }
   2816  1.1  christos 	    if (d->states[s].mbps.nelem != 0)
   2817  1.1  christos 	      {
   2818  1.1  christos 		/* Can match with a multibyte character( and multi character
   2819  1.1  christos 		   collating element).  */
   2820  1.1  christos 		unsigned char const *nextp;
   2821  1.1  christos 
   2822  1.1  christos 		SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
   2823  1.1  christos 
   2824  1.1  christos 		nextp = p;
   2825  1.1  christos 		s = transit_state(d, s, &nextp);
   2826  1.1  christos 		p = nextp;
   2827  1.1  christos 
   2828  1.1  christos 		/* Trans table might be updated.  */
   2829  1.1  christos 		trans = d->trans;
   2830  1.1  christos 	      }
   2831  1.1  christos 	    else
   2832  1.1  christos 	      {
   2833  1.1  christos 		SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
   2834  1.1  christos 		s = t[*p++];
   2835  1.1  christos 	      }
   2836  1.1  christos 	  }
   2837  1.1  christos       else
   2838  1.1  christos #endif /* MBS_SUPPORT */
   2839  1.1  christos         while ((t = trans[s]))
   2840  1.1  christos 	  {
   2841  1.1  christos 	    if (p == end)
   2842  1.1  christos 	      return (size_t) -1;
   2843  1.1  christos 	    s = t[*p++];
   2844  1.1  christos 	  }
   2845  1.1  christos 
   2846  1.1  christos       if (s < 0)
   2847  1.1  christos 	{
   2848  1.1  christos 	  if (p == end)
   2849  1.1  christos 	    {
   2850  1.1  christos #ifdef MBS_SUPPORT
   2851  1.1  christos 	      if (MB_CUR_MAX > 1)
   2852  1.1  christos 		{
   2853  1.1  christos 		  free(mblen_buf);
   2854  1.1  christos 		  free(inputwcs);
   2855  1.1  christos 		}
   2856  1.1  christos #endif /* MBS_SUPPORT */
   2857  1.1  christos 	      return (size_t) -1;
   2858  1.1  christos 	    }
   2859  1.1  christos 	  s = 0;
   2860  1.1  christos 	}
   2861  1.1  christos       else if ((t = d->fails[s]))
   2862  1.1  christos 	{
   2863  1.1  christos 	  if (d->success[s] & sbit[*p])
   2864  1.1  christos 	    {
   2865  1.1  christos 	      if (backref)
   2866  1.1  christos 		*backref = (d->states[s].backref != 0);
   2867  1.1  christos #ifdef MBS_SUPPORT
   2868  1.1  christos 	      if (MB_CUR_MAX > 1)
   2869  1.1  christos 		{
   2870  1.1  christos 		  free(mblen_buf);
   2871  1.1  christos 		  free(inputwcs);
   2872  1.1  christos 		}
   2873  1.1  christos #endif /* MBS_SUPPORT */
   2874  1.1  christos 	      return (char const *) p - begin;
   2875  1.1  christos 	    }
   2876  1.1  christos 
   2877  1.1  christos #ifdef MBS_SUPPORT
   2878  1.1  christos 	  if (MB_CUR_MAX > 1)
   2879  1.1  christos 	    {
   2880  1.1  christos 		SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
   2881  1.1  christos 		if (d->states[s].mbps.nelem != 0)
   2882  1.1  christos 		  {
   2883  1.1  christos 		    /* Can match with a multibyte character( and multi
   2884  1.1  christos 		       character collating element).  */
   2885  1.1  christos 		    unsigned char const *nextp;
   2886  1.1  christos 		    nextp = p;
   2887  1.1  christos 		    s = transit_state(d, s, &nextp);
   2888  1.1  christos 		    p = nextp;
   2889  1.1  christos 
   2890  1.1  christos 		    /* Trans table might be updated.  */
   2891  1.1  christos 		    trans = d->trans;
   2892  1.1  christos 		  }
   2893  1.1  christos 		else
   2894  1.1  christos 		s = t[*p++];
   2895  1.1  christos 	    }
   2896  1.1  christos 	  else
   2897  1.1  christos #endif /* MBS_SUPPORT */
   2898  1.1  christos 	  s = t[*p++];
   2899  1.1  christos 	}
   2900  1.1  christos       else
   2901  1.1  christos 	{
   2902  1.1  christos 	  build_state(s, d);
   2903  1.1  christos 	  trans = d->trans;
   2904  1.1  christos 	}
   2905  1.1  christos     }
   2906  1.1  christos }
   2907  1.1  christos 
   2908  1.1  christos /* Initialize the components of a dfa that the other routines don't
   2909  1.1  christos    initialize for themselves. */
   2910  1.1  christos void
   2911  1.1  christos dfainit (struct dfa *d)
   2912  1.1  christos {
   2913  1.1  christos   d->calloc = 1;
   2914  1.1  christos   MALLOC(d->charclasses, charclass, d->calloc);
   2915  1.1  christos   d->cindex = 0;
   2916  1.1  christos 
   2917  1.1  christos   d->talloc = 1;
   2918  1.1  christos   MALLOC(d->tokens, token, d->talloc);
   2919  1.1  christos   d->tindex = d->depth = d->nleaves = d->nregexps = 0;
   2920  1.1  christos #ifdef MBS_SUPPORT
   2921  1.1  christos   if (MB_CUR_MAX > 1)
   2922  1.1  christos     {
   2923  1.1  christos       d->nmultibyte_prop = 1;
   2924  1.1  christos       MALLOC(d->multibyte_prop, int, d->nmultibyte_prop);
   2925  1.1  christos       d->nmbcsets = 0;
   2926  1.1  christos       d->mbcsets_alloc = 1;
   2927  1.1  christos       MALLOC(d->mbcsets, struct mb_char_classes, d->mbcsets_alloc);
   2928  1.1  christos     }
   2929  1.1  christos #endif
   2930  1.1  christos 
   2931  1.1  christos   d->searchflag = 0;
   2932  1.1  christos   d->tralloc = 0;
   2933  1.1  christos 
   2934  1.1  christos   d->musts = 0;
   2935  1.1  christos }
   2936  1.1  christos 
   2937  1.1  christos /* Parse and analyze a single string of the given length. */
   2938  1.1  christos void
   2939  1.1  christos dfacomp (char const *s, size_t len, struct dfa *d, int searchflag)
   2940  1.1  christos {
   2941  1.1  christos   if (case_fold)	/* dummy folding in service of dfamust() */
   2942  1.1  christos     {
   2943  1.1  christos       char *lcopy;
   2944  1.1  christos       int i;
   2945  1.1  christos 
   2946  1.1  christos       lcopy = malloc(len);
   2947  1.1  christos       if (!lcopy)
   2948  1.1  christos 	dfaerror(_("out of memory"));
   2949  1.1  christos 
   2950  1.1  christos       /* This is a kludge. */
   2951  1.1  christos       case_fold = 0;
   2952  1.1  christos       for (i = 0; i < len; ++i)
   2953  1.1  christos 	if (ISUPPER ((unsigned char) s[i]))
   2954  1.1  christos 	  lcopy[i] = tolower ((unsigned char) s[i]);
   2955  1.1  christos 	else
   2956  1.1  christos 	  lcopy[i] = s[i];
   2957  1.1  christos 
   2958  1.1  christos       dfainit(d);
   2959  1.1  christos       dfaparse(lcopy, len, d);
   2960  1.1  christos       free(lcopy);
   2961  1.1  christos       dfamust(d);
   2962  1.1  christos       d->cindex = d->tindex = d->depth = d->nleaves = d->nregexps = 0;
   2963  1.1  christos       case_fold = 1;
   2964  1.1  christos       dfaparse(s, len, d);
   2965  1.1  christos       dfaanalyze(d, searchflag);
   2966  1.1  christos     }
   2967  1.1  christos   else
   2968  1.1  christos     {
   2969  1.1  christos         dfainit(d);
   2970  1.1  christos         dfaparse(s, len, d);
   2971  1.1  christos 	dfamust(d);
   2972  1.1  christos         dfaanalyze(d, searchflag);
   2973  1.1  christos     }
   2974  1.1  christos }
   2975  1.1  christos 
   2976  1.1  christos /* Free the storage held by the components of a dfa. */
   2977  1.1  christos void
   2978  1.1  christos dfafree (struct dfa *d)
   2979  1.1  christos {
   2980  1.1  christos   int i;
   2981  1.1  christos   struct dfamust *dm, *ndm;
   2982  1.1  christos 
   2983  1.1  christos   free((ptr_t) d->charclasses);
   2984  1.1  christos   free((ptr_t) d->tokens);
   2985  1.1  christos 
   2986  1.1  christos #ifdef MBS_SUPPORT
   2987  1.1  christos   if (MB_CUR_MAX > 1)
   2988  1.1  christos     {
   2989  1.1  christos       free((ptr_t) d->multibyte_prop);
   2990  1.1  christos       for (i = 0; i < d->nmbcsets; ++i)
   2991  1.1  christos 	{
   2992  1.1  christos 	  int j;
   2993  1.1  christos 	  struct mb_char_classes *p = &(d->mbcsets[i]);
   2994  1.1  christos 	  if (p->chars != NULL)
   2995  1.1  christos 	    free(p->chars);
   2996  1.1  christos 	  if (p->ch_classes != NULL)
   2997  1.1  christos 	    free(p->ch_classes);
   2998  1.1  christos 	  if (p->range_sts != NULL)
   2999  1.1  christos 	    free(p->range_sts);
   3000  1.1  christos 	  if (p->range_ends != NULL)
   3001  1.1  christos 	    free(p->range_ends);
   3002  1.1  christos 
   3003  1.1  christos 	  for (j = 0; j < p->nequivs; ++j)
   3004  1.1  christos 	    free(p->equivs[j]);
   3005  1.1  christos 	  if (p->equivs != NULL)
   3006  1.1  christos 	    free(p->equivs);
   3007  1.1  christos 
   3008  1.1  christos 	  for (j = 0; j < p->ncoll_elems; ++j)
   3009  1.1  christos 	    free(p->coll_elems[j]);
   3010  1.1  christos 	  if (p->coll_elems != NULL)
   3011  1.1  christos 	    free(p->coll_elems);
   3012  1.1  christos 	}
   3013  1.1  christos       free((ptr_t) d->mbcsets);
   3014  1.1  christos     }
   3015  1.1  christos #endif /* MBS_SUPPORT */
   3016  1.1  christos 
   3017  1.1  christos   for (i = 0; i < d->sindex; ++i)
   3018  1.1  christos     free((ptr_t) d->states[i].elems.elems);
   3019  1.1  christos   free((ptr_t) d->states);
   3020  1.1  christos   for (i = 0; i < d->tindex; ++i)
   3021  1.1  christos     if (d->follows[i].elems)
   3022  1.1  christos       free((ptr_t) d->follows[i].elems);
   3023  1.1  christos   free((ptr_t) d->follows);
   3024  1.1  christos   for (i = 0; i < d->tralloc; ++i)
   3025  1.1  christos     if (d->trans[i])
   3026  1.1  christos       free((ptr_t) d->trans[i]);
   3027  1.1  christos     else if (d->fails[i])
   3028  1.1  christos       free((ptr_t) d->fails[i]);
   3029  1.1  christos   if (d->realtrans) free((ptr_t) d->realtrans);
   3030  1.1  christos   if (d->fails) free((ptr_t) d->fails);
   3031  1.1  christos   if (d->success) free((ptr_t) d->success);
   3032  1.1  christos   for (dm = d->musts; dm; dm = ndm)
   3033  1.1  christos     {
   3034  1.1  christos       ndm = dm->next;
   3035  1.1  christos       free(dm->must);
   3036  1.1  christos       free((ptr_t) dm);
   3037  1.1  christos     }
   3038  1.1  christos }
   3039  1.1  christos 
   3040  1.1  christos /* Having found the postfix representation of the regular expression,
   3041  1.1  christos    try to find a long sequence of characters that must appear in any line
   3042  1.1  christos    containing the r.e.
   3043  1.1  christos    Finding a "longest" sequence is beyond the scope here;
   3044  1.1  christos    we take an easy way out and hope for the best.
   3045  1.1  christos    (Take "(ab|a)b"--please.)
   3046  1.1  christos 
   3047  1.1  christos    We do a bottom-up calculation of sequences of characters that must appear
   3048  1.1  christos    in matches of r.e.'s represented by trees rooted at the nodes of the postfix
   3049  1.1  christos    representation:
   3050  1.1  christos 	sequences that must appear at the left of the match ("left")
   3051  1.1  christos 	sequences that must appear at the right of the match ("right")
   3052  1.1  christos 	lists of sequences that must appear somewhere in the match ("in")
   3053  1.1  christos 	sequences that must constitute the match ("is")
   3054  1.1  christos 
   3055  1.1  christos    When we get to the root of the tree, we use one of the longest of its
   3056  1.1  christos    calculated "in" sequences as our answer.  The sequence we find is returned in
   3057  1.1  christos    d->must (where "d" is the single argument passed to "dfamust");
   3058  1.1  christos    the length of the sequence is returned in d->mustn.
   3059  1.1  christos 
   3060  1.1  christos    The sequences calculated for the various types of node (in pseudo ANSI c)
   3061  1.1  christos    are shown below.  "p" is the operand of unary operators (and the left-hand
   3062  1.1  christos    operand of binary operators); "q" is the right-hand operand of binary
   3063  1.1  christos    operators.
   3064  1.1  christos 
   3065  1.1  christos    "ZERO" means "a zero-length sequence" below.
   3066  1.1  christos 
   3067  1.1  christos 	Type	left		right		is		in
   3068  1.1  christos 	----	----		-----		--		--
   3069  1.1  christos 	char c	# c		# c		# c		# c
   3070  1.1  christos 
   3071  1.1  christos 	ANYCHAR	ZERO		ZERO		ZERO		ZERO
   3072  1.1  christos 
   3073  1.1  christos 	MBCSET	ZERO		ZERO		ZERO		ZERO
   3074  1.1  christos 
   3075  1.1  christos 	CSET	ZERO		ZERO		ZERO		ZERO
   3076  1.1  christos 
   3077  1.1  christos 	STAR	ZERO		ZERO		ZERO		ZERO
   3078  1.1  christos 
   3079  1.1  christos 	QMARK	ZERO		ZERO		ZERO		ZERO
   3080  1.1  christos 
   3081  1.1  christos 	PLUS	p->left		p->right	ZERO		p->in
   3082  1.1  christos 
   3083  1.1  christos 	CAT	(p->is==ZERO)?	(q->is==ZERO)?	(p->is!=ZERO &&	p->in plus
   3084  1.1  christos 		p->left :	q->right :	q->is!=ZERO) ?	q->in plus
   3085  1.1  christos 		p->is##q->left	p->right##q->is	p->is##q->is :	p->right##q->left
   3086  1.1  christos 						ZERO
   3087  1.1  christos 
   3088  1.1  christos 	OR	longest common	longest common	(do p->is and	substrings common to
   3089  1.1  christos 		leading		trailing	q->is have same	p->in and q->in
   3090  1.1  christos 		(sub)sequence	(sub)sequence	length and
   3091  1.1  christos 		of p->left	of p->right	content) ?
   3092  1.1  christos 		and q->left	and q->right	p->is : NULL
   3093  1.1  christos 
   3094  1.1  christos    If there's anything else we recognize in the tree, all four sequences get set
   3095  1.1  christos    to zero-length sequences.  If there's something we don't recognize in the tree,
   3096  1.1  christos    we just return a zero-length sequence.
   3097  1.1  christos 
   3098  1.1  christos    Break ties in favor of infrequent letters (choosing 'zzz' in preference to
   3099  1.1  christos    'aaa')?
   3100  1.1  christos 
   3101  1.1  christos    And. . .is it here or someplace that we might ponder "optimizations" such as
   3102  1.1  christos 	egrep 'psi|epsilon'	->	egrep 'psi'
   3103  1.1  christos 	egrep 'pepsi|epsilon'	->	egrep 'epsi'
   3104  1.1  christos 					(Yes, we now find "epsi" as a "string
   3105  1.1  christos 					that must occur", but we might also
   3106  1.1  christos 					simplify the *entire* r.e. being sought)
   3107  1.1  christos 	grep '[c]'		->	grep 'c'
   3108  1.1  christos 	grep '(ab|a)b'		->	grep 'ab'
   3109  1.1  christos 	grep 'ab*'		->	grep 'a'
   3110  1.1  christos 	grep 'a*b'		->	grep 'b'
   3111  1.1  christos 
   3112  1.1  christos    There are several issues:
   3113  1.1  christos 
   3114  1.1  christos    Is optimization easy (enough)?
   3115  1.1  christos 
   3116  1.1  christos    Does optimization actually accomplish anything,
   3117  1.1  christos    or is the automaton you get from "psi|epsilon" (for example)
   3118  1.1  christos    the same as the one you get from "psi" (for example)?
   3119  1.1  christos 
   3120  1.1  christos    Are optimizable r.e.'s likely to be used in real-life situations
   3121  1.1  christos    (something like 'ab*' is probably unlikely; something like is
   3122  1.1  christos    'psi|epsilon' is likelier)? */
   3123  1.1  christos 
   3124  1.1  christos static char *
   3125  1.1  christos icatalloc (char *old, char *new)
   3126  1.1  christos {
   3127  1.1  christos   char *result;
   3128  1.1  christos   size_t oldsize, newsize;
   3129  1.1  christos 
   3130  1.1  christos   newsize = (new == NULL) ? 0 : strlen(new);
   3131  1.1  christos   if (old == NULL)
   3132  1.1  christos     oldsize = 0;
   3133  1.1  christos   else if (newsize == 0)
   3134  1.1  christos     return old;
   3135  1.1  christos   else	oldsize = strlen(old);
   3136  1.1  christos   if (old == NULL)
   3137  1.1  christos     result = (char *) malloc(newsize + 1);
   3138  1.1  christos   else
   3139  1.1  christos     result = (char *) realloc((void *) old, oldsize + newsize + 1);
   3140  1.1  christos   if (result != NULL && new != NULL)
   3141  1.1  christos     (void) strcpy(result + oldsize, new);
   3142  1.1  christos   return result;
   3143  1.1  christos }
   3144  1.1  christos 
   3145  1.1  christos static char *
   3146  1.1  christos icpyalloc (char *string)
   3147  1.1  christos {
   3148  1.1  christos   return icatalloc((char *) NULL, string);
   3149  1.1  christos }
   3150  1.1  christos 
   3151  1.1  christos static char *
   3152  1.1  christos istrstr (char *lookin, char *lookfor)
   3153  1.1  christos {
   3154  1.1  christos   char *cp;
   3155  1.1  christos   size_t len;
   3156  1.1  christos 
   3157  1.1  christos   len = strlen(lookfor);
   3158  1.1  christos   for (cp = lookin; *cp != '\0'; ++cp)
   3159  1.1  christos     if (strncmp(cp, lookfor, len) == 0)
   3160  1.1  christos       return cp;
   3161  1.1  christos   return NULL;
   3162  1.1  christos }
   3163  1.1  christos 
   3164  1.1  christos static void
   3165  1.1  christos ifree (char *cp)
   3166  1.1  christos {
   3167  1.1  christos   if (cp != NULL)
   3168  1.1  christos     free(cp);
   3169  1.1  christos }
   3170  1.1  christos 
   3171  1.1  christos static void
   3172  1.1  christos freelist (char **cpp)
   3173  1.1  christos {
   3174  1.1  christos   int i;
   3175  1.1  christos 
   3176  1.1  christos   if (cpp == NULL)
   3177  1.1  christos     return;
   3178  1.1  christos   for (i = 0; cpp[i] != NULL; ++i)
   3179  1.1  christos     {
   3180  1.1  christos       free(cpp[i]);
   3181  1.1  christos       cpp[i] = NULL;
   3182  1.1  christos     }
   3183  1.1  christos }
   3184  1.1  christos 
   3185  1.1  christos static char **
   3186  1.1  christos enlist (char **cpp, char *new, size_t len)
   3187  1.1  christos {
   3188  1.1  christos   int i, j;
   3189  1.1  christos 
   3190  1.1  christos   if (cpp == NULL)
   3191  1.1  christos     return NULL;
   3192  1.1  christos   if ((new = icpyalloc(new)) == NULL)
   3193  1.1  christos     {
   3194  1.1  christos       freelist(cpp);
   3195  1.1  christos       return NULL;
   3196  1.1  christos     }
   3197  1.1  christos   new[len] = '\0';
   3198  1.1  christos   /* Is there already something in the list that's new (or longer)? */
   3199  1.1  christos   for (i = 0; cpp[i] != NULL; ++i)
   3200  1.1  christos     if (istrstr(cpp[i], new) != NULL)
   3201  1.1  christos       {
   3202  1.1  christos 	free(new);
   3203  1.1  christos 	return cpp;
   3204  1.1  christos       }
   3205  1.1  christos   /* Eliminate any obsoleted strings. */
   3206  1.1  christos   j = 0;
   3207  1.1  christos   while (cpp[j] != NULL)
   3208  1.1  christos     if (istrstr(new, cpp[j]) == NULL)
   3209  1.1  christos       ++j;
   3210  1.1  christos     else
   3211  1.1  christos       {
   3212  1.1  christos 	free(cpp[j]);
   3213  1.1  christos 	if (--i == j)
   3214  1.1  christos 	  break;
   3215  1.1  christos 	cpp[j] = cpp[i];
   3216  1.1  christos 	cpp[i] = NULL;
   3217  1.1  christos       }
   3218  1.1  christos   /* Add the new string. */
   3219  1.1  christos   cpp = (char **) realloc((char *) cpp, (i + 2) * sizeof *cpp);
   3220  1.1  christos   if (cpp == NULL)
   3221  1.1  christos     return NULL;
   3222  1.1  christos   cpp[i] = new;
   3223  1.1  christos   cpp[i + 1] = NULL;
   3224  1.1  christos   return cpp;
   3225  1.1  christos }
   3226  1.1  christos 
   3227  1.1  christos /* Given pointers to two strings, return a pointer to an allocated
   3228  1.1  christos    list of their distinct common substrings. Return NULL if something
   3229  1.1  christos    seems wild. */
   3230  1.1  christos static char **
   3231  1.1  christos comsubs (char *left, char *right)
   3232  1.1  christos {
   3233  1.1  christos   char **cpp;
   3234  1.1  christos   char *lcp;
   3235  1.1  christos   char *rcp;
   3236  1.1  christos   size_t i, len;
   3237  1.1  christos 
   3238  1.1  christos   if (left == NULL || right == NULL)
   3239  1.1  christos     return NULL;
   3240  1.1  christos   cpp = (char **) malloc(sizeof *cpp);
   3241  1.1  christos   if (cpp == NULL)
   3242  1.1  christos     return NULL;
   3243  1.1  christos   cpp[0] = NULL;
   3244  1.1  christos   for (lcp = left; *lcp != '\0'; ++lcp)
   3245  1.1  christos     {
   3246  1.1  christos       len = 0;
   3247  1.1  christos       rcp = strchr (right, *lcp);
   3248  1.1  christos       while (rcp != NULL)
   3249  1.1  christos 	{
   3250  1.1  christos 	  for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
   3251  1.1  christos 	    continue;
   3252  1.1  christos 	  if (i > len)
   3253  1.1  christos 	    len = i;
   3254  1.1  christos 	  rcp = strchr (rcp + 1, *lcp);
   3255  1.1  christos 	}
   3256  1.1  christos       if (len == 0)
   3257  1.1  christos 	continue;
   3258  1.1  christos       if ((cpp = enlist(cpp, lcp, len)) == NULL)
   3259  1.1  christos 	break;
   3260  1.1  christos     }
   3261  1.1  christos   return cpp;
   3262  1.1  christos }
   3263  1.1  christos 
   3264  1.1  christos static char **
   3265  1.1  christos addlists (char **old, char **new)
   3266  1.1  christos {
   3267  1.1  christos   int i;
   3268  1.1  christos 
   3269  1.1  christos   if (old == NULL || new == NULL)
   3270  1.1  christos     return NULL;
   3271  1.1  christos   for (i = 0; new[i] != NULL; ++i)
   3272  1.1  christos     {
   3273  1.1  christos       old = enlist(old, new[i], strlen(new[i]));
   3274  1.1  christos       if (old == NULL)
   3275  1.1  christos 	break;
   3276  1.1  christos     }
   3277  1.1  christos   return old;
   3278  1.1  christos }
   3279  1.1  christos 
   3280  1.1  christos /* Given two lists of substrings, return a new list giving substrings
   3281  1.1  christos    common to both. */
   3282  1.1  christos static char **
   3283  1.1  christos inboth (char **left, char **right)
   3284  1.1  christos {
   3285  1.1  christos   char **both;
   3286  1.1  christos   char **temp;
   3287  1.1  christos   int lnum, rnum;
   3288  1.1  christos 
   3289  1.1  christos   if (left == NULL || right == NULL)
   3290  1.1  christos     return NULL;
   3291  1.1  christos   both = (char **) malloc(sizeof *both);
   3292  1.1  christos   if (both == NULL)
   3293  1.1  christos     return NULL;
   3294  1.1  christos   both[0] = NULL;
   3295  1.1  christos   for (lnum = 0; left[lnum] != NULL; ++lnum)
   3296  1.1  christos     {
   3297  1.1  christos       for (rnum = 0; right[rnum] != NULL; ++rnum)
   3298  1.1  christos 	{
   3299  1.1  christos 	  temp = comsubs(left[lnum], right[rnum]);
   3300  1.1  christos 	  if (temp == NULL)
   3301  1.1  christos 	    {
   3302  1.1  christos 	      freelist(both);
   3303  1.1  christos 	      return NULL;
   3304  1.1  christos 	    }
   3305  1.1  christos 	  both = addlists(both, temp);
   3306  1.1  christos 	  freelist(temp);
   3307  1.1  christos 	  free(temp);
   3308  1.1  christos 	  if (both == NULL)
   3309  1.1  christos 	    return NULL;
   3310  1.1  christos 	}
   3311  1.1  christos     }
   3312  1.1  christos   return both;
   3313  1.1  christos }
   3314  1.1  christos 
   3315  1.1  christos typedef struct
   3316  1.1  christos {
   3317  1.1  christos   char **in;
   3318  1.1  christos   char *left;
   3319  1.1  christos   char *right;
   3320  1.1  christos   char *is;
   3321  1.1  christos } must;
   3322  1.1  christos 
   3323  1.1  christos static void
   3324  1.1  christos resetmust (must *mp)
   3325  1.1  christos {
   3326  1.1  christos   mp->left[0] = mp->right[0] = mp->is[0] = '\0';
   3327  1.1  christos   freelist(mp->in);
   3328  1.1  christos }
   3329  1.1  christos 
   3330  1.1  christos static void
   3331  1.1  christos dfamust (struct dfa *dfa)
   3332  1.1  christos {
   3333  1.1  christos   must *musts;
   3334  1.1  christos   must *mp;
   3335  1.1  christos   char *result;
   3336  1.1  christos   int ri;
   3337  1.1  christos   int i;
   3338  1.1  christos   int exact;
   3339  1.1  christos   token t;
   3340  1.1  christos   static must must0;
   3341  1.1  christos   struct dfamust *dm;
   3342  1.1  christos   static char empty_string[] = "";
   3343  1.1  christos 
   3344  1.1  christos   result = empty_string;
   3345  1.1  christos   exact = 0;
   3346  1.1  christos   musts = (must *) malloc((dfa->tindex + 1) * sizeof *musts);
   3347  1.1  christos   if (musts == NULL)
   3348  1.1  christos     return;
   3349  1.1  christos   mp = musts;
   3350  1.1  christos   for (i = 0; i <= dfa->tindex; ++i)
   3351  1.1  christos     mp[i] = must0;
   3352  1.1  christos   for (i = 0; i <= dfa->tindex; ++i)
   3353  1.1  christos     {
   3354  1.1  christos       mp[i].in = (char **) malloc(sizeof *mp[i].in);
   3355  1.1  christos       mp[i].left = malloc(2);
   3356  1.1  christos       mp[i].right = malloc(2);
   3357  1.1  christos       mp[i].is = malloc(2);
   3358  1.1  christos       if (mp[i].in == NULL || mp[i].left == NULL ||
   3359  1.1  christos 	  mp[i].right == NULL || mp[i].is == NULL)
   3360  1.1  christos 	goto done;
   3361  1.1  christos       mp[i].left[0] = mp[i].right[0] = mp[i].is[0] = '\0';
   3362  1.1  christos       mp[i].in[0] = NULL;
   3363  1.1  christos     }
   3364  1.1  christos #ifdef DEBUG
   3365  1.1  christos   fprintf(stderr, "dfamust:\n");
   3366  1.1  christos   for (i = 0; i < dfa->tindex; ++i)
   3367  1.1  christos     {
   3368  1.1  christos       fprintf(stderr, " %d:", i);
   3369  1.1  christos       prtok(dfa->tokens[i]);
   3370  1.1  christos     }
   3371  1.1  christos   putc('\n', stderr);
   3372  1.1  christos #endif
   3373  1.1  christos   for (ri = 0; ri < dfa->tindex; ++ri)
   3374  1.1  christos     {
   3375  1.1  christos       switch (t = dfa->tokens[ri])
   3376  1.1  christos 	{
   3377  1.1  christos 	case LPAREN:
   3378  1.1  christos 	case RPAREN:
   3379  1.1  christos 	  goto done;		/* "cannot happen" */
   3380  1.1  christos 	case EMPTY:
   3381  1.1  christos 	case BEGLINE:
   3382  1.1  christos 	case ENDLINE:
   3383  1.1  christos 	case BEGWORD:
   3384  1.1  christos 	case ENDWORD:
   3385  1.1  christos 	case LIMWORD:
   3386  1.1  christos 	case NOTLIMWORD:
   3387  1.1  christos 	case BACKREF:
   3388  1.1  christos 	  resetmust(mp);
   3389  1.1  christos 	  break;
   3390  1.1  christos 	case STAR:
   3391  1.1  christos 	case QMARK:
   3392  1.1  christos 	  if (mp <= musts)
   3393  1.1  christos 	    goto done;		/* "cannot happen" */
   3394  1.1  christos 	  --mp;
   3395  1.1  christos 	  resetmust(mp);
   3396  1.1  christos 	  break;
   3397  1.1  christos 	case OR:
   3398  1.1  christos 	case ORTOP:
   3399  1.1  christos 	  if (mp < &musts[2])
   3400  1.1  christos 	    goto done;		/* "cannot happen" */
   3401  1.1  christos 	  {
   3402  1.1  christos 	    char **new;
   3403  1.1  christos 	    must *lmp;
   3404  1.1  christos 	    must *rmp;
   3405  1.1  christos 	    int j, ln, rn, n;
   3406  1.1  christos 
   3407  1.1  christos 	    rmp = --mp;
   3408  1.1  christos 	    lmp = --mp;
   3409  1.1  christos 	    /* Guaranteed to be.  Unlikely, but. . . */
   3410  1.1  christos 	    if (strcmp(lmp->is, rmp->is) != 0)
   3411  1.1  christos 	      lmp->is[0] = '\0';
   3412  1.1  christos 	    /* Left side--easy */
   3413  1.1  christos 	    i = 0;
   3414  1.1  christos 	    while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
   3415  1.1  christos 	      ++i;
   3416  1.1  christos 	    lmp->left[i] = '\0';
   3417  1.1  christos 	    /* Right side */
   3418  1.1  christos 	    ln = strlen(lmp->right);
   3419  1.1  christos 	    rn = strlen(rmp->right);
   3420  1.1  christos 	    n = ln;
   3421  1.1  christos 	    if (n > rn)
   3422  1.1  christos 	      n = rn;
   3423  1.1  christos 	    for (i = 0; i < n; ++i)
   3424  1.1  christos 	      if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
   3425  1.1  christos 		break;
   3426  1.1  christos 	    for (j = 0; j < i; ++j)
   3427  1.1  christos 	      lmp->right[j] = lmp->right[(ln - i) + j];
   3428  1.1  christos 	    lmp->right[j] = '\0';
   3429  1.1  christos 	    new = inboth(lmp->in, rmp->in);
   3430  1.1  christos 	    if (new == NULL)
   3431  1.1  christos 	      goto done;
   3432  1.1  christos 	    freelist(lmp->in);
   3433  1.1  christos 	    free((char *) lmp->in);
   3434  1.1  christos 	    lmp->in = new;
   3435  1.1  christos 	  }
   3436  1.1  christos 	  break;
   3437  1.1  christos 	case PLUS:
   3438  1.1  christos 	  if (mp <= musts)
   3439  1.1  christos 	    goto done;		/* "cannot happen" */
   3440  1.1  christos 	  --mp;
   3441  1.1  christos 	  mp->is[0] = '\0';
   3442  1.1  christos 	  break;
   3443  1.1  christos 	case END:
   3444  1.1  christos 	  if (mp != &musts[1])
   3445  1.1  christos 	    goto done;		/* "cannot happen" */
   3446  1.1  christos 	  for (i = 0; musts[0].in[i] != NULL; ++i)
   3447  1.1  christos 	    if (strlen(musts[0].in[i]) > strlen(result))
   3448  1.1  christos 	      result = musts[0].in[i];
   3449  1.1  christos 	  if (strcmp(result, musts[0].is) == 0)
   3450  1.1  christos 	    exact = 1;
   3451  1.1  christos 	  goto done;
   3452  1.1  christos 	case CAT:
   3453  1.1  christos 	  if (mp < &musts[2])
   3454  1.1  christos 	    goto done;		/* "cannot happen" */
   3455  1.1  christos 	  {
   3456  1.1  christos 	    must *lmp;
   3457  1.1  christos 	    must *rmp;
   3458  1.1  christos 
   3459  1.1  christos 	    rmp = --mp;
   3460  1.1  christos 	    lmp = --mp;
   3461  1.1  christos 	    /* In.  Everything in left, plus everything in
   3462  1.1  christos 	       right, plus catenation of
   3463  1.1  christos 	       left's right and right's left. */
   3464  1.1  christos 	    lmp->in = addlists(lmp->in, rmp->in);
   3465  1.1  christos 	    if (lmp->in == NULL)
   3466  1.1  christos 	      goto done;
   3467  1.1  christos 	    if (lmp->right[0] != '\0' &&
   3468  1.1  christos 		rmp->left[0] != '\0')
   3469  1.1  christos 	      {
   3470  1.1  christos 		char *tp;
   3471  1.1  christos 
   3472  1.1  christos 		tp = icpyalloc(lmp->right);
   3473  1.1  christos 		if (tp == NULL)
   3474  1.1  christos 		  goto done;
   3475  1.1  christos 		tp = icatalloc(tp, rmp->left);
   3476  1.1  christos 		if (tp == NULL)
   3477  1.1  christos 		  goto done;
   3478  1.1  christos 		lmp->in = enlist(lmp->in, tp,
   3479  1.1  christos 				 strlen(tp));
   3480  1.1  christos 		free(tp);
   3481  1.1  christos 		if (lmp->in == NULL)
   3482  1.1  christos 		  goto done;
   3483  1.1  christos 	      }
   3484  1.1  christos 	    /* Left-hand */
   3485  1.1  christos 	    if (lmp->is[0] != '\0')
   3486  1.1  christos 	      {
   3487  1.1  christos 		lmp->left = icatalloc(lmp->left,
   3488  1.1  christos 				      rmp->left);
   3489  1.1  christos 		if (lmp->left == NULL)
   3490  1.1  christos 		  goto done;
   3491  1.1  christos 	      }
   3492  1.1  christos 	    /* Right-hand */
   3493  1.1  christos 	    if (rmp->is[0] == '\0')
   3494  1.1  christos 	      lmp->right[0] = '\0';
   3495  1.1  christos 	    lmp->right = icatalloc(lmp->right, rmp->right);
   3496  1.1  christos 	    if (lmp->right == NULL)
   3497  1.1  christos 	      goto done;
   3498  1.1  christos 	    /* Guaranteed to be */
   3499  1.1  christos 	    if (lmp->is[0] != '\0' && rmp->is[0] != '\0')
   3500  1.1  christos 	      {
   3501  1.1  christos 		lmp->is = icatalloc(lmp->is, rmp->is);
   3502  1.1  christos 		if (lmp->is == NULL)
   3503  1.1  christos 		  goto done;
   3504  1.1  christos 	      }
   3505  1.1  christos 	    else
   3506  1.1  christos 	      lmp->is[0] = '\0';
   3507  1.1  christos 	  }
   3508  1.1  christos 	  break;
   3509  1.1  christos 	default:
   3510  1.1  christos 	  if (t < END)
   3511  1.1  christos 	    {
   3512  1.1  christos 	      /* "cannot happen" */
   3513  1.1  christos 	      goto done;
   3514  1.1  christos 	    }
   3515  1.1  christos 	  else if (t == '\0')
   3516  1.1  christos 	    {
   3517  1.1  christos 	      /* not on *my* shift */
   3518  1.1  christos 	      goto done;
   3519  1.1  christos 	    }
   3520  1.1  christos 	  else if (t >= CSET
   3521  1.1  christos #ifdef MBS_SUPPORT
   3522  1.1  christos 		   || t == ANYCHAR
   3523  1.1  christos 		   || t == MBCSET
   3524  1.1  christos #endif /* MBS_SUPPORT */
   3525  1.1  christos 		   )
   3526  1.1  christos 	    {
   3527  1.1  christos 	      /* easy enough */
   3528  1.1  christos 	      resetmust(mp);
   3529  1.1  christos 	    }
   3530  1.1  christos 	  else
   3531  1.1  christos 	    {
   3532  1.1  christos 	      /* plain character */
   3533  1.1  christos 	      resetmust(mp);
   3534  1.1  christos 	      mp->is[0] = mp->left[0] = mp->right[0] = t;
   3535  1.1  christos 	      mp->is[1] = mp->left[1] = mp->right[1] = '\0';
   3536  1.1  christos 	      mp->in = enlist(mp->in, mp->is, (size_t)1);
   3537  1.1  christos 	      if (mp->in == NULL)
   3538  1.1  christos 		goto done;
   3539  1.1  christos 	    }
   3540  1.1  christos 	  break;
   3541  1.1  christos 	}
   3542  1.1  christos #ifdef DEBUG
   3543  1.1  christos       fprintf(stderr, " node: %d:", ri);
   3544  1.1  christos       prtok(dfa->tokens[ri]);
   3545  1.1  christos       fprintf(stderr, "\n  in:");
   3546  1.1  christos       for (i = 0; mp->in[i]; ++i)
   3547  1.1  christos 	fprintf(stderr, " \"%s\"", mp->in[i]);
   3548  1.1  christos       fprintf(stderr, "\n  is: \"%s\"\n", mp->is);
   3549  1.1  christos       fprintf(stderr, "  left: \"%s\"\n", mp->left);
   3550  1.1  christos       fprintf(stderr, "  right: \"%s\"\n", mp->right);
   3551  1.1  christos #endif
   3552  1.1  christos       ++mp;
   3553  1.1  christos     }
   3554  1.1  christos  done:
   3555  1.1  christos   if (strlen(result))
   3556  1.1  christos     {
   3557  1.1  christos       dm = (struct dfamust *) malloc(sizeof (struct dfamust));
   3558  1.1  christos       dm->exact = exact;
   3559  1.1  christos       dm->must = malloc(strlen(result) + 1);
   3560  1.1  christos       strcpy(dm->must, result);
   3561  1.1  christos       dm->next = dfa->musts;
   3562  1.1  christos       dfa->musts = dm;
   3563  1.1  christos     }
   3564  1.1  christos   mp = musts;
   3565  1.1  christos   for (i = 0; i <= dfa->tindex; ++i)
   3566  1.1  christos     {
   3567  1.1  christos       freelist(mp[i].in);
   3568  1.1  christos       ifree((char *) mp[i].in);
   3569  1.1  christos       ifree(mp[i].left);
   3570  1.1  christos       ifree(mp[i].right);
   3571  1.1  christos       ifree(mp[i].is);
   3572  1.1  christos     }
   3573  1.1  christos   free((char *) mp);
   3574  1.1  christos }
   3575  1.1  christos /* vim:set shiftwidth=2: */
   3576