1 1.1 christos /* dfa.c - deterministic extended regexp routines for GNU 2 1.1 christos Copyright 1988, 1998, 2000, 2005 Free Software Foundation, Inc. 3 1.1 christos 4 1.1 christos This program is free software; you can redistribute it and/or modify 5 1.1 christos it under the terms of the GNU General Public License as published by 6 1.1 christos the Free Software Foundation; either version 2, or (at your option) 7 1.1 christos any later version. 8 1.1 christos 9 1.1 christos This program is distributed in the hope that it will be useful, 10 1.1 christos but WITHOUT ANY WARRANTY; without even the implied warranty of 11 1.1 christos MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 1.1 christos GNU General Public License for more details. 13 1.1 christos 14 1.1 christos You should have received a copy of the GNU General Public License 15 1.1 christos along with this program; if not, write to the Free Software Foundation, 16 1.1 christos Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-13017, USA */ 17 1.1 christos 18 1.1 christos /* Written June, 1988 by Mike Haertel 19 1.1 christos Modified July, 1988 by Arthur David Olson to assist BMG speedups */ 20 1.1 christos 21 1.1 christos #ifdef HAVE_CONFIG_H 22 1.1 christos #include <config.h> 23 1.1 christos #endif 24 1.1 christos 25 1.1 christos #include <assert.h> 26 1.1 christos #include <ctype.h> 27 1.1 christos #include <stdio.h> 28 1.1 christos 29 1.1 christos #include <sys/types.h> 30 1.1 christos #include <stdlib.h> 31 1.1 christos #include <string.h> 32 1.1 christos #include <locale.h> 33 1.1 christos 34 1.1 christos #if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC 35 1.1 christos /* We can handle multibyte string. */ 36 1.1 christos # define MBS_SUPPORT 37 1.1 christos #endif 38 1.1 christos 39 1.1 christos #ifdef MBS_SUPPORT 40 1.1 christos # include <wchar.h> 41 1.1 christos # include <wctype.h> 42 1.1 christos #endif 43 1.1 christos 44 1.1 christos #ifndef DEBUG /* use the same approach as regex.c */ 45 1.1 christos #undef assert 46 1.1 christos #define assert(e) 47 1.1 christos #endif /* DEBUG */ 48 1.1 christos 49 1.1 christos #ifndef isgraph 50 1.1 christos #define isgraph(C) (isprint(C) && !isspace(C)) 51 1.1 christos #endif 52 1.1 christos 53 1.1 christos #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII)) 54 1.1 christos #define ISALPHA(C) isalpha(C) 55 1.1 christos #define ISUPPER(C) isupper(C) 56 1.1 christos #define ISLOWER(C) islower(C) 57 1.1 christos #define ISDIGIT(C) isdigit(C) 58 1.1 christos #define ISXDIGIT(C) isxdigit(C) 59 1.1 christos #define ISSPACE(C) isspace(C) 60 1.1 christos #define ISPUNCT(C) ispunct(C) 61 1.1 christos #define ISALNUM(C) isalnum(C) 62 1.1 christos #define ISPRINT(C) isprint(C) 63 1.1 christos #define ISGRAPH(C) isgraph(C) 64 1.1 christos #define ISCNTRL(C) iscntrl(C) 65 1.1 christos #else 66 1.1 christos #define ISALPHA(C) (isascii(C) && isalpha(C)) 67 1.1 christos #define ISUPPER(C) (isascii(C) && isupper(C)) 68 1.1 christos #define ISLOWER(C) (isascii(C) && islower(C)) 69 1.1 christos #define ISDIGIT(C) (isascii(C) && isdigit(C)) 70 1.1 christos #define ISXDIGIT(C) (isascii(C) && isxdigit(C)) 71 1.1 christos #define ISSPACE(C) (isascii(C) && isspace(C)) 72 1.1 christos #define ISPUNCT(C) (isascii(C) && ispunct(C)) 73 1.1 christos #define ISALNUM(C) (isascii(C) && isalnum(C)) 74 1.1 christos #define ISPRINT(C) (isascii(C) && isprint(C)) 75 1.1 christos #define ISGRAPH(C) (isascii(C) && isgraph(C)) 76 1.1 christos #define ISCNTRL(C) (isascii(C) && iscntrl(C)) 77 1.1 christos #endif 78 1.1 christos 79 1.1 christos /* ISASCIIDIGIT differs from ISDIGIT, as follows: 80 1.1 christos - Its arg may be any int or unsigned int; it need not be an unsigned char. 81 1.1 christos - It's guaranteed to evaluate its argument exactly once. 82 1.1 christos - It's typically faster. 83 1.1 christos Posix 1003.2-1992 section 2.5.2.1 page 50 lines 1556-1558 says that 84 1.1 christos only '0' through '9' are digits. Prefer ISASCIIDIGIT to ISDIGIT unless 85 1.1 christos it's important to use the locale's definition of `digit' even when the 86 1.1 christos host does not conform to Posix. */ 87 1.1 christos #define ISASCIIDIGIT(c) ((unsigned) (c) - '0' <= 9) 88 1.1 christos 89 1.1 christos #include "regex.h" 90 1.1 christos #include "dfa.h" 91 1.1 christos #include "hard-locale.h" 92 1.1 christos #include "gettext.h" 93 1.1 christos #define _(str) gettext (str) 94 1.1 christos 95 1.1 christos /* HPUX, define those as macros in sys/param.h */ 96 1.1 christos #ifdef setbit 97 1.1 christos # undef setbit 98 1.1 christos #endif 99 1.1 christos #ifdef clrbit 100 1.1 christos # undef clrbit 101 1.1 christos #endif 102 1.1 christos 103 1.1 christos static void dfamust (struct dfa *dfa); 104 1.1 christos static void regexp (int toplevel); 105 1.1 christos 106 1.1 christos static ptr_t 107 1.1 christos xcalloc (size_t n, size_t s) 108 1.1 christos { 109 1.1 christos ptr_t r = calloc(n, s); 110 1.1 christos 111 1.1 christos if (!r) 112 1.1 christos dfaerror(_("Memory exhausted")); 113 1.1 christos return r; 114 1.1 christos } 115 1.1 christos 116 1.1 christos static ptr_t 117 1.1 christos xmalloc (size_t n) 118 1.1 christos { 119 1.1 christos ptr_t r = malloc(n); 120 1.1 christos 121 1.1 christos assert(n != 0); 122 1.1 christos if (!r) 123 1.1 christos dfaerror(_("Memory exhausted")); 124 1.1 christos return r; 125 1.1 christos } 126 1.1 christos 127 1.1 christos static ptr_t 128 1.1 christos xrealloc (ptr_t p, size_t n) 129 1.1 christos { 130 1.1 christos ptr_t r = realloc(p, n); 131 1.1 christos 132 1.1 christos assert(n != 0); 133 1.1 christos if (!r) 134 1.1 christos dfaerror(_("Memory exhausted")); 135 1.1 christos return r; 136 1.1 christos } 137 1.1 christos 138 1.1 christos #define CALLOC(p, t, n) ((p) = (t *) xcalloc((size_t)(n), sizeof (t))) 139 1.1 christos #define MALLOC(p, t, n) ((p) = (t *) xmalloc((n) * sizeof (t))) 140 1.1 christos #define REALLOC(p, t, n) ((p) = (t *) xrealloc((ptr_t) (p), (n) * sizeof (t))) 141 1.1 christos 142 1.1 christos /* Reallocate an array of type t if nalloc is too small for index. */ 143 1.1 christos #define REALLOC_IF_NECESSARY(p, t, nalloc, index) \ 144 1.1 christos if ((index) >= (nalloc)) \ 145 1.1 christos { \ 146 1.1 christos do \ 147 1.1 christos (nalloc) *= 2; \ 148 1.1 christos while ((index) >= (nalloc)); \ 149 1.1 christos REALLOC(p, t, nalloc); \ 150 1.1 christos } 151 1.1 christos 152 1.1 christos #ifdef DEBUG 153 1.1 christos 154 1.1 christos static void 155 1.1 christos prtok (token t) 156 1.1 christos { 157 1.1 christos char const *s; 158 1.1 christos 159 1.1 christos if (t < 0) 160 1.1 christos fprintf(stderr, "END"); 161 1.1 christos else if (t < NOTCHAR) 162 1.1 christos fprintf(stderr, "%c", t); 163 1.1 christos else 164 1.1 christos { 165 1.1 christos switch (t) 166 1.1 christos { 167 1.1 christos case EMPTY: s = "EMPTY"; break; 168 1.1 christos case BACKREF: s = "BACKREF"; break; 169 1.1 christos case BEGLINE: s = "BEGLINE"; break; 170 1.1 christos case ENDLINE: s = "ENDLINE"; break; 171 1.1 christos case BEGWORD: s = "BEGWORD"; break; 172 1.1 christos case ENDWORD: s = "ENDWORD"; break; 173 1.1 christos case LIMWORD: s = "LIMWORD"; break; 174 1.1 christos case NOTLIMWORD: s = "NOTLIMWORD"; break; 175 1.1 christos case QMARK: s = "QMARK"; break; 176 1.1 christos case STAR: s = "STAR"; break; 177 1.1 christos case PLUS: s = "PLUS"; break; 178 1.1 christos case CAT: s = "CAT"; break; 179 1.1 christos case OR: s = "OR"; break; 180 1.1 christos case ORTOP: s = "ORTOP"; break; 181 1.1 christos case LPAREN: s = "LPAREN"; break; 182 1.1 christos case RPAREN: s = "RPAREN"; break; 183 1.1 christos case CRANGE: s = "CRANGE"; break; 184 1.1 christos #ifdef MBS_SUPPORT 185 1.1 christos case ANYCHAR: s = "ANYCHAR"; break; 186 1.1 christos case MBCSET: s = "MBCSET"; break; 187 1.1 christos #endif /* MBS_SUPPORT */ 188 1.1 christos default: s = "CSET"; break; 189 1.1 christos } 190 1.1 christos fprintf(stderr, "%s", s); 191 1.1 christos } 192 1.1 christos } 193 1.1 christos #endif /* DEBUG */ 194 1.1 christos 195 1.1 christos /* Stuff pertaining to charclasses. */ 196 1.1 christos 197 1.1 christos static int 198 1.1 christos tstbit (unsigned b, charclass c) 199 1.1 christos { 200 1.1 christos return c[b / INTBITS] & 1 << b % INTBITS; 201 1.1 christos } 202 1.1 christos 203 1.1 christos static void 204 1.1 christos setbit (unsigned b, charclass c) 205 1.1 christos { 206 1.1 christos c[b / INTBITS] |= 1 << b % INTBITS; 207 1.1 christos } 208 1.1 christos 209 1.1 christos static void 210 1.1 christos clrbit (unsigned b, charclass c) 211 1.1 christos { 212 1.1 christos c[b / INTBITS] &= ~(1 << b % INTBITS); 213 1.1 christos } 214 1.1 christos 215 1.1 christos static void 216 1.1 christos copyset (charclass src, charclass dst) 217 1.1 christos { 218 1.1 christos memcpy (dst, src, sizeof (charclass)); 219 1.1 christos } 220 1.1 christos 221 1.1 christos static void 222 1.1 christos zeroset (charclass s) 223 1.1 christos { 224 1.1 christos memset (s, 0, sizeof (charclass)); 225 1.1 christos } 226 1.1 christos 227 1.1 christos static void 228 1.1 christos notset (charclass s) 229 1.1 christos { 230 1.1 christos int i; 231 1.1 christos 232 1.1 christos for (i = 0; i < CHARCLASS_INTS; ++i) 233 1.1 christos s[i] = ~s[i]; 234 1.1 christos } 235 1.1 christos 236 1.1 christos static int 237 1.1 christos equal (charclass s1, charclass s2) 238 1.1 christos { 239 1.1 christos return memcmp (s1, s2, sizeof (charclass)) == 0; 240 1.1 christos } 241 1.1 christos 242 1.1 christos /* A pointer to the current dfa is kept here during parsing. */ 243 1.1 christos static struct dfa *dfa; 244 1.1 christos 245 1.1 christos /* Find the index of charclass s in dfa->charclasses, or allocate a new charclass. */ 246 1.1 christos static int 247 1.1 christos charclass_index (charclass s) 248 1.1 christos { 249 1.1 christos int i; 250 1.1 christos 251 1.1 christos for (i = 0; i < dfa->cindex; ++i) 252 1.1 christos if (equal(s, dfa->charclasses[i])) 253 1.1 christos return i; 254 1.1 christos REALLOC_IF_NECESSARY(dfa->charclasses, charclass, dfa->calloc, dfa->cindex); 255 1.1 christos ++dfa->cindex; 256 1.1 christos copyset(s, dfa->charclasses[i]); 257 1.1 christos return i; 258 1.1 christos } 259 1.1 christos 260 1.1 christos /* Syntax bits controlling the behavior of the lexical analyzer. */ 261 1.1 christos static reg_syntax_t syntax_bits, syntax_bits_set; 262 1.1 christos 263 1.1 christos /* Flag for case-folding letters into sets. */ 264 1.1 christos static int case_fold; 265 1.1 christos 266 1.1 christos /* End-of-line byte in data. */ 267 1.1 christos static unsigned char eolbyte; 268 1.1 christos 269 1.1 christos /* Entry point to set syntax options. */ 270 1.1 christos void 271 1.1 christos dfasyntax (reg_syntax_t bits, int fold, unsigned char eol) 272 1.1 christos { 273 1.1 christos syntax_bits_set = 1; 274 1.1 christos syntax_bits = bits; 275 1.1 christos case_fold = fold; 276 1.1 christos eolbyte = eol; 277 1.1 christos } 278 1.1 christos 279 1.1 christos /* Like setbit, but if case is folded, set both cases of a letter. */ 280 1.1 christos static void 281 1.1 christos setbit_case_fold (unsigned b, charclass c) 282 1.1 christos { 283 1.1 christos setbit (b, c); 284 1.1 christos if (case_fold) 285 1.1 christos { 286 1.1 christos if (ISUPPER (b)) 287 1.1 christos setbit (tolower (b), c); 288 1.1 christos else if (ISLOWER (b)) 289 1.1 christos setbit (toupper (b), c); 290 1.1 christos } 291 1.1 christos } 292 1.1 christos 293 1.1 christos /* Lexical analyzer. All the dross that deals with the obnoxious 294 1.1 christos GNU Regex syntax bits is located here. The poor, suffering 295 1.1 christos reader is referred to the GNU Regex documentation for the 296 1.1 christos meaning of the @#%!@#%^!@ syntax bits. */ 297 1.1 christos 298 1.1 christos static char const *lexstart; /* Pointer to beginning of input string. */ 299 1.1 christos static char const *lexptr; /* Pointer to next input character. */ 300 1.1 christos static int lexleft; /* Number of characters remaining. */ 301 1.1 christos static token lasttok; /* Previous token returned; initially END. */ 302 1.1 christos static int laststart; /* True if we're separated from beginning or (, | 303 1.1 christos only by zero-width characters. */ 304 1.1 christos static int parens; /* Count of outstanding left parens. */ 305 1.1 christos static int minrep, maxrep; /* Repeat counts for {m,n}. */ 306 1.1 christos static int hard_LC_COLLATE; /* Nonzero if LC_COLLATE is hard. */ 307 1.1 christos 308 1.1 christos #ifdef MBS_SUPPORT 309 1.1 christos /* These variables are used only if (MB_CUR_MAX > 1). */ 310 1.1 christos static mbstate_t mbs; /* Mbstate for mbrlen(). */ 311 1.1 christos static int cur_mb_len; /* Byte length of the current scanning 312 1.1 christos multibyte character. */ 313 1.1 christos static int cur_mb_index; /* Byte index of the current scanning multibyte 314 1.1 christos character. 315 1.1 christos 316 1.1 christos singlebyte character : cur_mb_index = 0 317 1.1 christos multibyte character 318 1.1 christos 1st byte : cur_mb_index = 1 319 1.1 christos 2nd byte : cur_mb_index = 2 320 1.1 christos ... 321 1.1 christos nth byte : cur_mb_index = n */ 322 1.1 christos static unsigned char *mblen_buf;/* Correspond to the input buffer in dfaexec(). 323 1.1 christos Each element store the amount of remain 324 1.1 christos byte of corresponding multibyte character 325 1.1 christos in the input string. A element's value 326 1.1 christos is 0 if corresponding character is a 327 1.1 christos singlebyte chracter. 328 1.1 christos e.g. input : 'a', <mb(0)>, <mb(1)>, <mb(2)> 329 1.1 christos mblen_buf : 0, 3, 2, 1 330 1.1 christos */ 331 1.1 christos static wchar_t *inputwcs; /* Wide character representation of input 332 1.1 christos string in dfaexec(). 333 1.1 christos The length of this array is same as 334 1.1 christos the length of input string(char array). 335 1.1 christos inputstring[i] is a single-byte char, 336 1.1 christos or 1st byte of a multibyte char. 337 1.1 christos And inputwcs[i] is the codepoint. */ 338 1.1 christos static unsigned char const *buf_begin;/* refference to begin in dfaexec(). */ 339 1.1 christos static unsigned char const *buf_end; /* refference to end in dfaexec(). */ 340 1.1 christos #endif /* MBS_SUPPORT */ 341 1.1 christos 342 1.1 christos #ifdef MBS_SUPPORT 343 1.1 christos /* This function update cur_mb_len, and cur_mb_index. 344 1.1 christos p points current lexptr, len is the remaining buffer length. */ 345 1.1 christos static void 346 1.1 christos update_mb_len_index (char const *p, int len) 347 1.1 christos { 348 1.1 christos /* If last character is a part of a multibyte character, 349 1.1 christos we update cur_mb_index. */ 350 1.1 christos if (cur_mb_index) 351 1.1 christos cur_mb_index = (cur_mb_index >= cur_mb_len)? 0 352 1.1 christos : cur_mb_index + 1; 353 1.1 christos 354 1.1 christos /* If last character is a single byte character, or the 355 1.1 christos last portion of a multibyte character, we check whether 356 1.1 christos next character is a multibyte character or not. */ 357 1.1 christos if (! cur_mb_index) 358 1.1 christos { 359 1.1 christos cur_mb_len = mbrlen(p, len, &mbs); 360 1.1 christos if (cur_mb_len > 1) 361 1.1 christos /* It is a multibyte character. 362 1.1 christos cur_mb_len was already set by mbrlen(). */ 363 1.1 christos cur_mb_index = 1; 364 1.1 christos else if (cur_mb_len < 1) 365 1.1 christos /* Invalid sequence. We treat it as a singlebyte character. 366 1.1 christos cur_mb_index is aleady 0. */ 367 1.1 christos cur_mb_len = 1; 368 1.1 christos /* Otherwise, cur_mb_len == 1, it is a singlebyte character. 369 1.1 christos cur_mb_index is aleady 0. */ 370 1.1 christos } 371 1.1 christos } 372 1.1 christos #endif /* MBS_SUPPORT */ 373 1.1 christos 374 1.1 christos #ifdef MBS_SUPPORT 375 1.1 christos /* Note that characters become unsigned here. */ 376 1.1 christos # define FETCH(c, eoferr) \ 377 1.1 christos { \ 378 1.1 christos if (! lexleft) \ 379 1.1 christos { \ 380 1.1 christos if (eoferr != 0) \ 381 1.1 christos dfaerror (eoferr); \ 382 1.1 christos else \ 383 1.1 christos return lasttok = END; \ 384 1.1 christos } \ 385 1.1 christos if (MB_CUR_MAX > 1) \ 386 1.1 christos update_mb_len_index(lexptr, lexleft); \ 387 1.1 christos (c) = (unsigned char) *lexptr++; \ 388 1.1 christos --lexleft; \ 389 1.1 christos } 390 1.1 christos 391 1.1 christos /* This function fetch a wide character, and update cur_mb_len, 392 1.1 christos used only if the current locale is a multibyte environment. */ 393 1.1 christos static wchar_t 394 1.1 christos fetch_wc (char const *eoferr) 395 1.1 christos { 396 1.1 christos wchar_t wc; 397 1.1 christos if (! lexleft) 398 1.1 christos { 399 1.1 christos if (eoferr != 0) 400 1.1 christos dfaerror (eoferr); 401 1.1 christos else 402 1.1 christos return -1; 403 1.1 christos } 404 1.1 christos 405 1.1 christos cur_mb_len = mbrtowc(&wc, lexptr, lexleft, &mbs); 406 1.1 christos if (cur_mb_len <= 0) 407 1.1 christos { 408 1.1 christos cur_mb_len = 1; 409 1.1 christos wc = *lexptr; 410 1.1 christos } 411 1.1 christos lexptr += cur_mb_len; 412 1.1 christos lexleft -= cur_mb_len; 413 1.1 christos return wc; 414 1.1 christos } 415 1.1 christos #else 416 1.1 christos /* Note that characters become unsigned here. */ 417 1.1 christos # define FETCH(c, eoferr) \ 418 1.1 christos { \ 419 1.1 christos if (! lexleft) \ 420 1.1 christos { \ 421 1.1 christos if (eoferr != 0) \ 422 1.1 christos dfaerror (eoferr); \ 423 1.1 christos else \ 424 1.1 christos return lasttok = END; \ 425 1.1 christos } \ 426 1.1 christos (c) = (unsigned char) *lexptr++; \ 427 1.1 christos --lexleft; \ 428 1.1 christos } 429 1.1 christos #endif /* MBS_SUPPORT */ 430 1.1 christos 431 1.1 christos #ifdef MBS_SUPPORT 432 1.1 christos /* Multibyte character handling sub-routin for lex. 433 1.1 christos This function parse a bracket expression and build a struct 434 1.1 christos mb_char_classes. */ 435 1.1 christos static void 436 1.1 christos parse_bracket_exp_mb () 437 1.1 christos { 438 1.1 christos wchar_t wc, wc1, wc2; 439 1.1 christos 440 1.1 christos /* Work area to build a mb_char_classes. */ 441 1.1 christos struct mb_char_classes *work_mbc; 442 1.1 christos int chars_al, range_sts_al, range_ends_al, ch_classes_al, 443 1.1 christos equivs_al, coll_elems_al; 444 1.1 christos 445 1.1 christos REALLOC_IF_NECESSARY(dfa->mbcsets, struct mb_char_classes, 446 1.1 christos dfa->mbcsets_alloc, dfa->nmbcsets + 1); 447 1.1 christos /* dfa->multibyte_prop[] hold the index of dfa->mbcsets. 448 1.1 christos We will update dfa->multibyte_prop in addtok(), because we can't 449 1.1 christos decide the index in dfa->tokens[]. */ 450 1.1 christos 451 1.1 christos /* Initialize work are */ 452 1.1 christos work_mbc = &(dfa->mbcsets[dfa->nmbcsets++]); 453 1.1 christos 454 1.1 christos chars_al = 1; 455 1.1 christos range_sts_al = range_ends_al = 0; 456 1.1 christos ch_classes_al = equivs_al = coll_elems_al = 0; 457 1.1 christos MALLOC(work_mbc->chars, wchar_t, chars_al); 458 1.1 christos 459 1.1 christos work_mbc->nchars = work_mbc->nranges = work_mbc->nch_classes = 0; 460 1.1 christos work_mbc->nequivs = work_mbc->ncoll_elems = 0; 461 1.1 christos work_mbc->chars = NULL; 462 1.1 christos work_mbc->ch_classes = NULL; 463 1.1 christos work_mbc->range_sts = work_mbc->range_ends = NULL; 464 1.1 christos work_mbc->equivs = work_mbc->coll_elems = NULL; 465 1.1 christos 466 1.1 christos wc = fetch_wc(_("Unbalanced [")); 467 1.1 christos if (wc == L'^') 468 1.1 christos { 469 1.1 christos wc = fetch_wc(_("Unbalanced [")); 470 1.1 christos work_mbc->invert = 1; 471 1.1 christos } 472 1.1 christos else 473 1.1 christos work_mbc->invert = 0; 474 1.1 christos do 475 1.1 christos { 476 1.1 christos wc1 = -1; /* mark wc1 is not initialized". */ 477 1.1 christos 478 1.1 christos /* Note that if we're looking at some other [:...:] construct, 479 1.1 christos we just treat it as a bunch of ordinary characters. We can do 480 1.1 christos this because we assume regex has checked for syntax errors before 481 1.1 christos dfa is ever called. */ 482 1.1 christos if (wc == L'[' && (syntax_bits & RE_CHAR_CLASSES)) 483 1.1 christos { 484 1.1 christos #define BRACKET_BUFFER_SIZE 128 485 1.1 christos char str[BRACKET_BUFFER_SIZE]; 486 1.1 christos wc1 = wc; 487 1.1 christos wc = fetch_wc(_("Unbalanced [")); 488 1.1 christos 489 1.1 christos /* If pattern contains `[[:', `[[.', or `[[='. */ 490 1.1 christos if (cur_mb_len == 1 && (wc == L':' || wc == L'.' || wc == L'=')) 491 1.1 christos { 492 1.1 christos unsigned char c; 493 1.1 christos unsigned char delim = (unsigned char)wc; 494 1.1 christos int len = 0; 495 1.1 christos for (;;) 496 1.1 christos { 497 1.1 christos if (! lexleft) 498 1.1 christos dfaerror (_("Unbalanced [")); 499 1.1 christos c = (unsigned char) *lexptr++; 500 1.1 christos --lexleft; 501 1.1 christos 502 1.1 christos if ((c == delim && *lexptr == ']') || lexleft == 0) 503 1.1 christos break; 504 1.1 christos if (len < BRACKET_BUFFER_SIZE) 505 1.1 christos str[len++] = c; 506 1.1 christos else 507 1.1 christos /* This is in any case an invalid class name. */ 508 1.1 christos str[0] = '\0'; 509 1.1 christos } 510 1.1 christos str[len] = '\0'; 511 1.1 christos 512 1.1 christos if (lexleft == 0) 513 1.1 christos { 514 1.1 christos REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al, 515 1.1 christos work_mbc->nchars + 2); 516 1.1 christos work_mbc->chars[work_mbc->nchars++] = L'['; 517 1.1 christos work_mbc->chars[work_mbc->nchars++] = delim; 518 1.1 christos break; 519 1.1 christos } 520 1.1 christos 521 1.1 christos if (--lexleft, *lexptr++ != ']') 522 1.1 christos dfaerror (_("Unbalanced [")); 523 1.1 christos if (delim == ':') 524 1.1 christos /* build character class. */ 525 1.1 christos { 526 1.1 christos wctype_t wt; 527 1.1 christos /* Query the character class as wctype_t. */ 528 1.1 christos wt = wctype (str); 529 1.1 christos 530 1.1 christos if (ch_classes_al == 0) 531 1.1 christos MALLOC(work_mbc->ch_classes, wctype_t, ++ch_classes_al); 532 1.1 christos REALLOC_IF_NECESSARY(work_mbc->ch_classes, wctype_t, 533 1.1 christos ch_classes_al, 534 1.1 christos work_mbc->nch_classes + 1); 535 1.1 christos work_mbc->ch_classes[work_mbc->nch_classes++] = wt; 536 1.1 christos 537 1.1 christos } 538 1.1 christos else if (delim == '=' || delim == '.') 539 1.1 christos { 540 1.1 christos char *elem; 541 1.1 christos MALLOC(elem, char, len + 1); 542 1.1 christos strncpy(elem, str, len + 1); 543 1.1 christos 544 1.1 christos if (delim == '=') 545 1.1 christos /* build equivalent class. */ 546 1.1 christos { 547 1.1 christos if (equivs_al == 0) 548 1.1 christos MALLOC(work_mbc->equivs, char*, ++equivs_al); 549 1.1 christos REALLOC_IF_NECESSARY(work_mbc->equivs, char*, 550 1.1 christos equivs_al, 551 1.1 christos work_mbc->nequivs + 1); 552 1.1 christos work_mbc->equivs[work_mbc->nequivs++] = elem; 553 1.1 christos } 554 1.1 christos 555 1.1 christos if (delim == '.') 556 1.1 christos /* build collating element. */ 557 1.1 christos { 558 1.1 christos if (coll_elems_al == 0) 559 1.1 christos MALLOC(work_mbc->coll_elems, char*, ++coll_elems_al); 560 1.1 christos REALLOC_IF_NECESSARY(work_mbc->coll_elems, char*, 561 1.1 christos coll_elems_al, 562 1.1 christos work_mbc->ncoll_elems + 1); 563 1.1 christos work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem; 564 1.1 christos } 565 1.1 christos } 566 1.1 christos wc = -1; 567 1.1 christos } 568 1.1 christos else 569 1.1 christos /* We treat '[' as a normal character here. */ 570 1.1 christos { 571 1.1 christos wc2 = wc1; wc1 = wc; wc = wc2; /* swap */ 572 1.1 christos } 573 1.1 christos } 574 1.1 christos else 575 1.1 christos { 576 1.1 christos if (wc == L'\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) 577 1.1 christos wc = fetch_wc(("Unbalanced [")); 578 1.1 christos } 579 1.1 christos 580 1.1 christos if (wc1 == -1) 581 1.1 christos wc1 = fetch_wc(_("Unbalanced [")); 582 1.1 christos 583 1.1 christos if (wc1 == L'-') 584 1.1 christos /* build range characters. */ 585 1.1 christos { 586 1.1 christos wc2 = fetch_wc(_("Unbalanced [")); 587 1.1 christos if (wc2 == L']') 588 1.1 christos { 589 1.1 christos /* In the case [x-], the - is an ordinary hyphen, 590 1.1 christos which is left in c1, the lookahead character. */ 591 1.1 christos lexptr -= cur_mb_len; 592 1.1 christos lexleft += cur_mb_len; 593 1.1 christos wc2 = wc; 594 1.1 christos } 595 1.1 christos else 596 1.1 christos { 597 1.1 christos if (wc2 == L'\\' 598 1.1 christos && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) 599 1.1 christos wc2 = fetch_wc(_("Unbalanced [")); 600 1.1 christos wc1 = fetch_wc(_("Unbalanced [")); 601 1.1 christos } 602 1.1 christos 603 1.1 christos if (range_sts_al == 0) 604 1.1 christos { 605 1.1 christos MALLOC(work_mbc->range_sts, wchar_t, ++range_sts_al); 606 1.1 christos MALLOC(work_mbc->range_ends, wchar_t, ++range_ends_al); 607 1.1 christos } 608 1.1 christos REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t, 609 1.1 christos range_sts_al, work_mbc->nranges + 1); 610 1.1 christos work_mbc->range_sts[work_mbc->nranges] = wc; 611 1.1 christos REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t, 612 1.1 christos range_ends_al, work_mbc->nranges + 1); 613 1.1 christos work_mbc->range_ends[work_mbc->nranges++] = wc2; 614 1.1 christos } 615 1.1 christos else if (wc != -1) 616 1.1 christos /* build normal characters. */ 617 1.1 christos { 618 1.1 christos REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al, 619 1.1 christos work_mbc->nchars + 1); 620 1.1 christos work_mbc->chars[work_mbc->nchars++] = wc; 621 1.1 christos } 622 1.1 christos } 623 1.1 christos while ((wc = wc1) != L']'); 624 1.1 christos } 625 1.1 christos #endif /* MBS_SUPPORT */ 626 1.1 christos 627 1.1 christos #ifdef __STDC__ 628 1.1 christos #define FUNC(F, P) static int F(int c) { return P(c); } 629 1.1 christos #else 630 1.1 christos #define FUNC(F, P) static int F(c) int c; { return P(c); } 631 1.1 christos #endif 632 1.1 christos 633 1.1 christos FUNC(is_alpha, ISALPHA) 634 1.1 christos FUNC(is_upper, ISUPPER) 635 1.1 christos FUNC(is_lower, ISLOWER) 636 1.1 christos FUNC(is_digit, ISDIGIT) 637 1.1 christos FUNC(is_xdigit, ISXDIGIT) 638 1.1 christos FUNC(is_space, ISSPACE) 639 1.1 christos FUNC(is_punct, ISPUNCT) 640 1.1 christos FUNC(is_alnum, ISALNUM) 641 1.1 christos FUNC(is_print, ISPRINT) 642 1.1 christos FUNC(is_graph, ISGRAPH) 643 1.1 christos FUNC(is_cntrl, ISCNTRL) 644 1.1 christos 645 1.1 christos static int 646 1.1 christos is_blank (int c) 647 1.1 christos { 648 1.1 christos return (c == ' ' || c == '\t'); 649 1.1 christos } 650 1.1 christos 651 1.1 christos /* The following list maps the names of the Posix named character classes 652 1.1 christos to predicate functions that determine whether a given character is in 653 1.1 christos the class. The leading [ has already been eaten by the lexical analyzer. */ 654 1.1 christos static struct { 655 1.1 christos const char *name; 656 1.1 christos int (*pred) (int); 657 1.1 christos } const prednames[] = { 658 1.1 christos { ":alpha:]", is_alpha }, 659 1.1 christos { ":upper:]", is_upper }, 660 1.1 christos { ":lower:]", is_lower }, 661 1.1 christos { ":digit:]", is_digit }, 662 1.1 christos { ":xdigit:]", is_xdigit }, 663 1.1 christos { ":space:]", is_space }, 664 1.1 christos { ":punct:]", is_punct }, 665 1.1 christos { ":alnum:]", is_alnum }, 666 1.1 christos { ":print:]", is_print }, 667 1.1 christos { ":graph:]", is_graph }, 668 1.1 christos { ":cntrl:]", is_cntrl }, 669 1.1 christos { ":blank:]", is_blank }, 670 1.1 christos { 0 } 671 1.1 christos }; 672 1.1 christos 673 1.1 christos /* Return non-zero if C is a `word-constituent' byte; zero otherwise. */ 674 1.1 christos #define IS_WORD_CONSTITUENT(C) (ISALNUM(C) || (C) == '_') 675 1.1 christos 676 1.1 christos static int 677 1.1 christos looking_at (char const *s) 678 1.1 christos { 679 1.1 christos size_t len; 680 1.1 christos 681 1.1 christos len = strlen(s); 682 1.1 christos if (lexleft < len) 683 1.1 christos return 0; 684 1.1 christos return strncmp(s, lexptr, len) == 0; 685 1.1 christos } 686 1.1 christos 687 1.1 christos static token 688 1.1 christos lex (void) 689 1.1 christos { 690 1.1 christos unsigned c, c1, c2; 691 1.1 christos int backslash = 0, invert; 692 1.1 christos charclass ccl; 693 1.1 christos int i; 694 1.1 christos 695 1.1 christos /* Basic plan: We fetch a character. If it's a backslash, 696 1.1 christos we set the backslash flag and go through the loop again. 697 1.1 christos On the plus side, this avoids having a duplicate of the 698 1.1 christos main switch inside the backslash case. On the minus side, 699 1.1 christos it means that just about every case begins with 700 1.1 christos "if (backslash) ...". */ 701 1.1 christos for (i = 0; i < 2; ++i) 702 1.1 christos { 703 1.1 christos FETCH(c, 0); 704 1.1 christos #ifdef MBS_SUPPORT 705 1.1 christos if (MB_CUR_MAX > 1 && cur_mb_index) 706 1.1 christos /* If this is a part of a multi-byte character, we must treat 707 1.1 christos this byte data as a normal character. 708 1.1 christos e.g. In case of SJIS encoding, some character contains '\', 709 1.1 christos but they must not be backslash. */ 710 1.1 christos goto normal_char; 711 1.1 christos #endif /* MBS_SUPPORT */ 712 1.1 christos switch (c) 713 1.1 christos { 714 1.1 christos case '\\': 715 1.1 christos if (backslash) 716 1.1 christos goto normal_char; 717 1.1 christos if (lexleft == 0) 718 1.1 christos dfaerror(_("Unfinished \\ escape")); 719 1.1 christos backslash = 1; 720 1.1 christos break; 721 1.1 christos 722 1.1 christos case '^': 723 1.1 christos if (backslash) 724 1.1 christos goto normal_char; 725 1.1 christos if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS 726 1.1 christos || lasttok == END 727 1.1 christos || lasttok == LPAREN 728 1.1 christos || lasttok == OR) 729 1.1 christos return lasttok = BEGLINE; 730 1.1 christos goto normal_char; 731 1.1 christos 732 1.1 christos case '$': 733 1.1 christos if (backslash) 734 1.1 christos goto normal_char; 735 1.1 christos if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS 736 1.1 christos || lexleft == 0 737 1.1 christos || (syntax_bits & RE_NO_BK_PARENS 738 1.1 christos ? lexleft > 0 && *lexptr == ')' 739 1.1 christos : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == ')') 740 1.1 christos || (syntax_bits & RE_NO_BK_VBAR 741 1.1 christos ? lexleft > 0 && *lexptr == '|' 742 1.1 christos : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == '|') 743 1.1 christos || ((syntax_bits & RE_NEWLINE_ALT) 744 1.1 christos && lexleft > 0 && *lexptr == '\n')) 745 1.1 christos return lasttok = ENDLINE; 746 1.1 christos goto normal_char; 747 1.1 christos 748 1.1 christos case '1': 749 1.1 christos case '2': 750 1.1 christos case '3': 751 1.1 christos case '4': 752 1.1 christos case '5': 753 1.1 christos case '6': 754 1.1 christos case '7': 755 1.1 christos case '8': 756 1.1 christos case '9': 757 1.1 christos if (backslash && !(syntax_bits & RE_NO_BK_REFS)) 758 1.1 christos { 759 1.1 christos laststart = 0; 760 1.1 christos return lasttok = BACKREF; 761 1.1 christos } 762 1.1 christos goto normal_char; 763 1.1 christos 764 1.1 christos case '`': 765 1.1 christos if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) 766 1.1 christos return lasttok = BEGLINE; /* FIXME: should be beginning of string */ 767 1.1 christos goto normal_char; 768 1.1 christos 769 1.1 christos case '\'': 770 1.1 christos if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) 771 1.1 christos return lasttok = ENDLINE; /* FIXME: should be end of string */ 772 1.1 christos goto normal_char; 773 1.1 christos 774 1.1 christos case '<': 775 1.1 christos if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) 776 1.1 christos return lasttok = BEGWORD; 777 1.1 christos goto normal_char; 778 1.1 christos 779 1.1 christos case '>': 780 1.1 christos if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) 781 1.1 christos return lasttok = ENDWORD; 782 1.1 christos goto normal_char; 783 1.1 christos 784 1.1 christos case 'b': 785 1.1 christos if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) 786 1.1 christos return lasttok = LIMWORD; 787 1.1 christos goto normal_char; 788 1.1 christos 789 1.1 christos case 'B': 790 1.1 christos if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) 791 1.1 christos return lasttok = NOTLIMWORD; 792 1.1 christos goto normal_char; 793 1.1 christos 794 1.1 christos case '?': 795 1.1 christos if (syntax_bits & RE_LIMITED_OPS) 796 1.1 christos goto normal_char; 797 1.1 christos if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0)) 798 1.1 christos goto normal_char; 799 1.1 christos if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart) 800 1.1 christos goto normal_char; 801 1.1 christos return lasttok = QMARK; 802 1.1 christos 803 1.1 christos case '*': 804 1.1 christos if (backslash) 805 1.1 christos goto normal_char; 806 1.1 christos if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart) 807 1.1 christos goto normal_char; 808 1.1 christos return lasttok = STAR; 809 1.1 christos 810 1.1 christos case '+': 811 1.1 christos if (syntax_bits & RE_LIMITED_OPS) 812 1.1 christos goto normal_char; 813 1.1 christos if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0)) 814 1.1 christos goto normal_char; 815 1.1 christos if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart) 816 1.1 christos goto normal_char; 817 1.1 christos return lasttok = PLUS; 818 1.1 christos 819 1.1 christos case '{': 820 1.1 christos if (!(syntax_bits & RE_INTERVALS)) 821 1.1 christos goto normal_char; 822 1.1 christos if (backslash != ((syntax_bits & RE_NO_BK_BRACES) == 0)) 823 1.1 christos goto normal_char; 824 1.1 christos if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart) 825 1.1 christos goto normal_char; 826 1.1 christos 827 1.1 christos if (syntax_bits & RE_NO_BK_BRACES) 828 1.1 christos { 829 1.1 christos /* Scan ahead for a valid interval; if it's not valid, 830 1.1 christos treat it as a literal '{'. */ 831 1.1 christos int lo = -1, hi = -1; 832 1.1 christos char const *p = lexptr; 833 1.1 christos char const *lim = p + lexleft; 834 1.1 christos for (; p != lim && ISASCIIDIGIT (*p); p++) 835 1.1 christos lo = (lo < 0 ? 0 : lo * 10) + *p - '0'; 836 1.1 christos if (p != lim && *p == ',') 837 1.1 christos while (++p != lim && ISASCIIDIGIT (*p)) 838 1.1 christos hi = (hi < 0 ? 0 : hi * 10) + *p - '0'; 839 1.1 christos else 840 1.1 christos hi = lo; 841 1.1 christos if (p == lim || *p != '}' 842 1.1 christos || lo < 0 || RE_DUP_MAX < hi || (0 <= hi && hi < lo)) 843 1.1 christos goto normal_char; 844 1.1 christos } 845 1.1 christos 846 1.1 christos minrep = 0; 847 1.1 christos /* Cases: 848 1.1 christos {M} - exact count 849 1.1 christos {M,} - minimum count, maximum is infinity 850 1.1 christos {M,N} - M through N */ 851 1.1 christos FETCH(c, _("unfinished repeat count")); 852 1.1 christos if (ISASCIIDIGIT (c)) 853 1.1 christos { 854 1.1 christos minrep = c - '0'; 855 1.1 christos for (;;) 856 1.1 christos { 857 1.1 christos FETCH(c, _("unfinished repeat count")); 858 1.1 christos if (! ISASCIIDIGIT (c)) 859 1.1 christos break; 860 1.1 christos minrep = 10 * minrep + c - '0'; 861 1.1 christos } 862 1.1 christos } 863 1.1 christos else 864 1.1 christos dfaerror(_("malformed repeat count")); 865 1.1 christos if (c == ',') 866 1.1 christos { 867 1.1 christos FETCH (c, _("unfinished repeat count")); 868 1.1 christos if (! ISASCIIDIGIT (c)) 869 1.1 christos maxrep = -1; 870 1.1 christos else 871 1.1 christos { 872 1.1 christos maxrep = c - '0'; 873 1.1 christos for (;;) 874 1.1 christos { 875 1.1 christos FETCH (c, _("unfinished repeat count")); 876 1.1 christos if (! ISASCIIDIGIT (c)) 877 1.1 christos break; 878 1.1 christos maxrep = 10 * maxrep + c - '0'; 879 1.1 christos } 880 1.1 christos if (0 <= maxrep && maxrep < minrep) 881 1.1 christos dfaerror (_("malformed repeat count")); 882 1.1 christos } 883 1.1 christos } 884 1.1 christos else 885 1.1 christos maxrep = minrep; 886 1.1 christos if (!(syntax_bits & RE_NO_BK_BRACES)) 887 1.1 christos { 888 1.1 christos if (c != '\\') 889 1.1 christos dfaerror(_("malformed repeat count")); 890 1.1 christos FETCH(c, _("unfinished repeat count")); 891 1.1 christos } 892 1.1 christos if (c != '}') 893 1.1 christos dfaerror(_("malformed repeat count")); 894 1.1 christos laststart = 0; 895 1.1 christos return lasttok = REPMN; 896 1.1 christos 897 1.1 christos case '|': 898 1.1 christos if (syntax_bits & RE_LIMITED_OPS) 899 1.1 christos goto normal_char; 900 1.1 christos if (backslash != ((syntax_bits & RE_NO_BK_VBAR) == 0)) 901 1.1 christos goto normal_char; 902 1.1 christos laststart = 1; 903 1.1 christos return lasttok = OR; 904 1.1 christos 905 1.1 christos case '\n': 906 1.1 christos if (syntax_bits & RE_LIMITED_OPS 907 1.1 christos || backslash 908 1.1 christos || !(syntax_bits & RE_NEWLINE_ALT)) 909 1.1 christos goto normal_char; 910 1.1 christos laststart = 1; 911 1.1 christos return lasttok = OR; 912 1.1 christos 913 1.1 christos case '(': 914 1.1 christos if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0)) 915 1.1 christos goto normal_char; 916 1.1 christos ++parens; 917 1.1 christos laststart = 1; 918 1.1 christos return lasttok = LPAREN; 919 1.1 christos 920 1.1 christos case ')': 921 1.1 christos if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0)) 922 1.1 christos goto normal_char; 923 1.1 christos if (parens == 0 && syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD) 924 1.1 christos goto normal_char; 925 1.1 christos --parens; 926 1.1 christos laststart = 0; 927 1.1 christos return lasttok = RPAREN; 928 1.1 christos 929 1.1 christos case '.': 930 1.1 christos if (backslash) 931 1.1 christos goto normal_char; 932 1.1 christos #ifdef MBS_SUPPORT 933 1.1 christos if (MB_CUR_MAX > 1) 934 1.1 christos { 935 1.1 christos /* In multibyte environment period must match with a single 936 1.1 christos character not a byte. So we use ANYCHAR. */ 937 1.1 christos laststart = 0; 938 1.1 christos return lasttok = ANYCHAR; 939 1.1 christos } 940 1.1 christos #endif /* MBS_SUPPORT */ 941 1.1 christos zeroset(ccl); 942 1.1 christos notset(ccl); 943 1.1 christos if (!(syntax_bits & RE_DOT_NEWLINE)) 944 1.1 christos clrbit(eolbyte, ccl); 945 1.1 christos if (syntax_bits & RE_DOT_NOT_NULL) 946 1.1 christos clrbit('\0', ccl); 947 1.1 christos laststart = 0; 948 1.1 christos return lasttok = CSET + charclass_index(ccl); 949 1.1 christos 950 1.1 christos case 'w': 951 1.1 christos case 'W': 952 1.1 christos if (!backslash || (syntax_bits & RE_NO_GNU_OPS)) 953 1.1 christos goto normal_char; 954 1.1 christos zeroset(ccl); 955 1.1 christos for (c2 = 0; c2 < NOTCHAR; ++c2) 956 1.1 christos if (IS_WORD_CONSTITUENT(c2)) 957 1.1 christos setbit(c2, ccl); 958 1.1 christos if (c == 'W') 959 1.1 christos notset(ccl); 960 1.1 christos laststart = 0; 961 1.1 christos return lasttok = CSET + charclass_index(ccl); 962 1.1 christos 963 1.1 christos case '[': 964 1.1 christos if (backslash) 965 1.1 christos goto normal_char; 966 1.1 christos laststart = 0; 967 1.1 christos #ifdef MBS_SUPPORT 968 1.1 christos if (MB_CUR_MAX > 1) 969 1.1 christos { 970 1.1 christos /* In multibyte environment a bracket expression may contain 971 1.1 christos multibyte characters, which must be treated as characters 972 1.1 christos (not bytes). So we parse it by parse_bracket_exp_mb(). */ 973 1.1 christos parse_bracket_exp_mb(); 974 1.1 christos return lasttok = MBCSET; 975 1.1 christos } 976 1.1 christos #endif 977 1.1 christos zeroset(ccl); 978 1.1 christos FETCH(c, _("Unbalanced [")); 979 1.1 christos if (c == '^') 980 1.1 christos { 981 1.1 christos FETCH(c, _("Unbalanced [")); 982 1.1 christos invert = 1; 983 1.1 christos } 984 1.1 christos else 985 1.1 christos invert = 0; 986 1.1 christos do 987 1.1 christos { 988 1.1 christos /* Nobody ever said this had to be fast. :-) 989 1.1 christos Note that if we're looking at some other [:...:] 990 1.1 christos construct, we just treat it as a bunch of ordinary 991 1.1 christos characters. We can do this because we assume 992 1.1 christos regex has checked for syntax errors before 993 1.1 christos dfa is ever called. */ 994 1.1 christos if (c == '[' && (syntax_bits & RE_CHAR_CLASSES)) 995 1.1 christos for (c1 = 0; prednames[c1].name; ++c1) 996 1.1 christos if (looking_at(prednames[c1].name)) 997 1.1 christos { 998 1.1 christos int (*pred) (int) = prednames[c1].pred; 999 1.1 christos 1000 1.1 christos for (c2 = 0; c2 < NOTCHAR; ++c2) 1001 1.1 christos if ((*pred)(c2)) 1002 1.1 christos setbit_case_fold (c2, ccl); 1003 1.1 christos lexptr += strlen(prednames[c1].name); 1004 1.1 christos lexleft -= strlen(prednames[c1].name); 1005 1.1 christos FETCH(c1, _("Unbalanced [")); 1006 1.1 christos goto skip; 1007 1.1 christos } 1008 1.1 christos if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) 1009 1.1 christos FETCH(c, _("Unbalanced [")); 1010 1.1 christos FETCH(c1, _("Unbalanced [")); 1011 1.1 christos if (c1 == '-') 1012 1.1 christos { 1013 1.1 christos FETCH(c2, _("Unbalanced [")); 1014 1.1 christos if (c2 == ']') 1015 1.1 christos { 1016 1.1 christos /* In the case [x-], the - is an ordinary hyphen, 1017 1.1 christos which is left in c1, the lookahead character. */ 1018 1.1 christos --lexptr; 1019 1.1 christos ++lexleft; 1020 1.1 christos } 1021 1.1 christos else 1022 1.1 christos { 1023 1.1 christos if (c2 == '\\' 1024 1.1 christos && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) 1025 1.1 christos FETCH(c2, _("Unbalanced [")); 1026 1.1 christos FETCH(c1, _("Unbalanced [")); 1027 1.1 christos if (!hard_LC_COLLATE) { 1028 1.1 christos for (; c <= c2; c++) 1029 1.1 christos setbit_case_fold (c, ccl); 1030 1.1 christos } else { 1031 1.1 christos /* POSIX locales are painful - leave the decision to libc */ 1032 1.1 christos regex_t re; 1033 1.1 christos char expr[6]; /* = { '[', c, '-', c2, ']', '\0' }; */ 1034 1.1 christos 1035 1.1 christos expr[0] = '['; expr[1] = c; expr[2] = '-'; 1036 1.1 christos expr[3] = c2; expr[4] = ']'; expr[5] = '\0'; 1037 1.1 christos if (regcomp (&re, expr, case_fold ? REG_ICASE : 0) == REG_NOERROR) { 1038 1.1 christos for (c = 0; c < NOTCHAR; ++c) { 1039 1.1 christos regmatch_t mat; 1040 1.1 christos char buf[2]; /* = { c, '\0' }; */ 1041 1.1 christos 1042 1.1 christos buf[0] = c; buf[1] = '\0'; 1043 1.1 christos if (regexec (&re, buf, 1, &mat, 0) == REG_NOERROR 1044 1.1 christos && mat.rm_so == 0 && mat.rm_eo == 1) 1045 1.1 christos setbit_case_fold (c, ccl); 1046 1.1 christos } 1047 1.1 christos regfree (&re); 1048 1.1 christos } 1049 1.1 christos } 1050 1.1 christos continue; 1051 1.1 christos } 1052 1.1 christos } 1053 1.1 christos 1054 1.1 christos setbit_case_fold (c, ccl); 1055 1.1 christos 1056 1.1 christos skip: 1057 1.1 christos ; 1058 1.1 christos } 1059 1.1 christos while ((c = c1) != ']'); 1060 1.1 christos if (invert) 1061 1.1 christos { 1062 1.1 christos notset(ccl); 1063 1.1 christos if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE) 1064 1.1 christos clrbit(eolbyte, ccl); 1065 1.1 christos } 1066 1.1 christos return lasttok = CSET + charclass_index(ccl); 1067 1.1 christos 1068 1.1 christos default: 1069 1.1 christos normal_char: 1070 1.1 christos laststart = 0; 1071 1.1 christos if (case_fold && ISALPHA(c)) 1072 1.1 christos { 1073 1.1 christos zeroset(ccl); 1074 1.1 christos setbit_case_fold (c, ccl); 1075 1.1 christos return lasttok = CSET + charclass_index(ccl); 1076 1.1 christos } 1077 1.1 christos return c; 1078 1.1 christos } 1079 1.1 christos } 1080 1.1 christos 1081 1.1 christos /* The above loop should consume at most a backslash 1082 1.1 christos and some other character. */ 1083 1.1 christos abort(); 1084 1.1 christos return END; /* keeps pedantic compilers happy. */ 1085 1.1 christos } 1086 1.1 christos 1087 1.1 christos /* Recursive descent parser for regular expressions. */ 1088 1.1 christos 1089 1.1 christos static token tok; /* Lookahead token. */ 1090 1.1 christos static int depth; /* Current depth of a hypothetical stack 1091 1.1 christos holding deferred productions. This is 1092 1.1 christos used to determine the depth that will be 1093 1.1 christos required of the real stack later on in 1094 1.1 christos dfaanalyze(). */ 1095 1.1 christos 1096 1.1 christos /* Add the given token to the parse tree, maintaining the depth count and 1097 1.1 christos updating the maximum depth if necessary. */ 1098 1.1 christos static void 1099 1.1 christos addtok (token t) 1100 1.1 christos { 1101 1.1 christos #ifdef MBS_SUPPORT 1102 1.1 christos if (MB_CUR_MAX > 1) 1103 1.1 christos { 1104 1.1 christos REALLOC_IF_NECESSARY(dfa->multibyte_prop, int, dfa->nmultibyte_prop, 1105 1.1 christos dfa->tindex); 1106 1.1 christos /* Set dfa->multibyte_prop. See struct dfa in dfa.h. */ 1107 1.1 christos if (t == MBCSET) 1108 1.1 christos dfa->multibyte_prop[dfa->tindex] = ((dfa->nmbcsets - 1) << 2) + 3; 1109 1.1 christos else if (t < NOTCHAR) 1110 1.1 christos dfa->multibyte_prop[dfa->tindex] 1111 1.1 christos = (cur_mb_len == 1)? 3 /* single-byte char */ 1112 1.1 christos : (((cur_mb_index == 1)? 1 : 0) /* 1st-byte of multibyte char */ 1113 1.1 christos + ((cur_mb_index == cur_mb_len)? 2 : 0)); /* last-byte */ 1114 1.1 christos else 1115 1.1 christos /* It may be unnecesssary, but it is safer to treat other 1116 1.1 christos symbols as singlebyte characters. */ 1117 1.1 christos dfa->multibyte_prop[dfa->tindex] = 3; 1118 1.1 christos } 1119 1.1 christos #endif 1120 1.1 christos 1121 1.1 christos REALLOC_IF_NECESSARY(dfa->tokens, token, dfa->talloc, dfa->tindex); 1122 1.1 christos dfa->tokens[dfa->tindex++] = t; 1123 1.1 christos 1124 1.1 christos switch (t) 1125 1.1 christos { 1126 1.1 christos case QMARK: 1127 1.1 christos case STAR: 1128 1.1 christos case PLUS: 1129 1.1 christos break; 1130 1.1 christos 1131 1.1 christos case CAT: 1132 1.1 christos case OR: 1133 1.1 christos case ORTOP: 1134 1.1 christos --depth; 1135 1.1 christos break; 1136 1.1 christos 1137 1.1 christos default: 1138 1.1 christos ++dfa->nleaves; 1139 1.1 christos case EMPTY: 1140 1.1 christos ++depth; 1141 1.1 christos break; 1142 1.1 christos } 1143 1.1 christos if (depth > dfa->depth) 1144 1.1 christos dfa->depth = depth; 1145 1.1 christos } 1146 1.1 christos 1147 1.1 christos /* The grammar understood by the parser is as follows. 1148 1.1 christos 1149 1.1 christos regexp: 1150 1.1 christos regexp OR branch 1151 1.1 christos branch 1152 1.1 christos 1153 1.1 christos branch: 1154 1.1 christos branch closure 1155 1.1 christos closure 1156 1.1 christos 1157 1.1 christos closure: 1158 1.1 christos closure QMARK 1159 1.1 christos closure STAR 1160 1.1 christos closure PLUS 1161 1.1 christos closure REPMN 1162 1.1 christos atom 1163 1.1 christos 1164 1.1 christos atom: 1165 1.1 christos <normal character> 1166 1.1 christos <multibyte character> 1167 1.1 christos ANYCHAR 1168 1.1 christos MBCSET 1169 1.1 christos CSET 1170 1.1 christos BACKREF 1171 1.1 christos BEGLINE 1172 1.1 christos ENDLINE 1173 1.1 christos BEGWORD 1174 1.1 christos ENDWORD 1175 1.1 christos LIMWORD 1176 1.1 christos NOTLIMWORD 1177 1.1 christos CRANGE 1178 1.1 christos LPAREN regexp RPAREN 1179 1.1 christos <empty> 1180 1.1 christos 1181 1.1 christos The parser builds a parse tree in postfix form in an array of tokens. */ 1182 1.1 christos 1183 1.1 christos static void 1184 1.1 christos atom (void) 1185 1.1 christos { 1186 1.1 christos if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF 1187 1.1 christos || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD 1188 1.1 christos #ifdef MBS_SUPPORT 1189 1.1 christos || tok == ANYCHAR || tok == MBCSET /* MB_CUR_MAX > 1 */ 1190 1.1 christos #endif /* MBS_SUPPORT */ 1191 1.1 christos || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD) 1192 1.1 christos { 1193 1.1 christos addtok(tok); 1194 1.1 christos tok = lex(); 1195 1.1 christos #ifdef MBS_SUPPORT 1196 1.1 christos /* We treat a multibyte character as a single atom, so that DFA 1197 1.1 christos can treat a multibyte character as a single expression. 1198 1.1 christos 1199 1.1 christos e.g. We construct following tree from "<mb1><mb2>". 1200 1.1 christos <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT> 1201 1.1 christos <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> 1202 1.1 christos */ 1203 1.1 christos if (MB_CUR_MAX > 1) 1204 1.1 christos { 1205 1.1 christos while (cur_mb_index > 1 && tok >= 0 && tok < NOTCHAR) 1206 1.1 christos { 1207 1.1 christos addtok(tok); 1208 1.1 christos addtok(CAT); 1209 1.1 christos tok = lex(); 1210 1.1 christos } 1211 1.1 christos } 1212 1.1 christos #endif /* MBS_SUPPORT */ 1213 1.1 christos } 1214 1.1 christos else if (tok == CRANGE) 1215 1.1 christos { 1216 1.1 christos /* A character range like "[a-z]" in a locale other than "C" or 1217 1.1 christos "POSIX". This range might any sequence of one or more 1218 1.1 christos characters. Unfortunately the POSIX locale primitives give 1219 1.1 christos us no practical way to find what character sequences might be 1220 1.1 christos matched. Treat this approximately like "(.\1)" -- i.e. match 1221 1.1 christos one character, and then punt to the full matcher. */ 1222 1.1 christos charclass ccl; 1223 1.1 christos zeroset (ccl); 1224 1.1 christos notset (ccl); 1225 1.1 christos addtok (CSET + charclass_index (ccl)); 1226 1.1 christos addtok (BACKREF); 1227 1.1 christos addtok (CAT); 1228 1.1 christos tok = lex (); 1229 1.1 christos } 1230 1.1 christos else if (tok == LPAREN) 1231 1.1 christos { 1232 1.1 christos tok = lex(); 1233 1.1 christos regexp(0); 1234 1.1 christos if (tok != RPAREN) 1235 1.1 christos dfaerror(_("Unbalanced (")); 1236 1.1 christos tok = lex(); 1237 1.1 christos } 1238 1.1 christos else 1239 1.1 christos addtok(EMPTY); 1240 1.1 christos } 1241 1.1 christos 1242 1.1 christos /* Return the number of tokens in the given subexpression. */ 1243 1.1 christos static int 1244 1.1 christos nsubtoks (int tindex) 1245 1.1 christos { 1246 1.1 christos int ntoks1; 1247 1.1 christos 1248 1.1 christos switch (dfa->tokens[tindex - 1]) 1249 1.1 christos { 1250 1.1 christos default: 1251 1.1 christos return 1; 1252 1.1 christos case QMARK: 1253 1.1 christos case STAR: 1254 1.1 christos case PLUS: 1255 1.1 christos return 1 + nsubtoks(tindex - 1); 1256 1.1 christos case CAT: 1257 1.1 christos case OR: 1258 1.1 christos case ORTOP: 1259 1.1 christos ntoks1 = nsubtoks(tindex - 1); 1260 1.1 christos return 1 + ntoks1 + nsubtoks(tindex - 1 - ntoks1); 1261 1.1 christos } 1262 1.1 christos } 1263 1.1 christos 1264 1.1 christos /* Copy the given subexpression to the top of the tree. */ 1265 1.1 christos static void 1266 1.1 christos copytoks (int tindex, int ntokens) 1267 1.1 christos { 1268 1.1 christos int i; 1269 1.1 christos 1270 1.1 christos for (i = 0; i < ntokens; ++i) 1271 1.1 christos addtok(dfa->tokens[tindex + i]); 1272 1.1 christos } 1273 1.1 christos 1274 1.1 christos static void 1275 1.1 christos closure (void) 1276 1.1 christos { 1277 1.1 christos int tindex, ntokens, i; 1278 1.1 christos 1279 1.1 christos atom(); 1280 1.1 christos while (tok == QMARK || tok == STAR || tok == PLUS || tok == REPMN) 1281 1.1 christos if (tok == REPMN) 1282 1.1 christos { 1283 1.1 christos ntokens = nsubtoks(dfa->tindex); 1284 1.1 christos tindex = dfa->tindex - ntokens; 1285 1.1 christos if (maxrep < 0) 1286 1.1 christos addtok(PLUS); 1287 1.1 christos if (minrep == 0) 1288 1.1 christos addtok(QMARK); 1289 1.1 christos for (i = 1; i < minrep; ++i) 1290 1.1 christos { 1291 1.1 christos copytoks(tindex, ntokens); 1292 1.1 christos addtok(CAT); 1293 1.1 christos } 1294 1.1 christos for (; i < maxrep; ++i) 1295 1.1 christos { 1296 1.1 christos copytoks(tindex, ntokens); 1297 1.1 christos addtok(QMARK); 1298 1.1 christos addtok(CAT); 1299 1.1 christos } 1300 1.1 christos tok = lex(); 1301 1.1 christos } 1302 1.1 christos else 1303 1.1 christos { 1304 1.1 christos addtok(tok); 1305 1.1 christos tok = lex(); 1306 1.1 christos } 1307 1.1 christos } 1308 1.1 christos 1309 1.1 christos static void 1310 1.1 christos branch (void) 1311 1.1 christos { 1312 1.1 christos closure(); 1313 1.1 christos while (tok != RPAREN && tok != OR && tok >= 0) 1314 1.1 christos { 1315 1.1 christos closure(); 1316 1.1 christos addtok(CAT); 1317 1.1 christos } 1318 1.1 christos } 1319 1.1 christos 1320 1.1 christos static void 1321 1.1 christos regexp (int toplevel) 1322 1.1 christos { 1323 1.1 christos branch(); 1324 1.1 christos while (tok == OR) 1325 1.1 christos { 1326 1.1 christos tok = lex(); 1327 1.1 christos branch(); 1328 1.1 christos if (toplevel) 1329 1.1 christos addtok(ORTOP); 1330 1.1 christos else 1331 1.1 christos addtok(OR); 1332 1.1 christos } 1333 1.1 christos } 1334 1.1 christos 1335 1.1 christos /* Main entry point for the parser. S is a string to be parsed, len is the 1336 1.1 christos length of the string, so s can include NUL characters. D is a pointer to 1337 1.1 christos the struct dfa to parse into. */ 1338 1.1 christos void 1339 1.1 christos dfaparse (char const *s, size_t len, struct dfa *d) 1340 1.1 christos { 1341 1.1 christos dfa = d; 1342 1.1 christos lexstart = lexptr = s; 1343 1.1 christos lexleft = len; 1344 1.1 christos lasttok = END; 1345 1.1 christos laststart = 1; 1346 1.1 christos parens = 0; 1347 1.1 christos #if ENABLE_NLS 1348 1.1 christos hard_LC_COLLATE = hard_locale (LC_COLLATE); 1349 1.1 christos #endif 1350 1.1 christos #ifdef MBS_SUPPORT 1351 1.1 christos if (MB_CUR_MAX > 1) 1352 1.1 christos { 1353 1.1 christos cur_mb_index = 0; 1354 1.1 christos cur_mb_len = 0; 1355 1.1 christos memset(&mbs, 0, sizeof(mbstate_t)); 1356 1.1 christos } 1357 1.1 christos #endif /* MBS_SUPPORT */ 1358 1.1 christos 1359 1.1 christos if (! syntax_bits_set) 1360 1.1 christos dfaerror(_("No syntax specified")); 1361 1.1 christos 1362 1.1 christos tok = lex(); 1363 1.1 christos depth = d->depth; 1364 1.1 christos 1365 1.1 christos regexp(1); 1366 1.1 christos 1367 1.1 christos if (tok != END) 1368 1.1 christos dfaerror(_("Unbalanced )")); 1369 1.1 christos 1370 1.1 christos addtok(END - d->nregexps); 1371 1.1 christos addtok(CAT); 1372 1.1 christos 1373 1.1 christos if (d->nregexps) 1374 1.1 christos addtok(ORTOP); 1375 1.1 christos 1376 1.1 christos ++d->nregexps; 1377 1.1 christos } 1378 1.1 christos 1379 1.1 christos /* Some primitives for operating on sets of positions. */ 1380 1.1 christos 1381 1.1 christos /* Copy one set to another; the destination must be large enough. */ 1382 1.1 christos static void 1383 1.1 christos copy (position_set const *src, position_set *dst) 1384 1.1 christos { 1385 1.1 christos int i; 1386 1.1 christos 1387 1.1 christos for (i = 0; i < src->nelem; ++i) 1388 1.1 christos dst->elems[i] = src->elems[i]; 1389 1.1 christos dst->nelem = src->nelem; 1390 1.1 christos } 1391 1.1 christos 1392 1.1 christos /* Insert a position in a set. Position sets are maintained in sorted 1393 1.1 christos order according to index. If position already exists in the set with 1394 1.1 christos the same index then their constraints are logically or'd together. 1395 1.1 christos S->elems must point to an array large enough to hold the resulting set. */ 1396 1.1 christos static void 1397 1.1 christos insert (position p, position_set *s) 1398 1.1 christos { 1399 1.1 christos int i; 1400 1.1 christos position t1, t2; 1401 1.1 christos 1402 1.1 christos for (i = 0; i < s->nelem && p.index < s->elems[i].index; ++i) 1403 1.1 christos continue; 1404 1.1 christos if (i < s->nelem && p.index == s->elems[i].index) 1405 1.1 christos s->elems[i].constraint |= p.constraint; 1406 1.1 christos else 1407 1.1 christos { 1408 1.1 christos t1 = p; 1409 1.1 christos ++s->nelem; 1410 1.1 christos while (i < s->nelem) 1411 1.1 christos { 1412 1.1 christos t2 = s->elems[i]; 1413 1.1 christos s->elems[i++] = t1; 1414 1.1 christos t1 = t2; 1415 1.1 christos } 1416 1.1 christos } 1417 1.1 christos } 1418 1.1 christos 1419 1.1 christos /* Merge two sets of positions into a third. The result is exactly as if 1420 1.1 christos the positions of both sets were inserted into an initially empty set. */ 1421 1.1 christos static void 1422 1.1 christos merge (position_set const *s1, position_set const *s2, position_set *m) 1423 1.1 christos { 1424 1.1 christos int i = 0, j = 0; 1425 1.1 christos 1426 1.1 christos m->nelem = 0; 1427 1.1 christos while (i < s1->nelem && j < s2->nelem) 1428 1.1 christos if (s1->elems[i].index > s2->elems[j].index) 1429 1.1 christos m->elems[m->nelem++] = s1->elems[i++]; 1430 1.1 christos else if (s1->elems[i].index < s2->elems[j].index) 1431 1.1 christos m->elems[m->nelem++] = s2->elems[j++]; 1432 1.1 christos else 1433 1.1 christos { 1434 1.1 christos m->elems[m->nelem] = s1->elems[i++]; 1435 1.1 christos m->elems[m->nelem++].constraint |= s2->elems[j++].constraint; 1436 1.1 christos } 1437 1.1 christos while (i < s1->nelem) 1438 1.1 christos m->elems[m->nelem++] = s1->elems[i++]; 1439 1.1 christos while (j < s2->nelem) 1440 1.1 christos m->elems[m->nelem++] = s2->elems[j++]; 1441 1.1 christos } 1442 1.1 christos 1443 1.1 christos /* Delete a position from a set. */ 1444 1.1 christos static void 1445 1.1 christos delete (position p, position_set *s) 1446 1.1 christos { 1447 1.1 christos int i; 1448 1.1 christos 1449 1.1 christos for (i = 0; i < s->nelem; ++i) 1450 1.1 christos if (p.index == s->elems[i].index) 1451 1.1 christos break; 1452 1.1 christos if (i < s->nelem) 1453 1.1 christos for (--s->nelem; i < s->nelem; ++i) 1454 1.1 christos s->elems[i] = s->elems[i + 1]; 1455 1.1 christos } 1456 1.1 christos 1457 1.1 christos /* Find the index of the state corresponding to the given position set with 1458 1.1 christos the given preceding context, or create a new state if there is no such 1459 1.1 christos state. Newline and letter tell whether we got here on a newline or 1460 1.1 christos letter, respectively. */ 1461 1.1 christos static int 1462 1.1 christos state_index (struct dfa *d, position_set const *s, int newline, int letter) 1463 1.1 christos { 1464 1.1 christos int hash = 0; 1465 1.1 christos int constraint; 1466 1.1 christos int i, j; 1467 1.1 christos 1468 1.1 christos newline = newline ? 1 : 0; 1469 1.1 christos letter = letter ? 1 : 0; 1470 1.1 christos 1471 1.1 christos for (i = 0; i < s->nelem; ++i) 1472 1.1 christos hash ^= s->elems[i].index + s->elems[i].constraint; 1473 1.1 christos 1474 1.1 christos /* Try to find a state that exactly matches the proposed one. */ 1475 1.1 christos for (i = 0; i < d->sindex; ++i) 1476 1.1 christos { 1477 1.1 christos if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem 1478 1.1 christos || newline != d->states[i].newline || letter != d->states[i].letter) 1479 1.1 christos continue; 1480 1.1 christos for (j = 0; j < s->nelem; ++j) 1481 1.1 christos if (s->elems[j].constraint 1482 1.1 christos != d->states[i].elems.elems[j].constraint 1483 1.1 christos || s->elems[j].index != d->states[i].elems.elems[j].index) 1484 1.1 christos break; 1485 1.1 christos if (j == s->nelem) 1486 1.1 christos return i; 1487 1.1 christos } 1488 1.1 christos 1489 1.1 christos /* We'll have to create a new state. */ 1490 1.1 christos REALLOC_IF_NECESSARY(d->states, dfa_state, d->salloc, d->sindex); 1491 1.1 christos d->states[i].hash = hash; 1492 1.1 christos MALLOC(d->states[i].elems.elems, position, s->nelem); 1493 1.1 christos copy(s, &d->states[i].elems); 1494 1.1 christos d->states[i].newline = newline; 1495 1.1 christos d->states[i].letter = letter; 1496 1.1 christos d->states[i].backref = 0; 1497 1.1 christos d->states[i].constraint = 0; 1498 1.1 christos d->states[i].first_end = 0; 1499 1.1 christos #ifdef MBS_SUPPORT 1500 1.1 christos if (MB_CUR_MAX > 1) 1501 1.1 christos d->states[i].mbps.nelem = 0; 1502 1.1 christos #endif 1503 1.1 christos for (j = 0; j < s->nelem; ++j) 1504 1.1 christos if (d->tokens[s->elems[j].index] < 0) 1505 1.1 christos { 1506 1.1 christos constraint = s->elems[j].constraint; 1507 1.1 christos if (SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 0) 1508 1.1 christos || SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 1) 1509 1.1 christos || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 0) 1510 1.1 christos || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 1)) 1511 1.1 christos d->states[i].constraint |= constraint; 1512 1.1 christos if (! d->states[i].first_end) 1513 1.1 christos d->states[i].first_end = d->tokens[s->elems[j].index]; 1514 1.1 christos } 1515 1.1 christos else if (d->tokens[s->elems[j].index] == BACKREF) 1516 1.1 christos { 1517 1.1 christos d->states[i].constraint = NO_CONSTRAINT; 1518 1.1 christos d->states[i].backref = 1; 1519 1.1 christos } 1520 1.1 christos 1521 1.1 christos ++d->sindex; 1522 1.1 christos 1523 1.1 christos return i; 1524 1.1 christos } 1525 1.1 christos 1526 1.1 christos /* Find the epsilon closure of a set of positions. If any position of the set 1527 1.1 christos contains a symbol that matches the empty string in some context, replace 1528 1.1 christos that position with the elements of its follow labeled with an appropriate 1529 1.1 christos constraint. Repeat exhaustively until no funny positions are left. 1530 1.1 christos S->elems must be large enough to hold the result. */ 1531 1.1 christos static void 1532 1.1 christos epsclosure (position_set *s, struct dfa const *d) 1533 1.1 christos { 1534 1.1 christos int i, j; 1535 1.1 christos int *visited; 1536 1.1 christos position p, old; 1537 1.1 christos 1538 1.1 christos MALLOC(visited, int, d->tindex); 1539 1.1 christos for (i = 0; i < d->tindex; ++i) 1540 1.1 christos visited[i] = 0; 1541 1.1 christos 1542 1.1 christos for (i = 0; i < s->nelem; ++i) 1543 1.1 christos if (d->tokens[s->elems[i].index] >= NOTCHAR 1544 1.1 christos && d->tokens[s->elems[i].index] != BACKREF 1545 1.1 christos #ifdef MBS_SUPPORT 1546 1.1 christos && d->tokens[s->elems[i].index] != ANYCHAR 1547 1.1 christos && d->tokens[s->elems[i].index] != MBCSET 1548 1.1 christos #endif 1549 1.1 christos && d->tokens[s->elems[i].index] < CSET) 1550 1.1 christos { 1551 1.1 christos old = s->elems[i]; 1552 1.1 christos p.constraint = old.constraint; 1553 1.1 christos delete(s->elems[i], s); 1554 1.1 christos if (visited[old.index]) 1555 1.1 christos { 1556 1.1 christos --i; 1557 1.1 christos continue; 1558 1.1 christos } 1559 1.1 christos visited[old.index] = 1; 1560 1.1 christos switch (d->tokens[old.index]) 1561 1.1 christos { 1562 1.1 christos case BEGLINE: 1563 1.1 christos p.constraint &= BEGLINE_CONSTRAINT; 1564 1.1 christos break; 1565 1.1 christos case ENDLINE: 1566 1.1 christos p.constraint &= ENDLINE_CONSTRAINT; 1567 1.1 christos break; 1568 1.1 christos case BEGWORD: 1569 1.1 christos p.constraint &= BEGWORD_CONSTRAINT; 1570 1.1 christos break; 1571 1.1 christos case ENDWORD: 1572 1.1 christos p.constraint &= ENDWORD_CONSTRAINT; 1573 1.1 christos break; 1574 1.1 christos case LIMWORD: 1575 1.1 christos p.constraint &= LIMWORD_CONSTRAINT; 1576 1.1 christos break; 1577 1.1 christos case NOTLIMWORD: 1578 1.1 christos p.constraint &= NOTLIMWORD_CONSTRAINT; 1579 1.1 christos break; 1580 1.1 christos default: 1581 1.1 christos break; 1582 1.1 christos } 1583 1.1 christos for (j = 0; j < d->follows[old.index].nelem; ++j) 1584 1.1 christos { 1585 1.1 christos p.index = d->follows[old.index].elems[j].index; 1586 1.1 christos insert(p, s); 1587 1.1 christos } 1588 1.1 christos /* Force rescan to start at the beginning. */ 1589 1.1 christos i = -1; 1590 1.1 christos } 1591 1.1 christos 1592 1.1 christos free(visited); 1593 1.1 christos } 1594 1.1 christos 1595 1.1 christos /* Perform bottom-up analysis on the parse tree, computing various functions. 1596 1.1 christos Note that at this point, we're pretending constructs like \< are real 1597 1.1 christos characters rather than constraints on what can follow them. 1598 1.1 christos 1599 1.1 christos Nullable: A node is nullable if it is at the root of a regexp that can 1600 1.1 christos match the empty string. 1601 1.1 christos * EMPTY leaves are nullable. 1602 1.1 christos * No other leaf is nullable. 1603 1.1 christos * A QMARK or STAR node is nullable. 1604 1.1 christos * A PLUS node is nullable if its argument is nullable. 1605 1.1 christos * A CAT node is nullable if both its arguments are nullable. 1606 1.1 christos * An OR node is nullable if either argument is nullable. 1607 1.1 christos 1608 1.1 christos Firstpos: The firstpos of a node is the set of positions (nonempty leaves) 1609 1.1 christos that could correspond to the first character of a string matching the 1610 1.1 christos regexp rooted at the given node. 1611 1.1 christos * EMPTY leaves have empty firstpos. 1612 1.1 christos * The firstpos of a nonempty leaf is that leaf itself. 1613 1.1 christos * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its 1614 1.1 christos argument. 1615 1.1 christos * The firstpos of a CAT node is the firstpos of the left argument, union 1616 1.1 christos the firstpos of the right if the left argument is nullable. 1617 1.1 christos * The firstpos of an OR node is the union of firstpos of each argument. 1618 1.1 christos 1619 1.1 christos Lastpos: The lastpos of a node is the set of positions that could 1620 1.1 christos correspond to the last character of a string matching the regexp at 1621 1.1 christos the given node. 1622 1.1 christos * EMPTY leaves have empty lastpos. 1623 1.1 christos * The lastpos of a nonempty leaf is that leaf itself. 1624 1.1 christos * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its 1625 1.1 christos argument. 1626 1.1 christos * The lastpos of a CAT node is the lastpos of its right argument, union 1627 1.1 christos the lastpos of the left if the right argument is nullable. 1628 1.1 christos * The lastpos of an OR node is the union of the lastpos of each argument. 1629 1.1 christos 1630 1.1 christos Follow: The follow of a position is the set of positions that could 1631 1.1 christos correspond to the character following a character matching the node in 1632 1.1 christos a string matching the regexp. At this point we consider special symbols 1633 1.1 christos that match the empty string in some context to be just normal characters. 1634 1.1 christos Later, if we find that a special symbol is in a follow set, we will 1635 1.1 christos replace it with the elements of its follow, labeled with an appropriate 1636 1.1 christos constraint. 1637 1.1 christos * Every node in the firstpos of the argument of a STAR or PLUS node is in 1638 1.1 christos the follow of every node in the lastpos. 1639 1.1 christos * Every node in the firstpos of the second argument of a CAT node is in 1640 1.1 christos the follow of every node in the lastpos of the first argument. 1641 1.1 christos 1642 1.1 christos Because of the postfix representation of the parse tree, the depth-first 1643 1.1 christos analysis is conveniently done by a linear scan with the aid of a stack. 1644 1.1 christos Sets are stored as arrays of the elements, obeying a stack-like allocation 1645 1.1 christos scheme; the number of elements in each set deeper in the stack can be 1646 1.1 christos used to determine the address of a particular set's array. */ 1647 1.1 christos void 1648 1.1 christos dfaanalyze (struct dfa *d, int searchflag) 1649 1.1 christos { 1650 1.1 christos int *nullable; /* Nullable stack. */ 1651 1.1 christos int *nfirstpos; /* Element count stack for firstpos sets. */ 1652 1.1 christos position *firstpos; /* Array where firstpos elements are stored. */ 1653 1.1 christos int *nlastpos; /* Element count stack for lastpos sets. */ 1654 1.1 christos position *lastpos; /* Array where lastpos elements are stored. */ 1655 1.1 christos int *nalloc; /* Sizes of arrays allocated to follow sets. */ 1656 1.1 christos position_set tmp; /* Temporary set for merging sets. */ 1657 1.1 christos position_set merged; /* Result of merging sets. */ 1658 1.1 christos int wants_newline; /* True if some position wants newline info. */ 1659 1.1 christos int *o_nullable; 1660 1.1 christos int *o_nfirst, *o_nlast; 1661 1.1 christos position *o_firstpos, *o_lastpos; 1662 1.1 christos int i, j; 1663 1.1 christos position *pos; 1664 1.1 christos 1665 1.1 christos #ifdef DEBUG 1666 1.1 christos fprintf(stderr, "dfaanalyze:\n"); 1667 1.1 christos for (i = 0; i < d->tindex; ++i) 1668 1.1 christos { 1669 1.1 christos fprintf(stderr, " %d:", i); 1670 1.1 christos prtok(d->tokens[i]); 1671 1.1 christos } 1672 1.1 christos putc('\n', stderr); 1673 1.1 christos #endif 1674 1.1 christos 1675 1.1 christos d->searchflag = searchflag; 1676 1.1 christos 1677 1.1 christos MALLOC(nullable, int, d->depth); 1678 1.1 christos o_nullable = nullable; 1679 1.1 christos MALLOC(nfirstpos, int, d->depth); 1680 1.1 christos o_nfirst = nfirstpos; 1681 1.1 christos MALLOC(firstpos, position, d->nleaves); 1682 1.1 christos o_firstpos = firstpos, firstpos += d->nleaves; 1683 1.1 christos MALLOC(nlastpos, int, d->depth); 1684 1.1 christos o_nlast = nlastpos; 1685 1.1 christos MALLOC(lastpos, position, d->nleaves); 1686 1.1 christos o_lastpos = lastpos, lastpos += d->nleaves; 1687 1.1 christos MALLOC(nalloc, int, d->tindex); 1688 1.1 christos for (i = 0; i < d->tindex; ++i) 1689 1.1 christos nalloc[i] = 0; 1690 1.1 christos MALLOC(merged.elems, position, d->nleaves); 1691 1.1 christos 1692 1.1 christos CALLOC(d->follows, position_set, d->tindex); 1693 1.1 christos 1694 1.1 christos for (i = 0; i < d->tindex; ++i) 1695 1.1 christos #ifdef DEBUG 1696 1.1 christos { /* Nonsyntactic #ifdef goo... */ 1697 1.1 christos #endif 1698 1.1 christos switch (d->tokens[i]) 1699 1.1 christos { 1700 1.1 christos case EMPTY: 1701 1.1 christos /* The empty set is nullable. */ 1702 1.1 christos *nullable++ = 1; 1703 1.1 christos 1704 1.1 christos /* The firstpos and lastpos of the empty leaf are both empty. */ 1705 1.1 christos *nfirstpos++ = *nlastpos++ = 0; 1706 1.1 christos break; 1707 1.1 christos 1708 1.1 christos case STAR: 1709 1.1 christos case PLUS: 1710 1.1 christos /* Every element in the firstpos of the argument is in the follow 1711 1.1 christos of every element in the lastpos. */ 1712 1.1 christos tmp.nelem = nfirstpos[-1]; 1713 1.1 christos tmp.elems = firstpos; 1714 1.1 christos pos = lastpos; 1715 1.1 christos for (j = 0; j < nlastpos[-1]; ++j) 1716 1.1 christos { 1717 1.1 christos merge(&tmp, &d->follows[pos[j].index], &merged); 1718 1.1 christos REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position, 1719 1.1 christos nalloc[pos[j].index], merged.nelem - 1); 1720 1.1 christos copy(&merged, &d->follows[pos[j].index]); 1721 1.1 christos } 1722 1.1 christos 1723 1.1 christos case QMARK: 1724 1.1 christos /* A QMARK or STAR node is automatically nullable. */ 1725 1.1 christos if (d->tokens[i] != PLUS) 1726 1.1 christos nullable[-1] = 1; 1727 1.1 christos break; 1728 1.1 christos 1729 1.1 christos case CAT: 1730 1.1 christos /* Every element in the firstpos of the second argument is in the 1731 1.1 christos follow of every element in the lastpos of the first argument. */ 1732 1.1 christos tmp.nelem = nfirstpos[-1]; 1733 1.1 christos tmp.elems = firstpos; 1734 1.1 christos pos = lastpos + nlastpos[-1]; 1735 1.1 christos for (j = 0; j < nlastpos[-2]; ++j) 1736 1.1 christos { 1737 1.1 christos merge(&tmp, &d->follows[pos[j].index], &merged); 1738 1.1 christos REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position, 1739 1.1 christos nalloc[pos[j].index], merged.nelem - 1); 1740 1.1 christos copy(&merged, &d->follows[pos[j].index]); 1741 1.1 christos } 1742 1.1 christos 1743 1.1 christos /* The firstpos of a CAT node is the firstpos of the first argument, 1744 1.1 christos union that of the second argument if the first is nullable. */ 1745 1.1 christos if (nullable[-2]) 1746 1.1 christos nfirstpos[-2] += nfirstpos[-1]; 1747 1.1 christos else 1748 1.1 christos firstpos += nfirstpos[-1]; 1749 1.1 christos --nfirstpos; 1750 1.1 christos 1751 1.1 christos /* The lastpos of a CAT node is the lastpos of the second argument, 1752 1.1 christos union that of the first argument if the second is nullable. */ 1753 1.1 christos if (nullable[-1]) 1754 1.1 christos nlastpos[-2] += nlastpos[-1]; 1755 1.1 christos else 1756 1.1 christos { 1757 1.1 christos pos = lastpos + nlastpos[-2]; 1758 1.1 christos for (j = nlastpos[-1] - 1; j >= 0; --j) 1759 1.1 christos pos[j] = lastpos[j]; 1760 1.1 christos lastpos += nlastpos[-2]; 1761 1.1 christos nlastpos[-2] = nlastpos[-1]; 1762 1.1 christos } 1763 1.1 christos --nlastpos; 1764 1.1 christos 1765 1.1 christos /* A CAT node is nullable if both arguments are nullable. */ 1766 1.1 christos nullable[-2] = nullable[-1] && nullable[-2]; 1767 1.1 christos --nullable; 1768 1.1 christos break; 1769 1.1 christos 1770 1.1 christos case OR: 1771 1.1 christos case ORTOP: 1772 1.1 christos /* The firstpos is the union of the firstpos of each argument. */ 1773 1.1 christos nfirstpos[-2] += nfirstpos[-1]; 1774 1.1 christos --nfirstpos; 1775 1.1 christos 1776 1.1 christos /* The lastpos is the union of the lastpos of each argument. */ 1777 1.1 christos nlastpos[-2] += nlastpos[-1]; 1778 1.1 christos --nlastpos; 1779 1.1 christos 1780 1.1 christos /* An OR node is nullable if either argument is nullable. */ 1781 1.1 christos nullable[-2] = nullable[-1] || nullable[-2]; 1782 1.1 christos --nullable; 1783 1.1 christos break; 1784 1.1 christos 1785 1.1 christos default: 1786 1.1 christos /* Anything else is a nonempty position. (Note that special 1787 1.1 christos constructs like \< are treated as nonempty strings here; 1788 1.1 christos an "epsilon closure" effectively makes them nullable later. 1789 1.1 christos Backreferences have to get a real position so we can detect 1790 1.1 christos transitions on them later. But they are nullable. */ 1791 1.1 christos *nullable++ = d->tokens[i] == BACKREF; 1792 1.1 christos 1793 1.1 christos /* This position is in its own firstpos and lastpos. */ 1794 1.1 christos *nfirstpos++ = *nlastpos++ = 1; 1795 1.1 christos --firstpos, --lastpos; 1796 1.1 christos firstpos->index = lastpos->index = i; 1797 1.1 christos firstpos->constraint = lastpos->constraint = NO_CONSTRAINT; 1798 1.1 christos 1799 1.1 christos /* Allocate the follow set for this position. */ 1800 1.1 christos nalloc[i] = 1; 1801 1.1 christos MALLOC(d->follows[i].elems, position, nalloc[i]); 1802 1.1 christos break; 1803 1.1 christos } 1804 1.1 christos #ifdef DEBUG 1805 1.1 christos /* ... balance the above nonsyntactic #ifdef goo... */ 1806 1.1 christos fprintf(stderr, "node %d:", i); 1807 1.1 christos prtok(d->tokens[i]); 1808 1.1 christos putc('\n', stderr); 1809 1.1 christos fprintf(stderr, nullable[-1] ? " nullable: yes\n" : " nullable: no\n"); 1810 1.1 christos fprintf(stderr, " firstpos:"); 1811 1.1 christos for (j = nfirstpos[-1] - 1; j >= 0; --j) 1812 1.1 christos { 1813 1.1 christos fprintf(stderr, " %d:", firstpos[j].index); 1814 1.1 christos prtok(d->tokens[firstpos[j].index]); 1815 1.1 christos } 1816 1.1 christos fprintf(stderr, "\n lastpos:"); 1817 1.1 christos for (j = nlastpos[-1] - 1; j >= 0; --j) 1818 1.1 christos { 1819 1.1 christos fprintf(stderr, " %d:", lastpos[j].index); 1820 1.1 christos prtok(d->tokens[lastpos[j].index]); 1821 1.1 christos } 1822 1.1 christos putc('\n', stderr); 1823 1.1 christos } 1824 1.1 christos #endif 1825 1.1 christos 1826 1.1 christos /* For each follow set that is the follow set of a real position, replace 1827 1.1 christos it with its epsilon closure. */ 1828 1.1 christos for (i = 0; i < d->tindex; ++i) 1829 1.1 christos if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF 1830 1.1 christos #ifdef MBS_SUPPORT 1831 1.1 christos || d->tokens[i] == ANYCHAR 1832 1.1 christos || d->tokens[i] == MBCSET 1833 1.1 christos #endif 1834 1.1 christos || d->tokens[i] >= CSET) 1835 1.1 christos { 1836 1.1 christos #ifdef DEBUG 1837 1.1 christos fprintf(stderr, "follows(%d:", i); 1838 1.1 christos prtok(d->tokens[i]); 1839 1.1 christos fprintf(stderr, "):"); 1840 1.1 christos for (j = d->follows[i].nelem - 1; j >= 0; --j) 1841 1.1 christos { 1842 1.1 christos fprintf(stderr, " %d:", d->follows[i].elems[j].index); 1843 1.1 christos prtok(d->tokens[d->follows[i].elems[j].index]); 1844 1.1 christos } 1845 1.1 christos putc('\n', stderr); 1846 1.1 christos #endif 1847 1.1 christos copy(&d->follows[i], &merged); 1848 1.1 christos epsclosure(&merged, d); 1849 1.1 christos if (d->follows[i].nelem < merged.nelem) 1850 1.1 christos REALLOC(d->follows[i].elems, position, merged.nelem); 1851 1.1 christos copy(&merged, &d->follows[i]); 1852 1.1 christos } 1853 1.1 christos 1854 1.1 christos /* Get the epsilon closure of the firstpos of the regexp. The result will 1855 1.1 christos be the set of positions of state 0. */ 1856 1.1 christos merged.nelem = 0; 1857 1.1 christos for (i = 0; i < nfirstpos[-1]; ++i) 1858 1.1 christos insert(firstpos[i], &merged); 1859 1.1 christos epsclosure(&merged, d); 1860 1.1 christos 1861 1.1 christos /* Check if any of the positions of state 0 will want newline context. */ 1862 1.1 christos wants_newline = 0; 1863 1.1 christos for (i = 0; i < merged.nelem; ++i) 1864 1.1 christos if (PREV_NEWLINE_DEPENDENT(merged.elems[i].constraint)) 1865 1.1 christos wants_newline = 1; 1866 1.1 christos 1867 1.1 christos /* Build the initial state. */ 1868 1.1 christos d->salloc = 1; 1869 1.1 christos d->sindex = 0; 1870 1.1 christos MALLOC(d->states, dfa_state, d->salloc); 1871 1.1 christos state_index(d, &merged, wants_newline, 0); 1872 1.1 christos 1873 1.1 christos free(o_nullable); 1874 1.1 christos free(o_nfirst); 1875 1.1 christos free(o_firstpos); 1876 1.1 christos free(o_nlast); 1877 1.1 christos free(o_lastpos); 1878 1.1 christos free(nalloc); 1879 1.1 christos free(merged.elems); 1880 1.1 christos } 1881 1.1 christos 1882 1.1 christos /* Find, for each character, the transition out of state s of d, and store 1883 1.1 christos it in the appropriate slot of trans. 1884 1.1 christos 1885 1.1 christos We divide the positions of s into groups (positions can appear in more 1886 1.1 christos than one group). Each group is labeled with a set of characters that 1887 1.1 christos every position in the group matches (taking into account, if necessary, 1888 1.1 christos preceding context information of s). For each group, find the union 1889 1.1 christos of the its elements' follows. This set is the set of positions of the 1890 1.1 christos new state. For each character in the group's label, set the transition 1891 1.1 christos on this character to be to a state corresponding to the set's positions, 1892 1.1 christos and its associated backward context information, if necessary. 1893 1.1 christos 1894 1.1 christos If we are building a searching matcher, we include the positions of state 1895 1.1 christos 0 in every state. 1896 1.1 christos 1897 1.1 christos The collection of groups is constructed by building an equivalence-class 1898 1.1 christos partition of the positions of s. 1899 1.1 christos 1900 1.1 christos For each position, find the set of characters C that it matches. Eliminate 1901 1.1 christos any characters from C that fail on grounds of backward context. 1902 1.1 christos 1903 1.1 christos Search through the groups, looking for a group whose label L has nonempty 1904 1.1 christos intersection with C. If L - C is nonempty, create a new group labeled 1905 1.1 christos L - C and having the same positions as the current group, and set L to 1906 1.1 christos the intersection of L and C. Insert the position in this group, set 1907 1.1 christos C = C - L, and resume scanning. 1908 1.1 christos 1909 1.1 christos If after comparing with every group there are characters remaining in C, 1910 1.1 christos create a new group labeled with the characters of C and insert this 1911 1.1 christos position in that group. */ 1912 1.1 christos void 1913 1.1 christos dfastate (int s, struct dfa *d, int trans[]) 1914 1.1 christos { 1915 1.1 christos position_set grps[NOTCHAR]; /* As many as will ever be needed. */ 1916 1.1 christos charclass labels[NOTCHAR]; /* Labels corresponding to the groups. */ 1917 1.1 christos int ngrps = 0; /* Number of groups actually used. */ 1918 1.1 christos position pos; /* Current position being considered. */ 1919 1.1 christos charclass matches; /* Set of matching characters. */ 1920 1.1 christos int matchesf; /* True if matches is nonempty. */ 1921 1.1 christos charclass intersect; /* Intersection with some label set. */ 1922 1.1 christos int intersectf; /* True if intersect is nonempty. */ 1923 1.1 christos charclass leftovers; /* Stuff in the label that didn't match. */ 1924 1.1 christos int leftoversf; /* True if leftovers is nonempty. */ 1925 1.1 christos static charclass letters; /* Set of characters considered letters. */ 1926 1.1 christos static charclass newline; /* Set of characters that aren't newline. */ 1927 1.1 christos position_set follows; /* Union of the follows of some group. */ 1928 1.1 christos position_set tmp; /* Temporary space for merging sets. */ 1929 1.1 christos int state; /* New state. */ 1930 1.1 christos int wants_newline; /* New state wants to know newline context. */ 1931 1.1 christos int state_newline; /* New state on a newline transition. */ 1932 1.1 christos int wants_letter; /* New state wants to know letter context. */ 1933 1.1 christos int state_letter; /* New state on a letter transition. */ 1934 1.1 christos static int initialized; /* Flag for static initialization. */ 1935 1.1 christos #ifdef MBS_SUPPORT 1936 1.1 christos int next_isnt_1st_byte = 0; /* Flag If we can't add state0. */ 1937 1.1 christos #endif 1938 1.1 christos int i, j, k; 1939 1.1 christos 1940 1.1 christos /* Initialize the set of letters, if necessary. */ 1941 1.1 christos if (! initialized) 1942 1.1 christos { 1943 1.1 christos initialized = 1; 1944 1.1 christos for (i = 0; i < NOTCHAR; ++i) 1945 1.1 christos if (IS_WORD_CONSTITUENT(i)) 1946 1.1 christos setbit(i, letters); 1947 1.1 christos setbit(eolbyte, newline); 1948 1.1 christos } 1949 1.1 christos 1950 1.1 christos zeroset(matches); 1951 1.1 christos 1952 1.1 christos for (i = 0; i < d->states[s].elems.nelem; ++i) 1953 1.1 christos { 1954 1.1 christos pos = d->states[s].elems.elems[i]; 1955 1.1 christos if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR) 1956 1.1 christos setbit(d->tokens[pos.index], matches); 1957 1.1 christos else if (d->tokens[pos.index] >= CSET) 1958 1.1 christos copyset(d->charclasses[d->tokens[pos.index] - CSET], matches); 1959 1.1 christos #ifdef MBS_SUPPORT 1960 1.1 christos else if (d->tokens[pos.index] == ANYCHAR 1961 1.1 christos || d->tokens[pos.index] == MBCSET) 1962 1.1 christos /* MB_CUR_MAX > 1 */ 1963 1.1 christos { 1964 1.1 christos /* ANYCHAR and MBCSET must match with a single character, so we 1965 1.1 christos must put it to d->states[s].mbps, which contains the positions 1966 1.1 christos which can match with a single character not a byte. */ 1967 1.1 christos if (d->states[s].mbps.nelem == 0) 1968 1.1 christos { 1969 1.1 christos MALLOC(d->states[s].mbps.elems, position, 1970 1.1 christos d->states[s].elems.nelem); 1971 1.1 christos } 1972 1.1 christos insert(pos, &(d->states[s].mbps)); 1973 1.1 christos continue; 1974 1.1 christos } 1975 1.1 christos #endif /* MBS_SUPPORT */ 1976 1.1 christos else 1977 1.1 christos continue; 1978 1.1 christos 1979 1.1 christos /* Some characters may need to be eliminated from matches because 1980 1.1 christos they fail in the current context. */ 1981 1.1 christos if (pos.constraint != 0xFF) 1982 1.1 christos { 1983 1.1 christos if (! MATCHES_NEWLINE_CONTEXT(pos.constraint, 1984 1.1 christos d->states[s].newline, 1)) 1985 1.1 christos clrbit(eolbyte, matches); 1986 1.1 christos if (! MATCHES_NEWLINE_CONTEXT(pos.constraint, 1987 1.1 christos d->states[s].newline, 0)) 1988 1.1 christos for (j = 0; j < CHARCLASS_INTS; ++j) 1989 1.1 christos matches[j] &= newline[j]; 1990 1.1 christos if (! MATCHES_LETTER_CONTEXT(pos.constraint, 1991 1.1 christos d->states[s].letter, 1)) 1992 1.1 christos for (j = 0; j < CHARCLASS_INTS; ++j) 1993 1.1 christos matches[j] &= ~letters[j]; 1994 1.1 christos if (! MATCHES_LETTER_CONTEXT(pos.constraint, 1995 1.1 christos d->states[s].letter, 0)) 1996 1.1 christos for (j = 0; j < CHARCLASS_INTS; ++j) 1997 1.1 christos matches[j] &= letters[j]; 1998 1.1 christos 1999 1.1 christos /* If there are no characters left, there's no point in going on. */ 2000 1.1 christos for (j = 0; j < CHARCLASS_INTS && !matches[j]; ++j) 2001 1.1 christos continue; 2002 1.1 christos if (j == CHARCLASS_INTS) 2003 1.1 christos continue; 2004 1.1 christos } 2005 1.1 christos 2006 1.1 christos for (j = 0; j < ngrps; ++j) 2007 1.1 christos { 2008 1.1 christos /* If matches contains a single character only, and the current 2009 1.1 christos group's label doesn't contain that character, go on to the 2010 1.1 christos next group. */ 2011 1.1 christos if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR 2012 1.1 christos && !tstbit(d->tokens[pos.index], labels[j])) 2013 1.1 christos continue; 2014 1.1 christos 2015 1.1 christos /* Check if this group's label has a nonempty intersection with 2016 1.1 christos matches. */ 2017 1.1 christos intersectf = 0; 2018 1.1 christos for (k = 0; k < CHARCLASS_INTS; ++k) 2019 1.1 christos (intersect[k] = matches[k] & labels[j][k]) ? (intersectf = 1) : 0; 2020 1.1 christos if (! intersectf) 2021 1.1 christos continue; 2022 1.1 christos 2023 1.1 christos /* It does; now find the set differences both ways. */ 2024 1.1 christos leftoversf = matchesf = 0; 2025 1.1 christos for (k = 0; k < CHARCLASS_INTS; ++k) 2026 1.1 christos { 2027 1.1 christos /* Even an optimizing compiler can't know this for sure. */ 2028 1.1 christos int match = matches[k], label = labels[j][k]; 2029 1.1 christos 2030 1.1 christos (leftovers[k] = ~match & label) ? (leftoversf = 1) : 0; 2031 1.1 christos (matches[k] = match & ~label) ? (matchesf = 1) : 0; 2032 1.1 christos } 2033 1.1 christos 2034 1.1 christos /* If there were leftovers, create a new group labeled with them. */ 2035 1.1 christos if (leftoversf) 2036 1.1 christos { 2037 1.1 christos copyset(leftovers, labels[ngrps]); 2038 1.1 christos copyset(intersect, labels[j]); 2039 1.1 christos MALLOC(grps[ngrps].elems, position, d->nleaves); 2040 1.1 christos copy(&grps[j], &grps[ngrps]); 2041 1.1 christos ++ngrps; 2042 1.1 christos } 2043 1.1 christos 2044 1.1 christos /* Put the position in the current group. Note that there is no 2045 1.1 christos reason to call insert() here. */ 2046 1.1 christos grps[j].elems[grps[j].nelem++] = pos; 2047 1.1 christos 2048 1.1 christos /* If every character matching the current position has been 2049 1.1 christos accounted for, we're done. */ 2050 1.1 christos if (! matchesf) 2051 1.1 christos break; 2052 1.1 christos } 2053 1.1 christos 2054 1.1 christos /* If we've passed the last group, and there are still characters 2055 1.1 christos unaccounted for, then we'll have to create a new group. */ 2056 1.1 christos if (j == ngrps) 2057 1.1 christos { 2058 1.1 christos copyset(matches, labels[ngrps]); 2059 1.1 christos zeroset(matches); 2060 1.1 christos MALLOC(grps[ngrps].elems, position, d->nleaves); 2061 1.1 christos grps[ngrps].nelem = 1; 2062 1.1 christos grps[ngrps].elems[0] = pos; 2063 1.1 christos ++ngrps; 2064 1.1 christos } 2065 1.1 christos } 2066 1.1 christos 2067 1.1 christos MALLOC(follows.elems, position, d->nleaves); 2068 1.1 christos MALLOC(tmp.elems, position, d->nleaves); 2069 1.1 christos 2070 1.1 christos /* If we are a searching matcher, the default transition is to a state 2071 1.1 christos containing the positions of state 0, otherwise the default transition 2072 1.1 christos is to fail miserably. */ 2073 1.1 christos if (d->searchflag) 2074 1.1 christos { 2075 1.1 christos wants_newline = 0; 2076 1.1 christos wants_letter = 0; 2077 1.1 christos for (i = 0; i < d->states[0].elems.nelem; ++i) 2078 1.1 christos { 2079 1.1 christos if (PREV_NEWLINE_DEPENDENT(d->states[0].elems.elems[i].constraint)) 2080 1.1 christos wants_newline = 1; 2081 1.1 christos if (PREV_LETTER_DEPENDENT(d->states[0].elems.elems[i].constraint)) 2082 1.1 christos wants_letter = 1; 2083 1.1 christos } 2084 1.1 christos copy(&d->states[0].elems, &follows); 2085 1.1 christos state = state_index(d, &follows, 0, 0); 2086 1.1 christos if (wants_newline) 2087 1.1 christos state_newline = state_index(d, &follows, 1, 0); 2088 1.1 christos else 2089 1.1 christos state_newline = state; 2090 1.1 christos if (wants_letter) 2091 1.1 christos state_letter = state_index(d, &follows, 0, 1); 2092 1.1 christos else 2093 1.1 christos state_letter = state; 2094 1.1 christos for (i = 0; i < NOTCHAR; ++i) 2095 1.1 christos trans[i] = (IS_WORD_CONSTITUENT(i)) ? state_letter : state; 2096 1.1 christos trans[eolbyte] = state_newline; 2097 1.1 christos } 2098 1.1 christos else 2099 1.1 christos for (i = 0; i < NOTCHAR; ++i) 2100 1.1 christos trans[i] = -1; 2101 1.1 christos 2102 1.1 christos for (i = 0; i < ngrps; ++i) 2103 1.1 christos { 2104 1.1 christos follows.nelem = 0; 2105 1.1 christos 2106 1.1 christos /* Find the union of the follows of the positions of the group. 2107 1.1 christos This is a hideously inefficient loop. Fix it someday. */ 2108 1.1 christos for (j = 0; j < grps[i].nelem; ++j) 2109 1.1 christos for (k = 0; k < d->follows[grps[i].elems[j].index].nelem; ++k) 2110 1.1 christos insert(d->follows[grps[i].elems[j].index].elems[k], &follows); 2111 1.1 christos 2112 1.1 christos #ifdef MBS_SUPPORT 2113 1.1 christos if (MB_CUR_MAX > 1) 2114 1.1 christos { 2115 1.1 christos /* If a token in follows.elems is not 1st byte of a multibyte 2116 1.1 christos character, or the states of follows must accept the bytes 2117 1.1 christos which are not 1st byte of the multibyte character. 2118 1.1 christos Then, if a state of follows encounter a byte, it must not be 2119 1.1 christos a 1st byte of a multibyte character nor singlebyte character. 2120 1.1 christos We cansel to add state[0].follows to next state, because 2121 1.1 christos state[0] must accept 1st-byte 2122 1.1 christos 2123 1.1 christos For example, we assume <sb a> is a certain singlebyte 2124 1.1 christos character, <mb A> is a certain multibyte character, and the 2125 1.1 christos codepoint of <sb a> equals the 2nd byte of the codepoint of 2126 1.1 christos <mb A>. 2127 1.1 christos When state[0] accepts <sb a>, state[i] transit to state[i+1] 2128 1.1 christos by accepting accepts 1st byte of <mb A>, and state[i+1] 2129 1.1 christos accepts 2nd byte of <mb A>, if state[i+1] encounter the 2130 1.1 christos codepoint of <sb a>, it must not be <sb a> but 2nd byte of 2131 1.1 christos <mb A>, so we can not add state[0]. */ 2132 1.1 christos 2133 1.1 christos next_isnt_1st_byte = 0; 2134 1.1 christos for (j = 0; j < follows.nelem; ++j) 2135 1.1 christos { 2136 1.1 christos if (!(d->multibyte_prop[follows.elems[j].index] & 1)) 2137 1.1 christos { 2138 1.1 christos next_isnt_1st_byte = 1; 2139 1.1 christos break; 2140 1.1 christos } 2141 1.1 christos } 2142 1.1 christos } 2143 1.1 christos #endif 2144 1.1 christos 2145 1.1 christos /* If we are building a searching matcher, throw in the positions 2146 1.1 christos of state 0 as well. */ 2147 1.1 christos #ifdef MBS_SUPPORT 2148 1.1 christos if (d->searchflag && (MB_CUR_MAX == 1 || !next_isnt_1st_byte)) 2149 1.1 christos #else 2150 1.1 christos if (d->searchflag) 2151 1.1 christos #endif 2152 1.1 christos for (j = 0; j < d->states[0].elems.nelem; ++j) 2153 1.1 christos insert(d->states[0].elems.elems[j], &follows); 2154 1.1 christos 2155 1.1 christos /* Find out if the new state will want any context information. */ 2156 1.1 christos wants_newline = 0; 2157 1.1 christos if (tstbit(eolbyte, labels[i])) 2158 1.1 christos for (j = 0; j < follows.nelem; ++j) 2159 1.1 christos if (PREV_NEWLINE_DEPENDENT(follows.elems[j].constraint)) 2160 1.1 christos wants_newline = 1; 2161 1.1 christos 2162 1.1 christos wants_letter = 0; 2163 1.1 christos for (j = 0; j < CHARCLASS_INTS; ++j) 2164 1.1 christos if (labels[i][j] & letters[j]) 2165 1.1 christos break; 2166 1.1 christos if (j < CHARCLASS_INTS) 2167 1.1 christos for (j = 0; j < follows.nelem; ++j) 2168 1.1 christos if (PREV_LETTER_DEPENDENT(follows.elems[j].constraint)) 2169 1.1 christos wants_letter = 1; 2170 1.1 christos 2171 1.1 christos /* Find the state(s) corresponding to the union of the follows. */ 2172 1.1 christos state = state_index(d, &follows, 0, 0); 2173 1.1 christos if (wants_newline) 2174 1.1 christos state_newline = state_index(d, &follows, 1, 0); 2175 1.1 christos else 2176 1.1 christos state_newline = state; 2177 1.1 christos if (wants_letter) 2178 1.1 christos state_letter = state_index(d, &follows, 0, 1); 2179 1.1 christos else 2180 1.1 christos state_letter = state; 2181 1.1 christos 2182 1.1 christos /* Set the transitions for each character in the current label. */ 2183 1.1 christos for (j = 0; j < CHARCLASS_INTS; ++j) 2184 1.1 christos for (k = 0; k < INTBITS; ++k) 2185 1.1 christos if (labels[i][j] & 1 << k) 2186 1.1 christos { 2187 1.1 christos int c = j * INTBITS + k; 2188 1.1 christos 2189 1.1 christos if (c == eolbyte) 2190 1.1 christos trans[c] = state_newline; 2191 1.1 christos else if (IS_WORD_CONSTITUENT(c)) 2192 1.1 christos trans[c] = state_letter; 2193 1.1 christos else if (c < NOTCHAR) 2194 1.1 christos trans[c] = state; 2195 1.1 christos } 2196 1.1 christos } 2197 1.1 christos 2198 1.1 christos for (i = 0; i < ngrps; ++i) 2199 1.1 christos free(grps[i].elems); 2200 1.1 christos free(follows.elems); 2201 1.1 christos free(tmp.elems); 2202 1.1 christos } 2203 1.1 christos 2204 1.1 christos /* Some routines for manipulating a compiled dfa's transition tables. 2205 1.1 christos Each state may or may not have a transition table; if it does, and it 2206 1.1 christos is a non-accepting state, then d->trans[state] points to its table. 2207 1.1 christos If it is an accepting state then d->fails[state] points to its table. 2208 1.1 christos If it has no table at all, then d->trans[state] is NULL. 2209 1.1 christos TODO: Improve this comment, get rid of the unnecessary redundancy. */ 2210 1.1 christos 2211 1.1 christos static void 2212 1.1 christos build_state (int s, struct dfa *d) 2213 1.1 christos { 2214 1.1 christos int *trans; /* The new transition table. */ 2215 1.1 christos int i; 2216 1.1 christos 2217 1.1 christos /* Set an upper limit on the number of transition tables that will ever 2218 1.1 christos exist at once. 1024 is arbitrary. The idea is that the frequently 2219 1.1 christos used transition tables will be quickly rebuilt, whereas the ones that 2220 1.1 christos were only needed once or twice will be cleared away. */ 2221 1.1 christos if (d->trcount >= 1024) 2222 1.1 christos { 2223 1.1 christos for (i = 0; i < d->tralloc; ++i) 2224 1.1 christos if (d->trans[i]) 2225 1.1 christos { 2226 1.1 christos free((ptr_t) d->trans[i]); 2227 1.1 christos d->trans[i] = NULL; 2228 1.1 christos } 2229 1.1 christos else if (d->fails[i]) 2230 1.1 christos { 2231 1.1 christos free((ptr_t) d->fails[i]); 2232 1.1 christos d->fails[i] = NULL; 2233 1.1 christos } 2234 1.1 christos d->trcount = 0; 2235 1.1 christos } 2236 1.1 christos 2237 1.1 christos ++d->trcount; 2238 1.1 christos 2239 1.1 christos /* Set up the success bits for this state. */ 2240 1.1 christos d->success[s] = 0; 2241 1.1 christos if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 1, d->states[s].letter, 0, 2242 1.1 christos s, *d)) 2243 1.1 christos d->success[s] |= 4; 2244 1.1 christos if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 1, 2245 1.1 christos s, *d)) 2246 1.1 christos d->success[s] |= 2; 2247 1.1 christos if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 0, 2248 1.1 christos s, *d)) 2249 1.1 christos d->success[s] |= 1; 2250 1.1 christos 2251 1.1 christos MALLOC(trans, int, NOTCHAR); 2252 1.1 christos dfastate(s, d, trans); 2253 1.1 christos 2254 1.1 christos /* Now go through the new transition table, and make sure that the trans 2255 1.1 christos and fail arrays are allocated large enough to hold a pointer for the 2256 1.1 christos largest state mentioned in the table. */ 2257 1.1 christos for (i = 0; i < NOTCHAR; ++i) 2258 1.1 christos if (trans[i] >= d->tralloc) 2259 1.1 christos { 2260 1.1 christos int oldalloc = d->tralloc; 2261 1.1 christos 2262 1.1 christos while (trans[i] >= d->tralloc) 2263 1.1 christos d->tralloc *= 2; 2264 1.1 christos REALLOC(d->realtrans, int *, d->tralloc + 1); 2265 1.1 christos d->trans = d->realtrans + 1; 2266 1.1 christos REALLOC(d->fails, int *, d->tralloc); 2267 1.1 christos REALLOC(d->success, int, d->tralloc); 2268 1.1 christos while (oldalloc < d->tralloc) 2269 1.1 christos { 2270 1.1 christos d->trans[oldalloc] = NULL; 2271 1.1 christos d->fails[oldalloc++] = NULL; 2272 1.1 christos } 2273 1.1 christos } 2274 1.1 christos 2275 1.1 christos /* Newline is a sentinel. */ 2276 1.1 christos trans[eolbyte] = -1; 2277 1.1 christos 2278 1.1 christos if (ACCEPTING(s, *d)) 2279 1.1 christos d->fails[s] = trans; 2280 1.1 christos else 2281 1.1 christos d->trans[s] = trans; 2282 1.1 christos } 2283 1.1 christos 2284 1.1 christos static void 2285 1.1 christos build_state_zero (struct dfa *d) 2286 1.1 christos { 2287 1.1 christos d->tralloc = 1; 2288 1.1 christos d->trcount = 0; 2289 1.1 christos CALLOC(d->realtrans, int *, d->tralloc + 1); 2290 1.1 christos d->trans = d->realtrans + 1; 2291 1.1 christos CALLOC(d->fails, int *, d->tralloc); 2292 1.1 christos MALLOC(d->success, int, d->tralloc); 2293 1.1 christos build_state(0, d); 2294 1.1 christos } 2295 1.1 christos 2296 1.1 christos #ifdef MBS_SUPPORT 2297 1.1 christos /* Multibyte character handling sub-routins for dfaexec. */ 2298 1.1 christos 2299 1.1 christos /* Initial state may encounter the byte which is not a singlebyte character 2300 1.1 christos nor 1st byte of a multibyte character. But it is incorrect for initial 2301 1.1 christos state to accept such a byte. 2302 1.1 christos For example, in sjis encoding the regular expression like "\\" accepts 2303 1.1 christos the codepoint 0x5c, but should not accept the 2nd byte of the codepoint 2304 1.1 christos 0x815c. Then Initial state must skip the bytes which are not a singlebyte 2305 1.1 christos character nor 1st byte of a multibyte character. */ 2306 1.1 christos #define SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p) \ 2307 1.1 christos if (s == 0) \ 2308 1.1 christos { \ 2309 1.1 christos while (inputwcs[p - buf_begin] == 0 \ 2310 1.1 christos && mblen_buf[p - buf_begin] > 0 \ 2311 1.1 christos && p < buf_end) \ 2312 1.1 christos ++p; \ 2313 1.1 christos if (p >= end) \ 2314 1.1 christos { \ 2315 1.1 christos free(mblen_buf); \ 2316 1.1 christos free(inputwcs); \ 2317 1.1 christos return (size_t) -1; \ 2318 1.1 christos } \ 2319 1.1 christos } 2320 1.1 christos 2321 1.1 christos static void 2322 1.1 christos realloc_trans_if_necessary(struct dfa *d, int new_state) 2323 1.1 christos { 2324 1.1 christos /* Make sure that the trans and fail arrays are allocated large enough 2325 1.1 christos to hold a pointer for the new state. */ 2326 1.1 christos if (new_state >= d->tralloc) 2327 1.1 christos { 2328 1.1 christos int oldalloc = d->tralloc; 2329 1.1 christos 2330 1.1 christos while (new_state >= d->tralloc) 2331 1.1 christos d->tralloc *= 2; 2332 1.1 christos REALLOC(d->realtrans, int *, d->tralloc + 1); 2333 1.1 christos d->trans = d->realtrans + 1; 2334 1.1 christos REALLOC(d->fails, int *, d->tralloc); 2335 1.1 christos REALLOC(d->success, int, d->tralloc); 2336 1.1 christos while (oldalloc < d->tralloc) 2337 1.1 christos { 2338 1.1 christos d->trans[oldalloc] = NULL; 2339 1.1 christos d->fails[oldalloc++] = NULL; 2340 1.1 christos } 2341 1.1 christos } 2342 1.1 christos } 2343 1.1 christos 2344 1.1 christos /* Return values of transit_state_singlebyte(), and 2345 1.1 christos transit_state_consume_1char. */ 2346 1.1 christos typedef enum 2347 1.1 christos { 2348 1.1 christos TRANSIT_STATE_IN_PROGRESS, /* State transition has not finished. */ 2349 1.1 christos TRANSIT_STATE_DONE, /* State transition has finished. */ 2350 1.1 christos TRANSIT_STATE_END_BUFFER /* Reach the end of the buffer. */ 2351 1.1 christos } status_transit_state; 2352 1.1 christos 2353 1.1 christos /* Consume a single byte and transit state from 's' to '*next_state'. 2354 1.1 christos This function is almost same as the state transition routin in dfaexec(). 2355 1.1 christos But state transition is done just once, otherwise matching succeed or 2356 1.1 christos reach the end of the buffer. */ 2357 1.1 christos static status_transit_state 2358 1.1 christos transit_state_singlebyte (struct dfa *d, int s, unsigned char const *p, 2359 1.1 christos int *next_state) 2360 1.1 christos { 2361 1.1 christos int *t; 2362 1.1 christos int works = s; 2363 1.1 christos 2364 1.1 christos status_transit_state rval = TRANSIT_STATE_IN_PROGRESS; 2365 1.1 christos 2366 1.1 christos while (rval == TRANSIT_STATE_IN_PROGRESS) 2367 1.1 christos { 2368 1.1 christos if ((t = d->trans[works]) != NULL) 2369 1.1 christos { 2370 1.1 christos works = t[*p]; 2371 1.1 christos rval = TRANSIT_STATE_DONE; 2372 1.1 christos if (works < 0) 2373 1.1 christos works = 0; 2374 1.1 christos } 2375 1.1 christos else if (works < 0) 2376 1.1 christos { 2377 1.1 christos if (p == buf_end) 2378 1.1 christos /* At the moment, it must not happen. */ 2379 1.1 christos return TRANSIT_STATE_END_BUFFER; 2380 1.1 christos works = 0; 2381 1.1 christos } 2382 1.1 christos else if (d->fails[works]) 2383 1.1 christos { 2384 1.1 christos works = d->fails[works][*p]; 2385 1.1 christos rval = TRANSIT_STATE_DONE; 2386 1.1 christos } 2387 1.1 christos else 2388 1.1 christos { 2389 1.1 christos build_state(works, d); 2390 1.1 christos } 2391 1.1 christos } 2392 1.1 christos *next_state = works; 2393 1.1 christos return rval; 2394 1.1 christos } 2395 1.1 christos 2396 1.1 christos /* Check whether period can match or not in the current context. If it can, 2397 1.1 christos return the amount of the bytes with which period can match, otherwise 2398 1.1 christos return 0. 2399 1.1 christos `pos' is the position of the period. `index' is the index from the 2400 1.1 christos buf_begin, and it is the current position in the buffer. */ 2401 1.1 christos static int 2402 1.1 christos match_anychar (struct dfa *d, int s, position pos, int index) 2403 1.1 christos { 2404 1.1 christos int newline = 0; 2405 1.1 christos int letter = 0; 2406 1.1 christos wchar_t wc; 2407 1.1 christos int mbclen; 2408 1.1 christos 2409 1.1 christos wc = inputwcs[index]; 2410 1.1 christos mbclen = (mblen_buf[index] == 0)? 1 : mblen_buf[index]; 2411 1.1 christos 2412 1.1 christos /* Check context. */ 2413 1.1 christos if (wc == (wchar_t)eolbyte) 2414 1.1 christos { 2415 1.1 christos if (!(syntax_bits & RE_DOT_NEWLINE)) 2416 1.1 christos return 0; 2417 1.1 christos newline = 1; 2418 1.1 christos } 2419 1.1 christos else if (wc == (wchar_t)'\0') 2420 1.1 christos { 2421 1.1 christos if (syntax_bits & RE_DOT_NOT_NULL) 2422 1.1 christos return 0; 2423 1.1 christos newline = 1; 2424 1.1 christos } 2425 1.1 christos 2426 1.1 christos if (iswalnum(wc) || wc == L'_') 2427 1.1 christos letter = 1; 2428 1.1 christos 2429 1.1 christos if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].newline, 2430 1.1 christos newline, d->states[s].letter, letter)) 2431 1.1 christos return 0; 2432 1.1 christos 2433 1.1 christos return mbclen; 2434 1.1 christos } 2435 1.1 christos 2436 1.1 christos /* Check whether bracket expression can match or not in the current context. 2437 1.1 christos If it can, return the amount of the bytes with which expression can match, 2438 1.1 christos otherwise return 0. 2439 1.1 christos `pos' is the position of the bracket expression. `index' is the index 2440 1.1 christos from the buf_begin, and it is the current position in the buffer. */ 2441 1.1 christos int 2442 1.1 christos match_mb_charset (struct dfa *d, int s, position pos, int index) 2443 1.1 christos { 2444 1.1 christos int i; 2445 1.1 christos int match; /* Flag which represent that matching succeed. */ 2446 1.1 christos int match_len; /* Length of the character (or collating element) 2447 1.1 christos with which this operator match. */ 2448 1.1 christos int op_len; /* Length of the operator. */ 2449 1.1 christos char buffer[128]; 2450 1.1 christos wchar_t wcbuf[6]; 2451 1.1 christos 2452 1.1 christos /* Pointer to the structure to which we are currently reffering. */ 2453 1.1 christos struct mb_char_classes *work_mbc; 2454 1.1 christos 2455 1.1 christos int newline = 0; 2456 1.1 christos int letter = 0; 2457 1.1 christos wchar_t wc; /* Current reffering character. */ 2458 1.1 christos 2459 1.1 christos wc = inputwcs[index]; 2460 1.1 christos 2461 1.1 christos /* Check context. */ 2462 1.1 christos if (wc == (wchar_t)eolbyte) 2463 1.1 christos { 2464 1.1 christos if (!(syntax_bits & RE_DOT_NEWLINE)) 2465 1.1 christos return 0; 2466 1.1 christos newline = 1; 2467 1.1 christos } 2468 1.1 christos else if (wc == (wchar_t)'\0') 2469 1.1 christos { 2470 1.1 christos if (syntax_bits & RE_DOT_NOT_NULL) 2471 1.1 christos return 0; 2472 1.1 christos newline = 1; 2473 1.1 christos } 2474 1.1 christos if (iswalnum(wc) || wc == L'_') 2475 1.1 christos letter = 1; 2476 1.1 christos if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].newline, 2477 1.1 christos newline, d->states[s].letter, letter)) 2478 1.1 christos return 0; 2479 1.1 christos 2480 1.1 christos /* Assign the current reffering operator to work_mbc. */ 2481 1.1 christos work_mbc = &(d->mbcsets[(d->multibyte_prop[pos.index]) >> 2]); 2482 1.1 christos match = !work_mbc->invert; 2483 1.1 christos match_len = (mblen_buf[index] == 0)? 1 : mblen_buf[index]; 2484 1.1 christos 2485 1.1 christos /* match with a character class? */ 2486 1.1 christos for (i = 0; i<work_mbc->nch_classes; i++) 2487 1.1 christos { 2488 1.1 christos if (iswctype((wint_t)wc, work_mbc->ch_classes[i])) 2489 1.1 christos goto charset_matched; 2490 1.1 christos } 2491 1.1 christos 2492 1.1 christos strncpy(buffer, buf_begin + index, match_len); 2493 1.1 christos buffer[match_len] = '\0'; 2494 1.1 christos 2495 1.1 christos /* match with an equivalent class? */ 2496 1.1 christos for (i = 0; i<work_mbc->nequivs; i++) 2497 1.1 christos { 2498 1.1 christos op_len = strlen(work_mbc->equivs[i]); 2499 1.1 christos strncpy(buffer, buf_begin + index, op_len); 2500 1.1 christos buffer[op_len] = '\0'; 2501 1.1 christos if (strcoll(work_mbc->equivs[i], buffer) == 0) 2502 1.1 christos { 2503 1.1 christos match_len = op_len; 2504 1.1 christos goto charset_matched; 2505 1.1 christos } 2506 1.1 christos } 2507 1.1 christos 2508 1.1 christos /* match with a collating element? */ 2509 1.1 christos for (i = 0; i<work_mbc->ncoll_elems; i++) 2510 1.1 christos { 2511 1.1 christos op_len = strlen(work_mbc->coll_elems[i]); 2512 1.1 christos strncpy(buffer, buf_begin + index, op_len); 2513 1.1 christos buffer[op_len] = '\0'; 2514 1.1 christos 2515 1.1 christos if (strcoll(work_mbc->coll_elems[i], buffer) == 0) 2516 1.1 christos { 2517 1.1 christos match_len = op_len; 2518 1.1 christos goto charset_matched; 2519 1.1 christos } 2520 1.1 christos } 2521 1.1 christos 2522 1.1 christos wcbuf[0] = wc; 2523 1.1 christos wcbuf[1] = wcbuf[3] = wcbuf[5] = '\0'; 2524 1.1 christos 2525 1.1 christos /* match with a range? */ 2526 1.1 christos for (i = 0; i<work_mbc->nranges; i++) 2527 1.1 christos { 2528 1.1 christos wcbuf[2] = work_mbc->range_sts[i]; 2529 1.1 christos wcbuf[4] = work_mbc->range_ends[i]; 2530 1.1 christos 2531 1.1 christos if (wcscoll(wcbuf, wcbuf+2) >= 0 && 2532 1.1 christos wcscoll(wcbuf+4, wcbuf) >= 0) 2533 1.1 christos goto charset_matched; 2534 1.1 christos } 2535 1.1 christos 2536 1.1 christos /* match with a character? */ 2537 1.1 christos for (i = 0; i<work_mbc->nchars; i++) 2538 1.1 christos { 2539 1.1 christos if (wc == work_mbc->chars[i]) 2540 1.1 christos goto charset_matched; 2541 1.1 christos } 2542 1.1 christos 2543 1.1 christos match = !match; 2544 1.1 christos 2545 1.1 christos charset_matched: 2546 1.1 christos return match ? match_len : 0; 2547 1.1 christos } 2548 1.1 christos 2549 1.1 christos /* Check each of `d->states[s].mbps.elem' can match or not. Then return the 2550 1.1 christos array which corresponds to `d->states[s].mbps.elem' and each element of 2551 1.1 christos the array contains the amount of the bytes with which the element can 2552 1.1 christos match. 2553 1.1 christos `index' is the index from the buf_begin, and it is the current position 2554 1.1 christos in the buffer. 2555 1.1 christos Caller MUST free the array which this function return. */ 2556 1.1 christos static int* 2557 1.1 christos check_matching_with_multibyte_ops (struct dfa *d, int s, int index) 2558 1.1 christos { 2559 1.1 christos int i; 2560 1.1 christos int* rarray; 2561 1.1 christos 2562 1.1 christos MALLOC(rarray, int, d->states[s].mbps.nelem); 2563 1.1 christos for (i = 0; i < d->states[s].mbps.nelem; ++i) 2564 1.1 christos { 2565 1.1 christos position pos = d->states[s].mbps.elems[i]; 2566 1.1 christos switch(d->tokens[pos.index]) 2567 1.1 christos { 2568 1.1 christos case ANYCHAR: 2569 1.1 christos rarray[i] = match_anychar(d, s, pos, index); 2570 1.1 christos break; 2571 1.1 christos case MBCSET: 2572 1.1 christos rarray[i] = match_mb_charset(d, s, pos, index); 2573 1.1 christos break; 2574 1.1 christos default: 2575 1.1 christos break; /* can not happen. */ 2576 1.1 christos } 2577 1.1 christos } 2578 1.1 christos return rarray; 2579 1.1 christos } 2580 1.1 christos 2581 1.1 christos /* Consume a single character and enumerate all of the positions which can 2582 1.1 christos be next position from the state `s'. 2583 1.1 christos `match_lens' is the input. It can be NULL, but it can also be the output 2584 1.1 christos of check_matching_with_multibyte_ops() for optimization. 2585 1.1 christos `mbclen' and `pps' are the output. `mbclen' is the length of the 2586 1.1 christos character consumed, and `pps' is the set this function enumerate. */ 2587 1.1 christos static status_transit_state 2588 1.1 christos transit_state_consume_1char (struct dfa *d, int s, unsigned char const **pp, 2589 1.1 christos int *match_lens, int *mbclen, position_set *pps) 2590 1.1 christos { 2591 1.1 christos int i, j; 2592 1.1 christos int s1, s2; 2593 1.1 christos int* work_mbls; 2594 1.1 christos status_transit_state rs = TRANSIT_STATE_DONE; 2595 1.1 christos 2596 1.1 christos /* Calculate the length of the (single/multi byte) character 2597 1.1 christos to which p points. */ 2598 1.1 christos *mbclen = (mblen_buf[*pp - buf_begin] == 0)? 1 2599 1.1 christos : mblen_buf[*pp - buf_begin]; 2600 1.1 christos 2601 1.1 christos /* Calculate the state which can be reached from the state `s' by 2602 1.1 christos consuming `*mbclen' single bytes from the buffer. */ 2603 1.1 christos s1 = s; 2604 1.1 christos for (i = 0; i < *mbclen; i++) 2605 1.1 christos { 2606 1.1 christos s2 = s1; 2607 1.1 christos rs = transit_state_singlebyte(d, s2, (*pp)++, &s1); 2608 1.1 christos } 2609 1.1 christos /* Copy the positions contained by `s1' to the set `pps'. */ 2610 1.1 christos copy(&(d->states[s1].elems), pps); 2611 1.1 christos 2612 1.1 christos /* Check (inputed)match_lens, and initialize if it is NULL. */ 2613 1.1 christos if (match_lens == NULL && d->states[s].mbps.nelem != 0) 2614 1.1 christos work_mbls = check_matching_with_multibyte_ops(d, s, *pp - buf_begin); 2615 1.1 christos else 2616 1.1 christos work_mbls = match_lens; 2617 1.1 christos 2618 1.1 christos /* Add all of the positions which can be reached from `s' by consuming 2619 1.1 christos a single character. */ 2620 1.1 christos for (i = 0; i < d->states[s].mbps.nelem ; i++) 2621 1.1 christos { 2622 1.1 christos if (work_mbls[i] == *mbclen) 2623 1.1 christos for (j = 0; j < d->follows[d->states[s].mbps.elems[i].index].nelem; 2624 1.1 christos j++) 2625 1.1 christos insert(d->follows[d->states[s].mbps.elems[i].index].elems[j], 2626 1.1 christos pps); 2627 1.1 christos } 2628 1.1 christos 2629 1.1 christos if (match_lens == NULL && work_mbls != NULL) 2630 1.1 christos free(work_mbls); 2631 1.1 christos return rs; 2632 1.1 christos } 2633 1.1 christos 2634 1.1 christos /* Transit state from s, then return new state and update the pointer of the 2635 1.1 christos buffer. This function is for some operator which can match with a multi- 2636 1.1 christos byte character or a collating element(which may be multi characters). */ 2637 1.1 christos static int 2638 1.1 christos transit_state (struct dfa *d, int s, unsigned char const **pp) 2639 1.1 christos { 2640 1.1 christos int s1; 2641 1.1 christos int mbclen; /* The length of current input multibyte character. */ 2642 1.1 christos int maxlen = 0; 2643 1.1 christos int i, j; 2644 1.1 christos int *match_lens = NULL; 2645 1.1 christos int nelem = d->states[s].mbps.nelem; /* Just a alias. */ 2646 1.1 christos position_set follows; 2647 1.1 christos unsigned char const *p1 = *pp; 2648 1.1 christos status_transit_state rs; 2649 1.1 christos wchar_t wc; 2650 1.1 christos 2651 1.1 christos if (nelem > 0) 2652 1.1 christos /* This state has (a) multibyte operator(s). 2653 1.1 christos We check whether each of them can match or not. */ 2654 1.1 christos { 2655 1.1 christos /* Note: caller must free the return value of this function. */ 2656 1.1 christos match_lens = check_matching_with_multibyte_ops(d, s, *pp - buf_begin); 2657 1.1 christos 2658 1.1 christos for (i = 0; i < nelem; i++) 2659 1.1 christos /* Search the operator which match the longest string, 2660 1.1 christos in this state. */ 2661 1.1 christos { 2662 1.1 christos if (match_lens[i] > maxlen) 2663 1.1 christos maxlen = match_lens[i]; 2664 1.1 christos } 2665 1.1 christos } 2666 1.1 christos 2667 1.1 christos if (nelem == 0 || maxlen == 0) 2668 1.1 christos /* This state has no multibyte operator which can match. 2669 1.1 christos We need to check only one singlebyte character. */ 2670 1.1 christos { 2671 1.1 christos status_transit_state rs; 2672 1.1 christos rs = transit_state_singlebyte(d, s, *pp, &s1); 2673 1.1 christos 2674 1.1 christos /* We must update the pointer if state transition succeeded. */ 2675 1.1 christos if (rs == TRANSIT_STATE_DONE) 2676 1.1 christos ++*pp; 2677 1.1 christos 2678 1.1 christos if (match_lens != NULL) 2679 1.1 christos free(match_lens); 2680 1.1 christos return s1; 2681 1.1 christos } 2682 1.1 christos 2683 1.1 christos /* This state has some operators which can match a multibyte character. */ 2684 1.1 christos follows.nelem = 0; 2685 1.1 christos MALLOC(follows.elems, position, d->nleaves); 2686 1.1 christos 2687 1.1 christos /* `maxlen' may be longer than the length of a character, because it may 2688 1.1 christos not be a character but a (multi character) collating element. 2689 1.1 christos We enumerate all of the positions which `s' can reach by consuming 2690 1.1 christos `maxlen' bytes. */ 2691 1.1 christos rs = transit_state_consume_1char(d, s, pp, match_lens, &mbclen, &follows); 2692 1.1 christos 2693 1.1 christos wc = inputwcs[*pp - mbclen - buf_begin]; 2694 1.1 christos s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc)); 2695 1.1 christos realloc_trans_if_necessary(d, s1); 2696 1.1 christos 2697 1.1 christos while (*pp - p1 < maxlen) 2698 1.1 christos { 2699 1.1 christos follows.nelem = 0; 2700 1.1 christos rs = transit_state_consume_1char(d, s1, pp, NULL, &mbclen, &follows); 2701 1.1 christos 2702 1.1 christos for (i = 0; i < nelem ; i++) 2703 1.1 christos { 2704 1.1 christos if (match_lens[i] == *pp - p1) 2705 1.1 christos for (j = 0; 2706 1.1 christos j < d->follows[d->states[s1].mbps.elems[i].index].nelem; j++) 2707 1.1 christos insert(d->follows[d->states[s1].mbps.elems[i].index].elems[j], 2708 1.1 christos &follows); 2709 1.1 christos } 2710 1.1 christos 2711 1.1 christos wc = inputwcs[*pp - mbclen - buf_begin]; 2712 1.1 christos s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc)); 2713 1.1 christos realloc_trans_if_necessary(d, s1); 2714 1.1 christos } 2715 1.1 christos free(match_lens); 2716 1.1 christos free(follows.elems); 2717 1.1 christos return s1; 2718 1.1 christos } 2719 1.1 christos 2720 1.1 christos #endif 2721 1.1 christos 2722 1.1 christos /* Search through a buffer looking for a match to the given struct dfa. 2723 1.1 christos Find the first occurrence of a string matching the regexp in the buffer, 2724 1.1 christos and the shortest possible version thereof. Return the offset of the first 2725 1.1 christos character after the match, or (size_t) -1 if none is found. BEGIN points to 2726 1.1 christos the beginning of the buffer, and SIZE is the size of the buffer. If SIZE 2727 1.1 christos is nonzero, BEGIN[SIZE - 1] must be a newline. BACKREF points to a place 2728 1.1 christos where we're supposed to store a 1 if backreferencing happened and the 2729 1.1 christos match needs to be verified by a backtracking matcher. Otherwise 2730 1.1 christos we store a 0 in *backref. */ 2731 1.1 christos size_t 2732 1.1 christos dfaexec (struct dfa *d, char const *begin, size_t size, int *backref) 2733 1.1 christos { 2734 1.1 christos register int s; /* Current state. */ 2735 1.1 christos register unsigned char const *p; /* Current input character. */ 2736 1.1 christos register unsigned char const *end; /* One past the last input character. */ 2737 1.1 christos register int **trans, *t; /* Copy of d->trans so it can be optimized 2738 1.1 christos into a register. */ 2739 1.1 christos register unsigned char eol = eolbyte; /* Likewise for eolbyte. */ 2740 1.1 christos static int sbit[NOTCHAR]; /* Table for anding with d->success. */ 2741 1.1 christos static int sbit_init; 2742 1.1 christos 2743 1.1 christos if (! sbit_init) 2744 1.1 christos { 2745 1.1 christos int i; 2746 1.1 christos 2747 1.1 christos sbit_init = 1; 2748 1.1 christos for (i = 0; i < NOTCHAR; ++i) 2749 1.1 christos sbit[i] = (IS_WORD_CONSTITUENT(i)) ? 2 : 1; 2750 1.1 christos sbit[eol] = 4; 2751 1.1 christos } 2752 1.1 christos 2753 1.1 christos if (! d->tralloc) 2754 1.1 christos build_state_zero(d); 2755 1.1 christos 2756 1.1 christos s = 0; 2757 1.1 christos p = (unsigned char const *) begin; 2758 1.1 christos end = p + size; 2759 1.1 christos trans = d->trans; 2760 1.1 christos 2761 1.1 christos #ifdef MBS_SUPPORT 2762 1.1 christos if (MB_CUR_MAX > 1) 2763 1.1 christos { 2764 1.1 christos int remain_bytes, i; 2765 1.1 christos buf_begin = begin; 2766 1.1 christos buf_end = end; 2767 1.1 christos 2768 1.1 christos /* initialize mblen_buf, and inputwcs. */ 2769 1.1 christos MALLOC(mblen_buf, unsigned char, end - (unsigned char const *)begin + 2); 2770 1.1 christos MALLOC(inputwcs, wchar_t, end - (unsigned char const *)begin + 2); 2771 1.1 christos memset(&mbs, 0, sizeof(mbstate_t)); 2772 1.1 christos remain_bytes = 0; 2773 1.1 christos for (i = 0; i < end - (unsigned char const *)begin + 1; i++) 2774 1.1 christos { 2775 1.1 christos if (remain_bytes == 0) 2776 1.1 christos { 2777 1.1 christos remain_bytes 2778 1.1 christos = mbrtowc(inputwcs + i, begin + i, 2779 1.1 christos end - (unsigned char const *)begin - i + 1, &mbs); 2780 1.1 christos if (remain_bytes <= 1) 2781 1.1 christos { 2782 1.1 christos remain_bytes = 0; 2783 1.1 christos inputwcs[i] = (wchar_t)begin[i]; 2784 1.1 christos mblen_buf[i] = 0; 2785 1.1 christos } 2786 1.1 christos else 2787 1.1 christos { 2788 1.1 christos mblen_buf[i] = remain_bytes; 2789 1.1 christos remain_bytes--; 2790 1.1 christos } 2791 1.1 christos } 2792 1.1 christos else 2793 1.1 christos { 2794 1.1 christos mblen_buf[i] = remain_bytes; 2795 1.1 christos inputwcs[i] = 0; 2796 1.1 christos remain_bytes--; 2797 1.1 christos } 2798 1.1 christos } 2799 1.1 christos mblen_buf[i] = 0; 2800 1.1 christos inputwcs[i] = 0; /* sentinel */ 2801 1.1 christos } 2802 1.1 christos #endif /* MBS_SUPPORT */ 2803 1.1 christos 2804 1.1 christos for (;;) 2805 1.1 christos { 2806 1.1 christos #ifdef MBS_SUPPORT 2807 1.1 christos if (MB_CUR_MAX > 1) 2808 1.1 christos while ((t = trans[s])) 2809 1.1 christos { 2810 1.1 christos if (p == end) 2811 1.1 christos { 2812 1.1 christos free(mblen_buf); 2813 1.1 christos free(inputwcs); 2814 1.1 christos return (size_t) -1; 2815 1.1 christos } 2816 1.1 christos if (d->states[s].mbps.nelem != 0) 2817 1.1 christos { 2818 1.1 christos /* Can match with a multibyte character( and multi character 2819 1.1 christos collating element). */ 2820 1.1 christos unsigned char const *nextp; 2821 1.1 christos 2822 1.1 christos SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p); 2823 1.1 christos 2824 1.1 christos nextp = p; 2825 1.1 christos s = transit_state(d, s, &nextp); 2826 1.1 christos p = nextp; 2827 1.1 christos 2828 1.1 christos /* Trans table might be updated. */ 2829 1.1 christos trans = d->trans; 2830 1.1 christos } 2831 1.1 christos else 2832 1.1 christos { 2833 1.1 christos SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p); 2834 1.1 christos s = t[*p++]; 2835 1.1 christos } 2836 1.1 christos } 2837 1.1 christos else 2838 1.1 christos #endif /* MBS_SUPPORT */ 2839 1.1 christos while ((t = trans[s])) 2840 1.1 christos { 2841 1.1 christos if (p == end) 2842 1.1 christos return (size_t) -1; 2843 1.1 christos s = t[*p++]; 2844 1.1 christos } 2845 1.1 christos 2846 1.1 christos if (s < 0) 2847 1.1 christos { 2848 1.1 christos if (p == end) 2849 1.1 christos { 2850 1.1 christos #ifdef MBS_SUPPORT 2851 1.1 christos if (MB_CUR_MAX > 1) 2852 1.1 christos { 2853 1.1 christos free(mblen_buf); 2854 1.1 christos free(inputwcs); 2855 1.1 christos } 2856 1.1 christos #endif /* MBS_SUPPORT */ 2857 1.1 christos return (size_t) -1; 2858 1.1 christos } 2859 1.1 christos s = 0; 2860 1.1 christos } 2861 1.1 christos else if ((t = d->fails[s])) 2862 1.1 christos { 2863 1.1 christos if (d->success[s] & sbit[*p]) 2864 1.1 christos { 2865 1.1 christos if (backref) 2866 1.1 christos *backref = (d->states[s].backref != 0); 2867 1.1 christos #ifdef MBS_SUPPORT 2868 1.1 christos if (MB_CUR_MAX > 1) 2869 1.1 christos { 2870 1.1 christos free(mblen_buf); 2871 1.1 christos free(inputwcs); 2872 1.1 christos } 2873 1.1 christos #endif /* MBS_SUPPORT */ 2874 1.1 christos return (char const *) p - begin; 2875 1.1 christos } 2876 1.1 christos 2877 1.1 christos #ifdef MBS_SUPPORT 2878 1.1 christos if (MB_CUR_MAX > 1) 2879 1.1 christos { 2880 1.1 christos SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p); 2881 1.1 christos if (d->states[s].mbps.nelem != 0) 2882 1.1 christos { 2883 1.1 christos /* Can match with a multibyte character( and multi 2884 1.1 christos character collating element). */ 2885 1.1 christos unsigned char const *nextp; 2886 1.1 christos nextp = p; 2887 1.1 christos s = transit_state(d, s, &nextp); 2888 1.1 christos p = nextp; 2889 1.1 christos 2890 1.1 christos /* Trans table might be updated. */ 2891 1.1 christos trans = d->trans; 2892 1.1 christos } 2893 1.1 christos else 2894 1.1 christos s = t[*p++]; 2895 1.1 christos } 2896 1.1 christos else 2897 1.1 christos #endif /* MBS_SUPPORT */ 2898 1.1 christos s = t[*p++]; 2899 1.1 christos } 2900 1.1 christos else 2901 1.1 christos { 2902 1.1 christos build_state(s, d); 2903 1.1 christos trans = d->trans; 2904 1.1 christos } 2905 1.1 christos } 2906 1.1 christos } 2907 1.1 christos 2908 1.1 christos /* Initialize the components of a dfa that the other routines don't 2909 1.1 christos initialize for themselves. */ 2910 1.1 christos void 2911 1.1 christos dfainit (struct dfa *d) 2912 1.1 christos { 2913 1.1 christos d->calloc = 1; 2914 1.1 christos MALLOC(d->charclasses, charclass, d->calloc); 2915 1.1 christos d->cindex = 0; 2916 1.1 christos 2917 1.1 christos d->talloc = 1; 2918 1.1 christos MALLOC(d->tokens, token, d->talloc); 2919 1.1 christos d->tindex = d->depth = d->nleaves = d->nregexps = 0; 2920 1.1 christos #ifdef MBS_SUPPORT 2921 1.1 christos if (MB_CUR_MAX > 1) 2922 1.1 christos { 2923 1.1 christos d->nmultibyte_prop = 1; 2924 1.1 christos MALLOC(d->multibyte_prop, int, d->nmultibyte_prop); 2925 1.1 christos d->nmbcsets = 0; 2926 1.1 christos d->mbcsets_alloc = 1; 2927 1.1 christos MALLOC(d->mbcsets, struct mb_char_classes, d->mbcsets_alloc); 2928 1.1 christos } 2929 1.1 christos #endif 2930 1.1 christos 2931 1.1 christos d->searchflag = 0; 2932 1.1 christos d->tralloc = 0; 2933 1.1 christos 2934 1.1 christos d->musts = 0; 2935 1.1 christos } 2936 1.1 christos 2937 1.1 christos /* Parse and analyze a single string of the given length. */ 2938 1.1 christos void 2939 1.1 christos dfacomp (char const *s, size_t len, struct dfa *d, int searchflag) 2940 1.1 christos { 2941 1.1 christos if (case_fold) /* dummy folding in service of dfamust() */ 2942 1.1 christos { 2943 1.1 christos char *lcopy; 2944 1.1 christos int i; 2945 1.1 christos 2946 1.1 christos lcopy = malloc(len); 2947 1.1 christos if (!lcopy) 2948 1.1 christos dfaerror(_("out of memory")); 2949 1.1 christos 2950 1.1 christos /* This is a kludge. */ 2951 1.1 christos case_fold = 0; 2952 1.1 christos for (i = 0; i < len; ++i) 2953 1.1 christos if (ISUPPER ((unsigned char) s[i])) 2954 1.1 christos lcopy[i] = tolower ((unsigned char) s[i]); 2955 1.1 christos else 2956 1.1 christos lcopy[i] = s[i]; 2957 1.1 christos 2958 1.1 christos dfainit(d); 2959 1.1 christos dfaparse(lcopy, len, d); 2960 1.1 christos free(lcopy); 2961 1.1 christos dfamust(d); 2962 1.1 christos d->cindex = d->tindex = d->depth = d->nleaves = d->nregexps = 0; 2963 1.1 christos case_fold = 1; 2964 1.1 christos dfaparse(s, len, d); 2965 1.1 christos dfaanalyze(d, searchflag); 2966 1.1 christos } 2967 1.1 christos else 2968 1.1 christos { 2969 1.1 christos dfainit(d); 2970 1.1 christos dfaparse(s, len, d); 2971 1.1 christos dfamust(d); 2972 1.1 christos dfaanalyze(d, searchflag); 2973 1.1 christos } 2974 1.1 christos } 2975 1.1 christos 2976 1.1 christos /* Free the storage held by the components of a dfa. */ 2977 1.1 christos void 2978 1.1 christos dfafree (struct dfa *d) 2979 1.1 christos { 2980 1.1 christos int i; 2981 1.1 christos struct dfamust *dm, *ndm; 2982 1.1 christos 2983 1.1 christos free((ptr_t) d->charclasses); 2984 1.1 christos free((ptr_t) d->tokens); 2985 1.1 christos 2986 1.1 christos #ifdef MBS_SUPPORT 2987 1.1 christos if (MB_CUR_MAX > 1) 2988 1.1 christos { 2989 1.1 christos free((ptr_t) d->multibyte_prop); 2990 1.1 christos for (i = 0; i < d->nmbcsets; ++i) 2991 1.1 christos { 2992 1.1 christos int j; 2993 1.1 christos struct mb_char_classes *p = &(d->mbcsets[i]); 2994 1.1 christos if (p->chars != NULL) 2995 1.1 christos free(p->chars); 2996 1.1 christos if (p->ch_classes != NULL) 2997 1.1 christos free(p->ch_classes); 2998 1.1 christos if (p->range_sts != NULL) 2999 1.1 christos free(p->range_sts); 3000 1.1 christos if (p->range_ends != NULL) 3001 1.1 christos free(p->range_ends); 3002 1.1 christos 3003 1.1 christos for (j = 0; j < p->nequivs; ++j) 3004 1.1 christos free(p->equivs[j]); 3005 1.1 christos if (p->equivs != NULL) 3006 1.1 christos free(p->equivs); 3007 1.1 christos 3008 1.1 christos for (j = 0; j < p->ncoll_elems; ++j) 3009 1.1 christos free(p->coll_elems[j]); 3010 1.1 christos if (p->coll_elems != NULL) 3011 1.1 christos free(p->coll_elems); 3012 1.1 christos } 3013 1.1 christos free((ptr_t) d->mbcsets); 3014 1.1 christos } 3015 1.1 christos #endif /* MBS_SUPPORT */ 3016 1.1 christos 3017 1.1 christos for (i = 0; i < d->sindex; ++i) 3018 1.1 christos free((ptr_t) d->states[i].elems.elems); 3019 1.1 christos free((ptr_t) d->states); 3020 1.1 christos for (i = 0; i < d->tindex; ++i) 3021 1.1 christos if (d->follows[i].elems) 3022 1.1 christos free((ptr_t) d->follows[i].elems); 3023 1.1 christos free((ptr_t) d->follows); 3024 1.1 christos for (i = 0; i < d->tralloc; ++i) 3025 1.1 christos if (d->trans[i]) 3026 1.1 christos free((ptr_t) d->trans[i]); 3027 1.1 christos else if (d->fails[i]) 3028 1.1 christos free((ptr_t) d->fails[i]); 3029 1.1 christos if (d->realtrans) free((ptr_t) d->realtrans); 3030 1.1 christos if (d->fails) free((ptr_t) d->fails); 3031 1.1 christos if (d->success) free((ptr_t) d->success); 3032 1.1 christos for (dm = d->musts; dm; dm = ndm) 3033 1.1 christos { 3034 1.1 christos ndm = dm->next; 3035 1.1 christos free(dm->must); 3036 1.1 christos free((ptr_t) dm); 3037 1.1 christos } 3038 1.1 christos } 3039 1.1 christos 3040 1.1 christos /* Having found the postfix representation of the regular expression, 3041 1.1 christos try to find a long sequence of characters that must appear in any line 3042 1.1 christos containing the r.e. 3043 1.1 christos Finding a "longest" sequence is beyond the scope here; 3044 1.1 christos we take an easy way out and hope for the best. 3045 1.1 christos (Take "(ab|a)b"--please.) 3046 1.1 christos 3047 1.1 christos We do a bottom-up calculation of sequences of characters that must appear 3048 1.1 christos in matches of r.e.'s represented by trees rooted at the nodes of the postfix 3049 1.1 christos representation: 3050 1.1 christos sequences that must appear at the left of the match ("left") 3051 1.1 christos sequences that must appear at the right of the match ("right") 3052 1.1 christos lists of sequences that must appear somewhere in the match ("in") 3053 1.1 christos sequences that must constitute the match ("is") 3054 1.1 christos 3055 1.1 christos When we get to the root of the tree, we use one of the longest of its 3056 1.1 christos calculated "in" sequences as our answer. The sequence we find is returned in 3057 1.1 christos d->must (where "d" is the single argument passed to "dfamust"); 3058 1.1 christos the length of the sequence is returned in d->mustn. 3059 1.1 christos 3060 1.1 christos The sequences calculated for the various types of node (in pseudo ANSI c) 3061 1.1 christos are shown below. "p" is the operand of unary operators (and the left-hand 3062 1.1 christos operand of binary operators); "q" is the right-hand operand of binary 3063 1.1 christos operators. 3064 1.1 christos 3065 1.1 christos "ZERO" means "a zero-length sequence" below. 3066 1.1 christos 3067 1.1 christos Type left right is in 3068 1.1 christos ---- ---- ----- -- -- 3069 1.1 christos char c # c # c # c # c 3070 1.1 christos 3071 1.1 christos ANYCHAR ZERO ZERO ZERO ZERO 3072 1.1 christos 3073 1.1 christos MBCSET ZERO ZERO ZERO ZERO 3074 1.1 christos 3075 1.1 christos CSET ZERO ZERO ZERO ZERO 3076 1.1 christos 3077 1.1 christos STAR ZERO ZERO ZERO ZERO 3078 1.1 christos 3079 1.1 christos QMARK ZERO ZERO ZERO ZERO 3080 1.1 christos 3081 1.1 christos PLUS p->left p->right ZERO p->in 3082 1.1 christos 3083 1.1 christos CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus 3084 1.1 christos p->left : q->right : q->is!=ZERO) ? q->in plus 3085 1.1 christos p->is##q->left p->right##q->is p->is##q->is : p->right##q->left 3086 1.1 christos ZERO 3087 1.1 christos 3088 1.1 christos OR longest common longest common (do p->is and substrings common to 3089 1.1 christos leading trailing q->is have same p->in and q->in 3090 1.1 christos (sub)sequence (sub)sequence length and 3091 1.1 christos of p->left of p->right content) ? 3092 1.1 christos and q->left and q->right p->is : NULL 3093 1.1 christos 3094 1.1 christos If there's anything else we recognize in the tree, all four sequences get set 3095 1.1 christos to zero-length sequences. If there's something we don't recognize in the tree, 3096 1.1 christos we just return a zero-length sequence. 3097 1.1 christos 3098 1.1 christos Break ties in favor of infrequent letters (choosing 'zzz' in preference to 3099 1.1 christos 'aaa')? 3100 1.1 christos 3101 1.1 christos And. . .is it here or someplace that we might ponder "optimizations" such as 3102 1.1 christos egrep 'psi|epsilon' -> egrep 'psi' 3103 1.1 christos egrep 'pepsi|epsilon' -> egrep 'epsi' 3104 1.1 christos (Yes, we now find "epsi" as a "string 3105 1.1 christos that must occur", but we might also 3106 1.1 christos simplify the *entire* r.e. being sought) 3107 1.1 christos grep '[c]' -> grep 'c' 3108 1.1 christos grep '(ab|a)b' -> grep 'ab' 3109 1.1 christos grep 'ab*' -> grep 'a' 3110 1.1 christos grep 'a*b' -> grep 'b' 3111 1.1 christos 3112 1.1 christos There are several issues: 3113 1.1 christos 3114 1.1 christos Is optimization easy (enough)? 3115 1.1 christos 3116 1.1 christos Does optimization actually accomplish anything, 3117 1.1 christos or is the automaton you get from "psi|epsilon" (for example) 3118 1.1 christos the same as the one you get from "psi" (for example)? 3119 1.1 christos 3120 1.1 christos Are optimizable r.e.'s likely to be used in real-life situations 3121 1.1 christos (something like 'ab*' is probably unlikely; something like is 3122 1.1 christos 'psi|epsilon' is likelier)? */ 3123 1.1 christos 3124 1.1 christos static char * 3125 1.1 christos icatalloc (char *old, char *new) 3126 1.1 christos { 3127 1.1 christos char *result; 3128 1.1 christos size_t oldsize, newsize; 3129 1.1 christos 3130 1.1 christos newsize = (new == NULL) ? 0 : strlen(new); 3131 1.1 christos if (old == NULL) 3132 1.1 christos oldsize = 0; 3133 1.1 christos else if (newsize == 0) 3134 1.1 christos return old; 3135 1.1 christos else oldsize = strlen(old); 3136 1.1 christos if (old == NULL) 3137 1.1 christos result = (char *) malloc(newsize + 1); 3138 1.1 christos else 3139 1.1 christos result = (char *) realloc((void *) old, oldsize + newsize + 1); 3140 1.1 christos if (result != NULL && new != NULL) 3141 1.1 christos (void) strcpy(result + oldsize, new); 3142 1.1 christos return result; 3143 1.1 christos } 3144 1.1 christos 3145 1.1 christos static char * 3146 1.1 christos icpyalloc (char *string) 3147 1.1 christos { 3148 1.1 christos return icatalloc((char *) NULL, string); 3149 1.1 christos } 3150 1.1 christos 3151 1.1 christos static char * 3152 1.1 christos istrstr (char *lookin, char *lookfor) 3153 1.1 christos { 3154 1.1 christos char *cp; 3155 1.1 christos size_t len; 3156 1.1 christos 3157 1.1 christos len = strlen(lookfor); 3158 1.1 christos for (cp = lookin; *cp != '\0'; ++cp) 3159 1.1 christos if (strncmp(cp, lookfor, len) == 0) 3160 1.1 christos return cp; 3161 1.1 christos return NULL; 3162 1.1 christos } 3163 1.1 christos 3164 1.1 christos static void 3165 1.1 christos ifree (char *cp) 3166 1.1 christos { 3167 1.1 christos if (cp != NULL) 3168 1.1 christos free(cp); 3169 1.1 christos } 3170 1.1 christos 3171 1.1 christos static void 3172 1.1 christos freelist (char **cpp) 3173 1.1 christos { 3174 1.1 christos int i; 3175 1.1 christos 3176 1.1 christos if (cpp == NULL) 3177 1.1 christos return; 3178 1.1 christos for (i = 0; cpp[i] != NULL; ++i) 3179 1.1 christos { 3180 1.1 christos free(cpp[i]); 3181 1.1 christos cpp[i] = NULL; 3182 1.1 christos } 3183 1.1 christos } 3184 1.1 christos 3185 1.1 christos static char ** 3186 1.1 christos enlist (char **cpp, char *new, size_t len) 3187 1.1 christos { 3188 1.1 christos int i, j; 3189 1.1 christos 3190 1.1 christos if (cpp == NULL) 3191 1.1 christos return NULL; 3192 1.1 christos if ((new = icpyalloc(new)) == NULL) 3193 1.1 christos { 3194 1.1 christos freelist(cpp); 3195 1.1 christos return NULL; 3196 1.1 christos } 3197 1.1 christos new[len] = '\0'; 3198 1.1 christos /* Is there already something in the list that's new (or longer)? */ 3199 1.1 christos for (i = 0; cpp[i] != NULL; ++i) 3200 1.1 christos if (istrstr(cpp[i], new) != NULL) 3201 1.1 christos { 3202 1.1 christos free(new); 3203 1.1 christos return cpp; 3204 1.1 christos } 3205 1.1 christos /* Eliminate any obsoleted strings. */ 3206 1.1 christos j = 0; 3207 1.1 christos while (cpp[j] != NULL) 3208 1.1 christos if (istrstr(new, cpp[j]) == NULL) 3209 1.1 christos ++j; 3210 1.1 christos else 3211 1.1 christos { 3212 1.1 christos free(cpp[j]); 3213 1.1 christos if (--i == j) 3214 1.1 christos break; 3215 1.1 christos cpp[j] = cpp[i]; 3216 1.1 christos cpp[i] = NULL; 3217 1.1 christos } 3218 1.1 christos /* Add the new string. */ 3219 1.1 christos cpp = (char **) realloc((char *) cpp, (i + 2) * sizeof *cpp); 3220 1.1 christos if (cpp == NULL) 3221 1.1 christos return NULL; 3222 1.1 christos cpp[i] = new; 3223 1.1 christos cpp[i + 1] = NULL; 3224 1.1 christos return cpp; 3225 1.1 christos } 3226 1.1 christos 3227 1.1 christos /* Given pointers to two strings, return a pointer to an allocated 3228 1.1 christos list of their distinct common substrings. Return NULL if something 3229 1.1 christos seems wild. */ 3230 1.1 christos static char ** 3231 1.1 christos comsubs (char *left, char *right) 3232 1.1 christos { 3233 1.1 christos char **cpp; 3234 1.1 christos char *lcp; 3235 1.1 christos char *rcp; 3236 1.1 christos size_t i, len; 3237 1.1 christos 3238 1.1 christos if (left == NULL || right == NULL) 3239 1.1 christos return NULL; 3240 1.1 christos cpp = (char **) malloc(sizeof *cpp); 3241 1.1 christos if (cpp == NULL) 3242 1.1 christos return NULL; 3243 1.1 christos cpp[0] = NULL; 3244 1.1 christos for (lcp = left; *lcp != '\0'; ++lcp) 3245 1.1 christos { 3246 1.1 christos len = 0; 3247 1.1 christos rcp = strchr (right, *lcp); 3248 1.1 christos while (rcp != NULL) 3249 1.1 christos { 3250 1.1 christos for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i) 3251 1.1 christos continue; 3252 1.1 christos if (i > len) 3253 1.1 christos len = i; 3254 1.1 christos rcp = strchr (rcp + 1, *lcp); 3255 1.1 christos } 3256 1.1 christos if (len == 0) 3257 1.1 christos continue; 3258 1.1 christos if ((cpp = enlist(cpp, lcp, len)) == NULL) 3259 1.1 christos break; 3260 1.1 christos } 3261 1.1 christos return cpp; 3262 1.1 christos } 3263 1.1 christos 3264 1.1 christos static char ** 3265 1.1 christos addlists (char **old, char **new) 3266 1.1 christos { 3267 1.1 christos int i; 3268 1.1 christos 3269 1.1 christos if (old == NULL || new == NULL) 3270 1.1 christos return NULL; 3271 1.1 christos for (i = 0; new[i] != NULL; ++i) 3272 1.1 christos { 3273 1.1 christos old = enlist(old, new[i], strlen(new[i])); 3274 1.1 christos if (old == NULL) 3275 1.1 christos break; 3276 1.1 christos } 3277 1.1 christos return old; 3278 1.1 christos } 3279 1.1 christos 3280 1.1 christos /* Given two lists of substrings, return a new list giving substrings 3281 1.1 christos common to both. */ 3282 1.1 christos static char ** 3283 1.1 christos inboth (char **left, char **right) 3284 1.1 christos { 3285 1.1 christos char **both; 3286 1.1 christos char **temp; 3287 1.1 christos int lnum, rnum; 3288 1.1 christos 3289 1.1 christos if (left == NULL || right == NULL) 3290 1.1 christos return NULL; 3291 1.1 christos both = (char **) malloc(sizeof *both); 3292 1.1 christos if (both == NULL) 3293 1.1 christos return NULL; 3294 1.1 christos both[0] = NULL; 3295 1.1 christos for (lnum = 0; left[lnum] != NULL; ++lnum) 3296 1.1 christos { 3297 1.1 christos for (rnum = 0; right[rnum] != NULL; ++rnum) 3298 1.1 christos { 3299 1.1 christos temp = comsubs(left[lnum], right[rnum]); 3300 1.1 christos if (temp == NULL) 3301 1.1 christos { 3302 1.1 christos freelist(both); 3303 1.1 christos return NULL; 3304 1.1 christos } 3305 1.1 christos both = addlists(both, temp); 3306 1.1 christos freelist(temp); 3307 1.1 christos free(temp); 3308 1.1 christos if (both == NULL) 3309 1.1 christos return NULL; 3310 1.1 christos } 3311 1.1 christos } 3312 1.1 christos return both; 3313 1.1 christos } 3314 1.1 christos 3315 1.1 christos typedef struct 3316 1.1 christos { 3317 1.1 christos char **in; 3318 1.1 christos char *left; 3319 1.1 christos char *right; 3320 1.1 christos char *is; 3321 1.1 christos } must; 3322 1.1 christos 3323 1.1 christos static void 3324 1.1 christos resetmust (must *mp) 3325 1.1 christos { 3326 1.1 christos mp->left[0] = mp->right[0] = mp->is[0] = '\0'; 3327 1.1 christos freelist(mp->in); 3328 1.1 christos } 3329 1.1 christos 3330 1.1 christos static void 3331 1.1 christos dfamust (struct dfa *dfa) 3332 1.1 christos { 3333 1.1 christos must *musts; 3334 1.1 christos must *mp; 3335 1.1 christos char *result; 3336 1.1 christos int ri; 3337 1.1 christos int i; 3338 1.1 christos int exact; 3339 1.1 christos token t; 3340 1.1 christos static must must0; 3341 1.1 christos struct dfamust *dm; 3342 1.1 christos static char empty_string[] = ""; 3343 1.1 christos 3344 1.1 christos result = empty_string; 3345 1.1 christos exact = 0; 3346 1.1 christos musts = (must *) malloc((dfa->tindex + 1) * sizeof *musts); 3347 1.1 christos if (musts == NULL) 3348 1.1 christos return; 3349 1.1 christos mp = musts; 3350 1.1 christos for (i = 0; i <= dfa->tindex; ++i) 3351 1.1 christos mp[i] = must0; 3352 1.1 christos for (i = 0; i <= dfa->tindex; ++i) 3353 1.1 christos { 3354 1.1 christos mp[i].in = (char **) malloc(sizeof *mp[i].in); 3355 1.1 christos mp[i].left = malloc(2); 3356 1.1 christos mp[i].right = malloc(2); 3357 1.1 christos mp[i].is = malloc(2); 3358 1.1 christos if (mp[i].in == NULL || mp[i].left == NULL || 3359 1.1 christos mp[i].right == NULL || mp[i].is == NULL) 3360 1.1 christos goto done; 3361 1.1 christos mp[i].left[0] = mp[i].right[0] = mp[i].is[0] = '\0'; 3362 1.1 christos mp[i].in[0] = NULL; 3363 1.1 christos } 3364 1.1 christos #ifdef DEBUG 3365 1.1 christos fprintf(stderr, "dfamust:\n"); 3366 1.1 christos for (i = 0; i < dfa->tindex; ++i) 3367 1.1 christos { 3368 1.1 christos fprintf(stderr, " %d:", i); 3369 1.1 christos prtok(dfa->tokens[i]); 3370 1.1 christos } 3371 1.1 christos putc('\n', stderr); 3372 1.1 christos #endif 3373 1.1 christos for (ri = 0; ri < dfa->tindex; ++ri) 3374 1.1 christos { 3375 1.1 christos switch (t = dfa->tokens[ri]) 3376 1.1 christos { 3377 1.1 christos case LPAREN: 3378 1.1 christos case RPAREN: 3379 1.1 christos goto done; /* "cannot happen" */ 3380 1.1 christos case EMPTY: 3381 1.1 christos case BEGLINE: 3382 1.1 christos case ENDLINE: 3383 1.1 christos case BEGWORD: 3384 1.1 christos case ENDWORD: 3385 1.1 christos case LIMWORD: 3386 1.1 christos case NOTLIMWORD: 3387 1.1 christos case BACKREF: 3388 1.1 christos resetmust(mp); 3389 1.1 christos break; 3390 1.1 christos case STAR: 3391 1.1 christos case QMARK: 3392 1.1 christos if (mp <= musts) 3393 1.1 christos goto done; /* "cannot happen" */ 3394 1.1 christos --mp; 3395 1.1 christos resetmust(mp); 3396 1.1 christos break; 3397 1.1 christos case OR: 3398 1.1 christos case ORTOP: 3399 1.1 christos if (mp < &musts[2]) 3400 1.1 christos goto done; /* "cannot happen" */ 3401 1.1 christos { 3402 1.1 christos char **new; 3403 1.1 christos must *lmp; 3404 1.1 christos must *rmp; 3405 1.1 christos int j, ln, rn, n; 3406 1.1 christos 3407 1.1 christos rmp = --mp; 3408 1.1 christos lmp = --mp; 3409 1.1 christos /* Guaranteed to be. Unlikely, but. . . */ 3410 1.1 christos if (strcmp(lmp->is, rmp->is) != 0) 3411 1.1 christos lmp->is[0] = '\0'; 3412 1.1 christos /* Left side--easy */ 3413 1.1 christos i = 0; 3414 1.1 christos while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i]) 3415 1.1 christos ++i; 3416 1.1 christos lmp->left[i] = '\0'; 3417 1.1 christos /* Right side */ 3418 1.1 christos ln = strlen(lmp->right); 3419 1.1 christos rn = strlen(rmp->right); 3420 1.1 christos n = ln; 3421 1.1 christos if (n > rn) 3422 1.1 christos n = rn; 3423 1.1 christos for (i = 0; i < n; ++i) 3424 1.1 christos if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1]) 3425 1.1 christos break; 3426 1.1 christos for (j = 0; j < i; ++j) 3427 1.1 christos lmp->right[j] = lmp->right[(ln - i) + j]; 3428 1.1 christos lmp->right[j] = '\0'; 3429 1.1 christos new = inboth(lmp->in, rmp->in); 3430 1.1 christos if (new == NULL) 3431 1.1 christos goto done; 3432 1.1 christos freelist(lmp->in); 3433 1.1 christos free((char *) lmp->in); 3434 1.1 christos lmp->in = new; 3435 1.1 christos } 3436 1.1 christos break; 3437 1.1 christos case PLUS: 3438 1.1 christos if (mp <= musts) 3439 1.1 christos goto done; /* "cannot happen" */ 3440 1.1 christos --mp; 3441 1.1 christos mp->is[0] = '\0'; 3442 1.1 christos break; 3443 1.1 christos case END: 3444 1.1 christos if (mp != &musts[1]) 3445 1.1 christos goto done; /* "cannot happen" */ 3446 1.1 christos for (i = 0; musts[0].in[i] != NULL; ++i) 3447 1.1 christos if (strlen(musts[0].in[i]) > strlen(result)) 3448 1.1 christos result = musts[0].in[i]; 3449 1.1 christos if (strcmp(result, musts[0].is) == 0) 3450 1.1 christos exact = 1; 3451 1.1 christos goto done; 3452 1.1 christos case CAT: 3453 1.1 christos if (mp < &musts[2]) 3454 1.1 christos goto done; /* "cannot happen" */ 3455 1.1 christos { 3456 1.1 christos must *lmp; 3457 1.1 christos must *rmp; 3458 1.1 christos 3459 1.1 christos rmp = --mp; 3460 1.1 christos lmp = --mp; 3461 1.1 christos /* In. Everything in left, plus everything in 3462 1.1 christos right, plus catenation of 3463 1.1 christos left's right and right's left. */ 3464 1.1 christos lmp->in = addlists(lmp->in, rmp->in); 3465 1.1 christos if (lmp->in == NULL) 3466 1.1 christos goto done; 3467 1.1 christos if (lmp->right[0] != '\0' && 3468 1.1 christos rmp->left[0] != '\0') 3469 1.1 christos { 3470 1.1 christos char *tp; 3471 1.1 christos 3472 1.1 christos tp = icpyalloc(lmp->right); 3473 1.1 christos if (tp == NULL) 3474 1.1 christos goto done; 3475 1.1 christos tp = icatalloc(tp, rmp->left); 3476 1.1 christos if (tp == NULL) 3477 1.1 christos goto done; 3478 1.1 christos lmp->in = enlist(lmp->in, tp, 3479 1.1 christos strlen(tp)); 3480 1.1 christos free(tp); 3481 1.1 christos if (lmp->in == NULL) 3482 1.1 christos goto done; 3483 1.1 christos } 3484 1.1 christos /* Left-hand */ 3485 1.1 christos if (lmp->is[0] != '\0') 3486 1.1 christos { 3487 1.1 christos lmp->left = icatalloc(lmp->left, 3488 1.1 christos rmp->left); 3489 1.1 christos if (lmp->left == NULL) 3490 1.1 christos goto done; 3491 1.1 christos } 3492 1.1 christos /* Right-hand */ 3493 1.1 christos if (rmp->is[0] == '\0') 3494 1.1 christos lmp->right[0] = '\0'; 3495 1.1 christos lmp->right = icatalloc(lmp->right, rmp->right); 3496 1.1 christos if (lmp->right == NULL) 3497 1.1 christos goto done; 3498 1.1 christos /* Guaranteed to be */ 3499 1.1 christos if (lmp->is[0] != '\0' && rmp->is[0] != '\0') 3500 1.1 christos { 3501 1.1 christos lmp->is = icatalloc(lmp->is, rmp->is); 3502 1.1 christos if (lmp->is == NULL) 3503 1.1 christos goto done; 3504 1.1 christos } 3505 1.1 christos else 3506 1.1 christos lmp->is[0] = '\0'; 3507 1.1 christos } 3508 1.1 christos break; 3509 1.1 christos default: 3510 1.1 christos if (t < END) 3511 1.1 christos { 3512 1.1 christos /* "cannot happen" */ 3513 1.1 christos goto done; 3514 1.1 christos } 3515 1.1 christos else if (t == '\0') 3516 1.1 christos { 3517 1.1 christos /* not on *my* shift */ 3518 1.1 christos goto done; 3519 1.1 christos } 3520 1.1 christos else if (t >= CSET 3521 1.1 christos #ifdef MBS_SUPPORT 3522 1.1 christos || t == ANYCHAR 3523 1.1 christos || t == MBCSET 3524 1.1 christos #endif /* MBS_SUPPORT */ 3525 1.1 christos ) 3526 1.1 christos { 3527 1.1 christos /* easy enough */ 3528 1.1 christos resetmust(mp); 3529 1.1 christos } 3530 1.1 christos else 3531 1.1 christos { 3532 1.1 christos /* plain character */ 3533 1.1 christos resetmust(mp); 3534 1.1 christos mp->is[0] = mp->left[0] = mp->right[0] = t; 3535 1.1 christos mp->is[1] = mp->left[1] = mp->right[1] = '\0'; 3536 1.1 christos mp->in = enlist(mp->in, mp->is, (size_t)1); 3537 1.1 christos if (mp->in == NULL) 3538 1.1 christos goto done; 3539 1.1 christos } 3540 1.1 christos break; 3541 1.1 christos } 3542 1.1 christos #ifdef DEBUG 3543 1.1 christos fprintf(stderr, " node: %d:", ri); 3544 1.1 christos prtok(dfa->tokens[ri]); 3545 1.1 christos fprintf(stderr, "\n in:"); 3546 1.1 christos for (i = 0; mp->in[i]; ++i) 3547 1.1 christos fprintf(stderr, " \"%s\"", mp->in[i]); 3548 1.1 christos fprintf(stderr, "\n is: \"%s\"\n", mp->is); 3549 1.1 christos fprintf(stderr, " left: \"%s\"\n", mp->left); 3550 1.1 christos fprintf(stderr, " right: \"%s\"\n", mp->right); 3551 1.1 christos #endif 3552 1.1 christos ++mp; 3553 1.1 christos } 3554 1.1 christos done: 3555 1.1 christos if (strlen(result)) 3556 1.1 christos { 3557 1.1 christos dm = (struct dfamust *) malloc(sizeof (struct dfamust)); 3558 1.1 christos dm->exact = exact; 3559 1.1 christos dm->must = malloc(strlen(result) + 1); 3560 1.1 christos strcpy(dm->must, result); 3561 1.1 christos dm->next = dfa->musts; 3562 1.1 christos dfa->musts = dm; 3563 1.1 christos } 3564 1.1 christos mp = musts; 3565 1.1 christos for (i = 0; i <= dfa->tindex; ++i) 3566 1.1 christos { 3567 1.1 christos freelist(mp[i].in); 3568 1.1 christos ifree((char *) mp[i].in); 3569 1.1 christos ifree(mp[i].left); 3570 1.1 christos ifree(mp[i].right); 3571 1.1 christos ifree(mp[i].is); 3572 1.1 christos } 3573 1.1 christos free((char *) mp); 3574 1.1 christos } 3575 1.1 christos /* vim:set shiftwidth=2: */ 3576