1 1.3 christos /* $NetBSD: ure.c,v 1.4 2025/09/05 21:16:23 christos Exp $ */ 2 1.2 christos 3 1.2 christos /* $OpenLDAP$ */ 4 1.1 lukem /* This work is part of OpenLDAP Software <http://www.openldap.org/>. 5 1.1 lukem * 6 1.4 christos * Copyright 1998-2024 The OpenLDAP Foundation. 7 1.1 lukem * All rights reserved. 8 1.1 lukem * 9 1.1 lukem * Redistribution and use in source and binary forms, with or without 10 1.1 lukem * modification, are permitted only as authorized by the OpenLDAP 11 1.1 lukem * Public License. 12 1.1 lukem * 13 1.1 lukem * A copy of this license is available in file LICENSE in the 14 1.1 lukem * top-level directory of the distribution or, alternatively, at 15 1.1 lukem * <http://www.OpenLDAP.org/license.html>. 16 1.1 lukem */ 17 1.1 lukem /* Copyright 1997, 1998, 1999 Computing Research Labs, 18 1.1 lukem * New Mexico State University 19 1.1 lukem * 20 1.1 lukem * Permission is hereby granted, free of charge, to any person obtaining a 21 1.1 lukem * copy of this software and associated documentation files (the "Software"), 22 1.1 lukem * to deal in the Software without restriction, including without limitation 23 1.1 lukem * the rights to use, copy, modify, merge, publish, distribute, sublicense, 24 1.1 lukem * and/or sell copies of the Software, and to permit persons to whom the 25 1.1 lukem * Software is furnished to do so, subject to the following conditions: 26 1.1 lukem * 27 1.1 lukem * The above copyright notice and this permission notice shall be included in 28 1.1 lukem * all copies or substantial portions of the Software. 29 1.1 lukem * 30 1.1 lukem * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 31 1.1 lukem * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 32 1.1 lukem * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 33 1.1 lukem * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY 34 1.1 lukem * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT 35 1.1 lukem * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR 36 1.1 lukem * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 37 1.1 lukem */ 38 1.2 christos /* Id: ure.c,v 1.2 1999/09/21 15:47:43 mleisher Exp " */ 39 1.2 christos 40 1.2 christos #include <sys/cdefs.h> 41 1.3 christos __RCSID("$NetBSD: ure.c,v 1.4 2025/09/05 21:16:23 christos Exp $"); 42 1.1 lukem 43 1.1 lukem #include "portable.h" 44 1.1 lukem 45 1.1 lukem #include <ac/stdlib.h> 46 1.1 lukem #include <ac/string.h> 47 1.1 lukem #include <ac/unistd.h> 48 1.1 lukem 49 1.1 lukem #include "ure.h" 50 1.1 lukem 51 1.1 lukem /* 52 1.1 lukem * Flags used internally in the DFA. 53 1.1 lukem */ 54 1.1 lukem #define _URE_DFA_CASEFOLD 0x01 55 1.1 lukem #define _URE_DFA_BLANKLINE 0x02 56 1.1 lukem 57 1.1 lukem static unsigned long cclass_flags[] = { 58 1.1 lukem 0, 59 1.1 lukem _URE_NONSPACING, 60 1.1 lukem _URE_COMBINING, 61 1.1 lukem _URE_NUMDIGIT, 62 1.1 lukem _URE_NUMOTHER, 63 1.1 lukem _URE_SPACESEP, 64 1.1 lukem _URE_LINESEP, 65 1.1 lukem _URE_PARASEP, 66 1.1 lukem _URE_CNTRL, 67 1.1 lukem _URE_PUA, 68 1.1 lukem _URE_UPPER, 69 1.1 lukem _URE_LOWER, 70 1.1 lukem _URE_TITLE, 71 1.1 lukem _URE_MODIFIER, 72 1.1 lukem _URE_OTHERLETTER, 73 1.1 lukem _URE_DASHPUNCT, 74 1.1 lukem _URE_OPENPUNCT, 75 1.1 lukem _URE_CLOSEPUNCT, 76 1.1 lukem _URE_OTHERPUNCT, 77 1.1 lukem _URE_MATHSYM, 78 1.1 lukem _URE_CURRENCYSYM, 79 1.1 lukem _URE_OTHERSYM, 80 1.1 lukem _URE_LTR, 81 1.1 lukem _URE_RTL, 82 1.1 lukem _URE_EURONUM, 83 1.1 lukem _URE_EURONUMSEP, 84 1.1 lukem _URE_EURONUMTERM, 85 1.1 lukem _URE_ARABNUM, 86 1.1 lukem _URE_COMMONSEP, 87 1.1 lukem _URE_BLOCKSEP, 88 1.1 lukem _URE_SEGMENTSEP, 89 1.1 lukem _URE_WHITESPACE, 90 1.1 lukem _URE_OTHERNEUT, 91 1.1 lukem }; 92 1.1 lukem 93 1.1 lukem /* 94 1.1 lukem * Symbol types for the DFA. 95 1.1 lukem */ 96 1.1 lukem #define _URE_ANY_CHAR 1 97 1.1 lukem #define _URE_CHAR 2 98 1.1 lukem #define _URE_CCLASS 3 99 1.1 lukem #define _URE_NCCLASS 4 100 1.1 lukem #define _URE_BOL_ANCHOR 5 101 1.1 lukem #define _URE_EOL_ANCHOR 6 102 1.1 lukem 103 1.1 lukem /* 104 1.1 lukem * Op codes for converting the NFA to a DFA. 105 1.1 lukem */ 106 1.1 lukem #define _URE_SYMBOL 10 107 1.1 lukem #define _URE_PAREN 11 108 1.1 lukem #define _URE_QUEST 12 109 1.1 lukem #define _URE_STAR 13 110 1.1 lukem #define _URE_PLUS 14 111 1.1 lukem #define _URE_ONE 15 112 1.1 lukem #define _URE_AND 16 113 1.1 lukem #define _URE_OR 17 114 1.1 lukem 115 1.1 lukem #define _URE_NOOP 0xffff 116 1.1 lukem 117 1.1 lukem #define _URE_REGSTART 0x8000 118 1.1 lukem #define _URE_REGEND 0x4000 119 1.1 lukem 120 1.1 lukem /* 121 1.1 lukem * Structure used to handle a compacted range of characters. 122 1.1 lukem */ 123 1.1 lukem typedef struct { 124 1.1 lukem ucs4_t min_code; 125 1.1 lukem ucs4_t max_code; 126 1.1 lukem } _ure_range_t; 127 1.1 lukem 128 1.1 lukem typedef struct { 129 1.1 lukem _ure_range_t *ranges; 130 1.1 lukem ucs2_t ranges_used; 131 1.1 lukem ucs2_t ranges_size; 132 1.1 lukem } _ure_ccl_t; 133 1.1 lukem 134 1.1 lukem typedef union { 135 1.1 lukem ucs4_t chr; 136 1.1 lukem _ure_ccl_t ccl; 137 1.1 lukem } _ure_sym_t; 138 1.1 lukem 139 1.1 lukem /* 140 1.1 lukem * This is a general element structure used for expressions and stack 141 1.1 lukem * elements. 142 1.1 lukem */ 143 1.1 lukem typedef struct { 144 1.1 lukem ucs2_t reg; 145 1.1 lukem ucs2_t onstack; 146 1.1 lukem ucs2_t type; 147 1.1 lukem ucs2_t lhs; 148 1.1 lukem ucs2_t rhs; 149 1.1 lukem } _ure_elt_t; 150 1.1 lukem 151 1.1 lukem /* 152 1.1 lukem * This is a structure used to track a list or a stack of states. 153 1.1 lukem */ 154 1.1 lukem typedef struct { 155 1.1 lukem ucs2_t *slist; 156 1.1 lukem ucs2_t slist_size; 157 1.1 lukem ucs2_t slist_used; 158 1.1 lukem } _ure_stlist_t; 159 1.1 lukem 160 1.1 lukem /* 161 1.1 lukem * Structure to track the list of unique states for a symbol 162 1.1 lukem * during reduction. 163 1.1 lukem */ 164 1.1 lukem typedef struct { 165 1.1 lukem ucs2_t id; 166 1.1 lukem ucs2_t type; 167 1.1 lukem unsigned long mods; 168 1.1 lukem unsigned long props; 169 1.1 lukem _ure_sym_t sym; 170 1.1 lukem _ure_stlist_t states; 171 1.1 lukem } _ure_symtab_t; 172 1.1 lukem 173 1.1 lukem /* 174 1.1 lukem * Structure to hold a single state. 175 1.1 lukem */ 176 1.1 lukem typedef struct { 177 1.1 lukem ucs2_t id; 178 1.1 lukem ucs2_t accepting; 179 1.1 lukem ucs2_t pad; 180 1.1 lukem _ure_stlist_t st; 181 1.1 lukem _ure_elt_t *trans; 182 1.1 lukem ucs2_t trans_size; 183 1.1 lukem ucs2_t trans_used; 184 1.1 lukem } _ure_state_t; 185 1.1 lukem 186 1.1 lukem /* 187 1.1 lukem * Structure used for keeping lists of states. 188 1.1 lukem */ 189 1.1 lukem typedef struct { 190 1.1 lukem _ure_state_t *states; 191 1.1 lukem ucs2_t states_size; 192 1.1 lukem ucs2_t states_used; 193 1.1 lukem } _ure_statetable_t; 194 1.1 lukem 195 1.1 lukem /* 196 1.1 lukem * Structure to track pairs of DFA states when equivalent states are 197 1.1 lukem * merged. 198 1.1 lukem */ 199 1.1 lukem typedef struct { 200 1.1 lukem ucs2_t l; 201 1.1 lukem ucs2_t r; 202 1.1 lukem } _ure_equiv_t; 203 1.1 lukem 204 1.1 lukem /* 205 1.1 lukem * Structure used for constructing the NFA and reducing to a minimal DFA. 206 1.1 lukem */ 207 1.1 lukem typedef struct _ure_buffer_t { 208 1.1 lukem int reducing; 209 1.1 lukem int error; 210 1.1 lukem unsigned long flags; 211 1.1 lukem 212 1.1 lukem _ure_stlist_t stack; 213 1.1 lukem 214 1.1 lukem /* 215 1.1 lukem * Table of unique symbols encountered. 216 1.1 lukem */ 217 1.1 lukem _ure_symtab_t *symtab; 218 1.1 lukem ucs2_t symtab_size; 219 1.1 lukem ucs2_t symtab_used; 220 1.1 lukem 221 1.1 lukem /* 222 1.1 lukem * Tracks the unique expressions generated for the NFA and when the NFA is 223 1.1 lukem * reduced. 224 1.1 lukem */ 225 1.1 lukem _ure_elt_t *expr; 226 1.1 lukem ucs2_t expr_used; 227 1.1 lukem ucs2_t expr_size; 228 1.1 lukem 229 1.1 lukem /* 230 1.1 lukem * The reduced table of unique groups of NFA states. 231 1.1 lukem */ 232 1.1 lukem _ure_statetable_t states; 233 1.1 lukem 234 1.1 lukem /* 235 1.1 lukem * Tracks states when equivalent states are merged. 236 1.1 lukem */ 237 1.1 lukem _ure_equiv_t *equiv; 238 1.1 lukem ucs2_t equiv_used; 239 1.1 lukem ucs2_t equiv_size; 240 1.1 lukem } _ure_buffer_t; 241 1.1 lukem 242 1.1 lukem typedef struct { 243 1.1 lukem ucs2_t symbol; 244 1.1 lukem ucs2_t next_state; 245 1.1 lukem } _ure_trans_t; 246 1.1 lukem 247 1.1 lukem typedef struct { 248 1.1 lukem ucs2_t accepting; 249 1.1 lukem ucs2_t ntrans; 250 1.1 lukem _ure_trans_t *trans; 251 1.1 lukem } _ure_dstate_t; 252 1.1 lukem 253 1.1 lukem typedef struct _ure_dfa_t { 254 1.1 lukem unsigned long flags; 255 1.1 lukem 256 1.1 lukem _ure_symtab_t *syms; 257 1.1 lukem ucs2_t nsyms; 258 1.1 lukem 259 1.1 lukem _ure_dstate_t *states; 260 1.1 lukem ucs2_t nstates; 261 1.1 lukem 262 1.1 lukem _ure_trans_t *trans; 263 1.1 lukem ucs2_t ntrans; 264 1.1 lukem } _ure_dfa_t; 265 1.1 lukem 266 1.1 lukem /************************************************************************* 267 1.1 lukem * 268 1.1 lukem * Functions. 269 1.1 lukem * 270 1.1 lukem *************************************************************************/ 271 1.1 lukem 272 1.1 lukem static void 273 1.1 lukem _ure_memmove(char *dest, char *src, unsigned long bytes) 274 1.1 lukem { 275 1.1 lukem long i, j; 276 1.1 lukem 277 1.1 lukem i = (long) bytes; 278 1.1 lukem j = i & 7; 279 1.1 lukem i = (i + 7) >> 3; 280 1.1 lukem 281 1.1 lukem /* 282 1.1 lukem * Do a memmove using Ye Olde Duff's Device for efficiency. 283 1.1 lukem */ 284 1.1 lukem if (src < dest) { 285 1.1 lukem src += bytes; 286 1.1 lukem dest += bytes; 287 1.1 lukem 288 1.1 lukem switch (j) { 289 1.1 lukem case 0: do { 290 1.1 lukem *--dest = *--src; 291 1.1 lukem case 7: *--dest = *--src; 292 1.1 lukem case 6: *--dest = *--src; 293 1.1 lukem case 5: *--dest = *--src; 294 1.1 lukem case 4: *--dest = *--src; 295 1.1 lukem case 3: *--dest = *--src; 296 1.1 lukem case 2: *--dest = *--src; 297 1.1 lukem case 1: *--dest = *--src; 298 1.1 lukem } while (--i > 0); 299 1.1 lukem } 300 1.1 lukem } else if (src > dest) { 301 1.1 lukem switch (j) { 302 1.1 lukem case 0: do { 303 1.1 lukem *dest++ = *src++; 304 1.1 lukem case 7: *dest++ = *src++; 305 1.1 lukem case 6: *dest++ = *src++; 306 1.1 lukem case 5: *dest++ = *src++; 307 1.1 lukem case 4: *dest++ = *src++; 308 1.1 lukem case 3: *dest++ = *src++; 309 1.1 lukem case 2: *dest++ = *src++; 310 1.1 lukem case 1: *dest++ = *src++; 311 1.1 lukem } while (--i > 0); 312 1.1 lukem } 313 1.1 lukem } 314 1.1 lukem } 315 1.1 lukem 316 1.1 lukem static void 317 1.1 lukem _ure_push(ucs2_t v, _ure_buffer_t *b) 318 1.1 lukem { 319 1.1 lukem _ure_stlist_t *s; 320 1.1 lukem 321 1.1 lukem if (b == 0) 322 1.1 lukem return; 323 1.1 lukem 324 1.1 lukem /* 325 1.1 lukem * If the `reducing' parameter is non-zero, check to see if the value 326 1.1 lukem * passed is already on the stack. 327 1.1 lukem */ 328 1.1 lukem if (b->reducing != 0 && b->expr[v].onstack != 0) 329 1.1 lukem return; 330 1.1 lukem 331 1.1 lukem s = &b->stack; 332 1.1 lukem if (s->slist_used == s->slist_size) { 333 1.1 lukem if (s->slist_size == 0) 334 1.1 lukem s->slist = (ucs2_t *) malloc(sizeof(ucs2_t) << 3); 335 1.1 lukem else 336 1.1 lukem s->slist = (ucs2_t *) realloc((char *) s->slist, 337 1.1 lukem sizeof(ucs2_t) * (s->slist_size + 8)); 338 1.1 lukem s->slist_size += 8; 339 1.1 lukem } 340 1.1 lukem s->slist[s->slist_used++] = v; 341 1.1 lukem 342 1.1 lukem /* 343 1.1 lukem * If the `reducing' parameter is non-zero, flag the element as being on 344 1.1 lukem * the stack. 345 1.1 lukem */ 346 1.1 lukem if (b->reducing != 0) 347 1.1 lukem b->expr[v].onstack = 1; 348 1.1 lukem } 349 1.1 lukem 350 1.1 lukem static ucs2_t 351 1.1 lukem _ure_peek(_ure_buffer_t *b) 352 1.1 lukem { 353 1.1 lukem if (b == 0 || b->stack.slist_used == 0) 354 1.1 lukem return _URE_NOOP; 355 1.1 lukem 356 1.1 lukem return b->stack.slist[b->stack.slist_used - 1]; 357 1.1 lukem } 358 1.1 lukem 359 1.1 lukem static ucs2_t 360 1.1 lukem _ure_pop(_ure_buffer_t *b) 361 1.1 lukem { 362 1.1 lukem ucs2_t v; 363 1.1 lukem 364 1.1 lukem if (b == 0 || b->stack.slist_used == 0) 365 1.1 lukem return _URE_NOOP; 366 1.1 lukem 367 1.1 lukem v = b->stack.slist[--b->stack.slist_used]; 368 1.1 lukem if (b->reducing) 369 1.1 lukem b->expr[v].onstack = 0; 370 1.1 lukem 371 1.1 lukem return v; 372 1.1 lukem } 373 1.1 lukem 374 1.1 lukem /************************************************************************* 375 1.1 lukem * 376 1.1 lukem * Start symbol parse functions. 377 1.1 lukem * 378 1.1 lukem *************************************************************************/ 379 1.1 lukem 380 1.1 lukem /* 381 1.1 lukem * Parse a comma-separated list of integers that represent character 382 1.1 lukem * properties. Combine them into a mask that is returned in the `mask' 383 1.1 lukem * variable, and return the number of characters consumed. 384 1.1 lukem */ 385 1.1 lukem static unsigned long 386 1.1 lukem _ure_prop_list(ucs2_t *pp, unsigned long limit, unsigned long *mask, 387 1.1 lukem _ure_buffer_t *b) 388 1.1 lukem { 389 1.1 lukem unsigned long n, m; 390 1.1 lukem ucs2_t *sp, *ep; 391 1.1 lukem 392 1.1 lukem sp = pp; 393 1.1 lukem ep = sp + limit; 394 1.1 lukem 395 1.1 lukem for (m = n = 0; b->error == _URE_OK && sp < ep; sp++) { 396 1.1 lukem if (*sp == ',') { 397 1.1 lukem /* 398 1.1 lukem * Encountered a comma, so select the next character property flag 399 1.1 lukem * and reset the number. 400 1.1 lukem */ 401 1.1 lukem m |= cclass_flags[n]; 402 1.1 lukem n = 0; 403 1.1 lukem } else if (*sp >= '0' && *sp <= '9') 404 1.1 lukem /* 405 1.1 lukem * Encountered a digit, so start or continue building the cardinal 406 1.1 lukem * that represents the character property flag. 407 1.1 lukem */ 408 1.1 lukem n = (n * 10) + (*sp - '0'); 409 1.1 lukem else 410 1.1 lukem /* 411 1.1 lukem * Encountered something that is not part of the property list. 412 1.1 lukem * Indicate that we are done. 413 1.1 lukem */ 414 1.1 lukem break; 415 1.1 lukem 416 1.1 lukem /* 417 1.1 lukem * If a property number greater than 32 occurs, then there is a 418 1.1 lukem * problem. Most likely a missing comma separator. 419 1.1 lukem */ 420 1.1 lukem if (n > 32) 421 1.1 lukem b->error = _URE_INVALID_PROPERTY; 422 1.1 lukem } 423 1.1 lukem 424 1.2 christos if (b->error == _URE_OK && n != 0) 425 1.1 lukem m |= cclass_flags[n]; 426 1.1 lukem 427 1.1 lukem /* 428 1.1 lukem * Set the mask that represents the group of character properties. 429 1.1 lukem */ 430 1.1 lukem *mask = m; 431 1.1 lukem 432 1.1 lukem /* 433 1.1 lukem * Return the number of characters consumed. 434 1.1 lukem */ 435 1.1 lukem return sp - pp; 436 1.1 lukem } 437 1.1 lukem 438 1.1 lukem /* 439 1.1 lukem * Collect a hex number with 1 to 4 digits and return the number 440 1.1 lukem * of characters used. 441 1.1 lukem */ 442 1.1 lukem static unsigned long 443 1.1 lukem _ure_hex(ucs2_t *np, unsigned long limit, ucs4_t *n) 444 1.1 lukem { 445 1.1 lukem ucs2_t i; 446 1.1 lukem ucs2_t *sp, *ep; 447 1.1 lukem ucs4_t nn; 448 1.1 lukem 449 1.1 lukem sp = np; 450 1.1 lukem ep = sp + limit; 451 1.1 lukem 452 1.1 lukem for (nn = 0, i = 0; i < 4 && sp < ep; i++, sp++) { 453 1.1 lukem if (*sp >= '0' && *sp <= '9') 454 1.1 lukem nn = (nn << 4) + (*sp - '0'); 455 1.1 lukem else if (*sp >= 'A' && *sp <= 'F') 456 1.1 lukem nn = (nn << 4) + ((*sp - 'A') + 10); 457 1.1 lukem else if (*sp >= 'a' && *sp <= 'f') 458 1.1 lukem nn = (nn << 4) + ((*sp - 'a') + 10); 459 1.1 lukem else 460 1.1 lukem /* 461 1.1 lukem * Encountered something that is not a hex digit. 462 1.1 lukem */ 463 1.1 lukem break; 464 1.1 lukem } 465 1.1 lukem 466 1.1 lukem /* 467 1.1 lukem * Assign the character code collected and return the number of 468 1.1 lukem * characters used. 469 1.1 lukem */ 470 1.1 lukem *n = nn; 471 1.1 lukem 472 1.1 lukem return sp - np; 473 1.1 lukem } 474 1.1 lukem 475 1.1 lukem /* 476 1.1 lukem * Insert a range into a character class, removing duplicates and ordering 477 1.1 lukem * them in increasing range-start order. 478 1.1 lukem */ 479 1.1 lukem static void 480 1.1 lukem _ure_add_range(_ure_ccl_t *ccl, _ure_range_t *r, _ure_buffer_t *b) 481 1.1 lukem { 482 1.1 lukem ucs2_t i; 483 1.1 lukem ucs4_t tmp; 484 1.1 lukem _ure_range_t *rp; 485 1.1 lukem 486 1.1 lukem /* 487 1.1 lukem * If the `casefold' flag is set, then make sure both endpoints of the 488 1.1 lukem * range are converted to lower case. 489 1.1 lukem */ 490 1.1 lukem if (b->flags & _URE_DFA_CASEFOLD) { 491 1.1 lukem r->min_code = _ure_tolower(r->min_code); 492 1.1 lukem r->max_code = _ure_tolower(r->max_code); 493 1.1 lukem } 494 1.1 lukem 495 1.1 lukem /* 496 1.1 lukem * Swap the range endpoints if they are not in increasing order. 497 1.1 lukem */ 498 1.1 lukem if (r->min_code > r->max_code) { 499 1.1 lukem tmp = r->min_code; 500 1.1 lukem r->min_code = r->max_code; 501 1.1 lukem r->max_code = tmp; 502 1.1 lukem } 503 1.1 lukem 504 1.1 lukem for (i = 0, rp = ccl->ranges; 505 1.1 lukem i < ccl->ranges_used && r->min_code < rp->min_code; i++, rp++) ; 506 1.1 lukem 507 1.1 lukem /* 508 1.1 lukem * Check for a duplicate. 509 1.1 lukem */ 510 1.1 lukem if (i < ccl->ranges_used && 511 1.1 lukem r->min_code == rp->min_code && r->max_code == rp->max_code) 512 1.1 lukem return; 513 1.1 lukem 514 1.1 lukem if (ccl->ranges_used == ccl->ranges_size) { 515 1.1 lukem if (ccl->ranges_size == 0) 516 1.1 lukem ccl->ranges = (_ure_range_t *) malloc(sizeof(_ure_range_t) << 3); 517 1.1 lukem else 518 1.1 lukem ccl->ranges = (_ure_range_t *) 519 1.1 lukem realloc((char *) ccl->ranges, 520 1.1 lukem sizeof(_ure_range_t) * (ccl->ranges_size + 8)); 521 1.1 lukem ccl->ranges_size += 8; 522 1.1 lukem } 523 1.1 lukem 524 1.1 lukem rp = ccl->ranges + ccl->ranges_used; 525 1.1 lukem 526 1.1 lukem if (i < ccl->ranges_used) 527 1.1 lukem _ure_memmove((char *) (rp + 1), (char *) rp, 528 1.1 lukem sizeof(_ure_range_t) * (ccl->ranges_used - i)); 529 1.1 lukem 530 1.1 lukem ccl->ranges_used++; 531 1.1 lukem rp->min_code = r->min_code; 532 1.1 lukem rp->max_code = r->max_code; 533 1.1 lukem } 534 1.1 lukem 535 1.1 lukem #define _URE_ALPHA_MASK (_URE_UPPER|_URE_LOWER|_URE_OTHERLETTER|\ 536 1.1 lukem _URE_MODIFIER|_URE_TITLE|_URE_NONSPACING|_URE_COMBINING) 537 1.1 lukem #define _URE_ALNUM_MASK (_URE_ALPHA_MASK|_URE_NUMDIGIT) 538 1.1 lukem #define _URE_PUNCT_MASK (_URE_DASHPUNCT|_URE_OPENPUNCT|_URE_CLOSEPUNCT|\ 539 1.1 lukem _URE_OTHERPUNCT) 540 1.1 lukem #define _URE_GRAPH_MASK (_URE_NUMDIGIT|_URE_NUMOTHER|_URE_ALPHA_MASK|\ 541 1.1 lukem _URE_MATHSYM|_URE_CURRENCYSYM|_URE_OTHERSYM) 542 1.1 lukem #define _URE_PRINT_MASK (_URE_GRAPH_MASK|_URE_SPACESEP) 543 1.1 lukem #define _URE_SPACE_MASK (_URE_SPACESEP|_URE_LINESEP|_URE_PARASEP) 544 1.1 lukem 545 1.1 lukem typedef void (*_ure_cclsetup_t)( 546 1.1 lukem _ure_symtab_t *sym, 547 1.1 lukem unsigned long mask, 548 1.1 lukem _ure_buffer_t *b 549 1.1 lukem ); 550 1.1 lukem 551 1.1 lukem typedef struct { 552 1.1 lukem ucs2_t key; 553 1.1 lukem unsigned long len; 554 1.1 lukem unsigned long next; 555 1.1 lukem _ure_cclsetup_t func; 556 1.1 lukem unsigned long mask; 557 1.1 lukem } _ure_trie_t; 558 1.1 lukem 559 1.1 lukem static void 560 1.1 lukem _ure_ccl_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b) 561 1.1 lukem { 562 1.1 lukem sym->props |= mask; 563 1.1 lukem } 564 1.1 lukem 565 1.1 lukem static void 566 1.1 lukem _ure_space_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b) 567 1.1 lukem { 568 1.1 lukem _ure_range_t range; 569 1.1 lukem 570 1.1 lukem sym->props |= mask; 571 1.1 lukem 572 1.1 lukem /* 573 1.1 lukem * Add the additional characters needed for handling isspace(). 574 1.1 lukem */ 575 1.1 lukem range.min_code = range.max_code = '\t'; 576 1.1 lukem _ure_add_range(&sym->sym.ccl, &range, b); 577 1.1 lukem range.min_code = range.max_code = '\r'; 578 1.1 lukem _ure_add_range(&sym->sym.ccl, &range, b); 579 1.1 lukem range.min_code = range.max_code = '\n'; 580 1.1 lukem _ure_add_range(&sym->sym.ccl, &range, b); 581 1.1 lukem range.min_code = range.max_code = '\f'; 582 1.1 lukem _ure_add_range(&sym->sym.ccl, &range, b); 583 1.1 lukem range.min_code = range.max_code = 0xfeff; 584 1.1 lukem _ure_add_range(&sym->sym.ccl, &range, b); 585 1.1 lukem } 586 1.1 lukem 587 1.1 lukem static void 588 1.1 lukem _ure_xdigit_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b) 589 1.1 lukem { 590 1.1 lukem _ure_range_t range; 591 1.1 lukem 592 1.1 lukem /* 593 1.1 lukem * Add the additional characters needed for handling isxdigit(). 594 1.1 lukem */ 595 1.1 lukem range.min_code = '0'; 596 1.1 lukem range.max_code = '9'; 597 1.1 lukem _ure_add_range(&sym->sym.ccl, &range, b); 598 1.1 lukem range.min_code = 'A'; 599 1.1 lukem range.max_code = 'F'; 600 1.1 lukem _ure_add_range(&sym->sym.ccl, &range, b); 601 1.1 lukem range.min_code = 'a'; 602 1.1 lukem range.max_code = 'f'; 603 1.1 lukem _ure_add_range(&sym->sym.ccl, &range, b); 604 1.1 lukem } 605 1.1 lukem 606 1.1 lukem static _ure_trie_t cclass_trie[] = { 607 1.1 lukem {0x003a, 1, 1, 0, 0}, 608 1.1 lukem {0x0061, 9, 10, 0, 0}, 609 1.1 lukem {0x0063, 8, 19, 0, 0}, 610 1.1 lukem {0x0064, 7, 24, 0, 0}, 611 1.1 lukem {0x0067, 6, 29, 0, 0}, 612 1.1 lukem {0x006c, 5, 34, 0, 0}, 613 1.1 lukem {0x0070, 4, 39, 0, 0}, 614 1.1 lukem {0x0073, 3, 49, 0, 0}, 615 1.1 lukem {0x0075, 2, 54, 0, 0}, 616 1.1 lukem {0x0078, 1, 59, 0, 0}, 617 1.1 lukem {0x006c, 1, 11, 0, 0}, 618 1.1 lukem {0x006e, 2, 13, 0, 0}, 619 1.1 lukem {0x0070, 1, 16, 0, 0}, 620 1.1 lukem {0x0075, 1, 14, 0, 0}, 621 1.1 lukem {0x006d, 1, 15, 0, 0}, 622 1.1 lukem {0x003a, 1, 16, _ure_ccl_setup, _URE_ALNUM_MASK}, 623 1.1 lukem {0x0068, 1, 17, 0, 0}, 624 1.1 lukem {0x0061, 1, 18, 0, 0}, 625 1.1 lukem {0x003a, 1, 19, _ure_ccl_setup, _URE_ALPHA_MASK}, 626 1.1 lukem {0x006e, 1, 20, 0, 0}, 627 1.1 lukem {0x0074, 1, 21, 0, 0}, 628 1.1 lukem {0x0072, 1, 22, 0, 0}, 629 1.1 lukem {0x006c, 1, 23, 0, 0}, 630 1.1 lukem {0x003a, 1, 24, _ure_ccl_setup, _URE_CNTRL}, 631 1.1 lukem {0x0069, 1, 25, 0, 0}, 632 1.1 lukem {0x0067, 1, 26, 0, 0}, 633 1.1 lukem {0x0069, 1, 27, 0, 0}, 634 1.1 lukem {0x0074, 1, 28, 0, 0}, 635 1.1 lukem {0x003a, 1, 29, _ure_ccl_setup, _URE_NUMDIGIT}, 636 1.1 lukem {0x0072, 1, 30, 0, 0}, 637 1.1 lukem {0x0061, 1, 31, 0, 0}, 638 1.1 lukem {0x0070, 1, 32, 0, 0}, 639 1.1 lukem {0x0068, 1, 33, 0, 0}, 640 1.1 lukem {0x003a, 1, 34, _ure_ccl_setup, _URE_GRAPH_MASK}, 641 1.1 lukem {0x006f, 1, 35, 0, 0}, 642 1.1 lukem {0x0077, 1, 36, 0, 0}, 643 1.1 lukem {0x0065, 1, 37, 0, 0}, 644 1.1 lukem {0x0072, 1, 38, 0, 0}, 645 1.1 lukem {0x003a, 1, 39, _ure_ccl_setup, _URE_LOWER}, 646 1.1 lukem {0x0072, 2, 41, 0, 0}, 647 1.1 lukem {0x0075, 1, 45, 0, 0}, 648 1.1 lukem {0x0069, 1, 42, 0, 0}, 649 1.1 lukem {0x006e, 1, 43, 0, 0}, 650 1.1 lukem {0x0074, 1, 44, 0, 0}, 651 1.1 lukem {0x003a, 1, 45, _ure_ccl_setup, _URE_PRINT_MASK}, 652 1.1 lukem {0x006e, 1, 46, 0, 0}, 653 1.1 lukem {0x0063, 1, 47, 0, 0}, 654 1.1 lukem {0x0074, 1, 48, 0, 0}, 655 1.1 lukem {0x003a, 1, 49, _ure_ccl_setup, _URE_PUNCT_MASK}, 656 1.1 lukem {0x0070, 1, 50, 0, 0}, 657 1.1 lukem {0x0061, 1, 51, 0, 0}, 658 1.1 lukem {0x0063, 1, 52, 0, 0}, 659 1.1 lukem {0x0065, 1, 53, 0, 0}, 660 1.1 lukem {0x003a, 1, 54, _ure_space_setup, _URE_SPACE_MASK}, 661 1.1 lukem {0x0070, 1, 55, 0, 0}, 662 1.1 lukem {0x0070, 1, 56, 0, 0}, 663 1.1 lukem {0x0065, 1, 57, 0, 0}, 664 1.1 lukem {0x0072, 1, 58, 0, 0}, 665 1.1 lukem {0x003a, 1, 59, _ure_ccl_setup, _URE_UPPER}, 666 1.1 lukem {0x0064, 1, 60, 0, 0}, 667 1.1 lukem {0x0069, 1, 61, 0, 0}, 668 1.1 lukem {0x0067, 1, 62, 0, 0}, 669 1.1 lukem {0x0069, 1, 63, 0, 0}, 670 1.1 lukem {0x0074, 1, 64, 0, 0}, 671 1.1 lukem {0x003a, 1, 65, _ure_xdigit_setup, 0}, 672 1.1 lukem }; 673 1.1 lukem 674 1.1 lukem /* 675 1.1 lukem * Probe for one of the POSIX colon delimited character classes in the static 676 1.1 lukem * trie. 677 1.1 lukem */ 678 1.1 lukem static unsigned long 679 1.1 lukem _ure_posix_ccl(ucs2_t *cp, unsigned long limit, _ure_symtab_t *sym, 680 1.1 lukem _ure_buffer_t *b) 681 1.1 lukem { 682 1.1 lukem int i; 683 1.1 lukem unsigned long n; 684 1.1 lukem _ure_trie_t *tp; 685 1.1 lukem ucs2_t *sp, *ep; 686 1.1 lukem 687 1.1 lukem /* 688 1.1 lukem * If the number of characters left is less than 7, then this cannot be 689 1.1 lukem * interpreted as one of the colon delimited classes. 690 1.1 lukem */ 691 1.1 lukem if (limit < 7) 692 1.1 lukem return 0; 693 1.1 lukem 694 1.1 lukem sp = cp; 695 1.1 lukem ep = sp + limit; 696 1.1 lukem tp = cclass_trie; 697 1.1 lukem for (i = 0; sp < ep && i < 8; i++, sp++) { 698 1.1 lukem n = tp->len; 699 1.1 lukem 700 1.1 lukem for (; n > 0 && tp->key != *sp; tp++, n--) ; 701 1.1 lukem 702 1.1 lukem if (n == 0) 703 1.1 lukem return 0; 704 1.1 lukem 705 1.1 lukem if (*sp == ':' && (i == 6 || i == 7)) { 706 1.1 lukem sp++; 707 1.1 lukem break; 708 1.1 lukem } 709 1.1 lukem if (sp + 1 < ep) 710 1.1 lukem tp = cclass_trie + tp->next; 711 1.1 lukem } 712 1.1 lukem if (tp->func == 0) 713 1.1 lukem return 0; 714 1.1 lukem 715 1.1 lukem (*tp->func)(sym, tp->mask, b); 716 1.1 lukem 717 1.1 lukem return sp - cp; 718 1.1 lukem } 719 1.1 lukem 720 1.1 lukem /* 721 1.1 lukem * Construct a list of ranges and return the number of characters consumed. 722 1.1 lukem */ 723 1.1 lukem static unsigned long 724 1.1 lukem _ure_cclass(ucs2_t *cp, unsigned long limit, _ure_symtab_t *symp, 725 1.1 lukem _ure_buffer_t *b) 726 1.1 lukem { 727 1.1 lukem int range_end; 728 1.1 lukem unsigned long n; 729 1.1 lukem ucs2_t *sp, *ep; 730 1.1 lukem ucs4_t c, last; 731 1.1 lukem _ure_ccl_t *cclp; 732 1.1 lukem _ure_range_t range; 733 1.1 lukem 734 1.1 lukem sp = cp; 735 1.1 lukem ep = sp + limit; 736 1.1 lukem 737 1.1 lukem if (*sp == '^') { 738 1.1 lukem symp->type = _URE_NCCLASS; 739 1.1 lukem sp++; 740 1.1 lukem } else 741 1.1 lukem symp->type = _URE_CCLASS; 742 1.1 lukem 743 1.1 lukem for (last = 0, range_end = 0; 744 1.1 lukem b->error == _URE_OK && sp < ep && *sp != ']'; ) { 745 1.1 lukem c = *sp++; 746 1.1 lukem if (c == '\\') { 747 1.1 lukem if (sp == ep) { 748 1.1 lukem /* 749 1.1 lukem * The EOS was encountered when expecting the reverse solidus 750 1.1 lukem * to be followed by the character it is escaping. Set an 751 1.1 lukem * error code and return the number of characters consumed up 752 1.1 lukem * to this point. 753 1.1 lukem */ 754 1.1 lukem b->error = _URE_UNEXPECTED_EOS; 755 1.1 lukem return sp - cp; 756 1.1 lukem } 757 1.1 lukem 758 1.1 lukem c = *sp++; 759 1.1 lukem switch (c) { 760 1.1 lukem case 'a': 761 1.1 lukem c = 0x07; 762 1.1 lukem break; 763 1.1 lukem case 'b': 764 1.1 lukem c = 0x08; 765 1.1 lukem break; 766 1.1 lukem case 'f': 767 1.1 lukem c = 0x0c; 768 1.1 lukem break; 769 1.1 lukem case 'n': 770 1.1 lukem c = 0x0a; 771 1.1 lukem break; 772 1.1 lukem case 'r': 773 1.1 lukem c = 0x0d; 774 1.1 lukem break; 775 1.1 lukem case 't': 776 1.1 lukem c = 0x09; 777 1.1 lukem break; 778 1.1 lukem case 'v': 779 1.1 lukem c = 0x0b; 780 1.1 lukem break; 781 1.1 lukem case 'p': 782 1.1 lukem case 'P': 783 1.1 lukem sp += _ure_prop_list(sp, ep - sp, &symp->props, b); 784 1.1 lukem /* 785 1.1 lukem * Invert the bit mask of the properties if this is a negated 786 1.1 lukem * character class or if 'P' is used to specify a list of 787 1.1 lukem * character properties that should *not* match in a 788 1.1 lukem * character class. 789 1.1 lukem */ 790 1.1 lukem if (c == 'P') 791 1.1 lukem symp->props = ~symp->props; 792 1.1 lukem continue; 793 1.1 lukem break; 794 1.1 lukem case 'x': 795 1.1 lukem case 'X': 796 1.1 lukem case 'u': 797 1.1 lukem case 'U': 798 1.1 lukem if (sp < ep && 799 1.1 lukem ((*sp >= '0' && *sp <= '9') || 800 1.1 lukem (*sp >= 'A' && *sp <= 'F') || 801 1.1 lukem (*sp >= 'a' && *sp <= 'f'))) 802 1.1 lukem sp += _ure_hex(sp, ep - sp, &c); 803 1.1 lukem } 804 1.1 lukem } else if (c == ':') { 805 1.1 lukem /* 806 1.1 lukem * Probe for a POSIX colon delimited character class. 807 1.1 lukem */ 808 1.1 lukem sp--; 809 1.1 lukem if ((n = _ure_posix_ccl(sp, ep - sp, symp, b)) == 0) 810 1.1 lukem sp++; 811 1.1 lukem else { 812 1.1 lukem sp += n; 813 1.1 lukem continue; 814 1.1 lukem } 815 1.1 lukem } 816 1.1 lukem 817 1.1 lukem cclp = &symp->sym.ccl; 818 1.1 lukem 819 1.1 lukem /* 820 1.1 lukem * Check to see if the current character is a low surrogate that needs 821 1.1 lukem * to be combined with a preceding high surrogate. 822 1.1 lukem */ 823 1.1 lukem if (last != 0) { 824 1.1 lukem if (c >= 0xdc00 && c <= 0xdfff) 825 1.1 lukem /* 826 1.1 lukem * Construct the UTF16 character code. 827 1.1 lukem */ 828 1.1 lukem c = 0x10000 + (((last & 0x03ff) << 10) | (c & 0x03ff)); 829 1.1 lukem else { 830 1.1 lukem /* 831 1.1 lukem * Add the isolated high surrogate to the range. 832 1.1 lukem */ 833 1.1 lukem if (range_end == 1) 834 1.1 lukem range.max_code = last & 0xffff; 835 1.1 lukem else 836 1.1 lukem range.min_code = range.max_code = last & 0xffff; 837 1.1 lukem 838 1.1 lukem _ure_add_range(cclp, &range, b); 839 1.1 lukem range_end = 0; 840 1.1 lukem } 841 1.1 lukem } 842 1.1 lukem 843 1.1 lukem /* 844 1.1 lukem * Clear the last character code. 845 1.1 lukem */ 846 1.1 lukem last = 0; 847 1.1 lukem 848 1.1 lukem /* 849 1.1 lukem * This slightly awkward code handles the different cases needed to 850 1.1 lukem * construct a range. 851 1.1 lukem */ 852 1.1 lukem if (c >= 0xd800 && c <= 0xdbff) { 853 1.1 lukem /* 854 1.1 lukem * If the high surrogate is followed by a range indicator, simply 855 1.1 lukem * add it as the range start. Otherwise, save it in case the next 856 1.1 lukem * character is a low surrogate. 857 1.1 lukem */ 858 1.1 lukem if (*sp == '-') { 859 1.1 lukem sp++; 860 1.1 lukem range.min_code = c; 861 1.1 lukem range_end = 1; 862 1.1 lukem } else 863 1.1 lukem last = c; 864 1.1 lukem } else if (range_end == 1) { 865 1.1 lukem range.max_code = c; 866 1.1 lukem _ure_add_range(cclp, &range, b); 867 1.1 lukem range_end = 0; 868 1.1 lukem } else { 869 1.1 lukem range.min_code = range.max_code = c; 870 1.1 lukem if (*sp == '-') { 871 1.1 lukem sp++; 872 1.1 lukem range_end = 1; 873 1.1 lukem } else 874 1.1 lukem _ure_add_range(cclp, &range, b); 875 1.1 lukem } 876 1.1 lukem } 877 1.1 lukem 878 1.1 lukem if (sp < ep && *sp == ']') 879 1.1 lukem sp++; 880 1.1 lukem else 881 1.1 lukem /* 882 1.1 lukem * The parse was not terminated by the character class close symbol 883 1.1 lukem * (']'), so set an error code. 884 1.1 lukem */ 885 1.1 lukem b->error = _URE_CCLASS_OPEN; 886 1.1 lukem 887 1.1 lukem return sp - cp; 888 1.1 lukem } 889 1.1 lukem 890 1.1 lukem /* 891 1.1 lukem * Probe for a low surrogate hex code. 892 1.1 lukem */ 893 1.1 lukem static unsigned long 894 1.1 lukem _ure_probe_ls(ucs2_t *ls, unsigned long limit, ucs4_t *c) 895 1.1 lukem { 896 1.1 lukem ucs4_t i, code; 897 1.1 lukem ucs2_t *sp, *ep; 898 1.1 lukem 899 1.1 lukem for (i = code = 0, sp = ls, ep = sp + limit; i < 4 && sp < ep; sp++) { 900 1.1 lukem if (*sp >= '0' && *sp <= '9') 901 1.1 lukem code = (code << 4) + (*sp - '0'); 902 1.1 lukem else if (*sp >= 'A' && *sp <= 'F') 903 1.1 lukem code = (code << 4) + ((*sp - 'A') + 10); 904 1.1 lukem else if (*sp >= 'a' && *sp <= 'f') 905 1.1 lukem code = (code << 4) + ((*sp - 'a') + 10); 906 1.1 lukem else 907 1.1 lukem break; 908 1.1 lukem } 909 1.1 lukem 910 1.1 lukem *c = code; 911 1.1 lukem return (0xdc00 <= code && code <= 0xdfff) ? sp - ls : 0; 912 1.1 lukem } 913 1.1 lukem 914 1.1 lukem static unsigned long 915 1.1 lukem _ure_compile_symbol(ucs2_t *sym, unsigned long limit, _ure_symtab_t *symp, 916 1.1 lukem _ure_buffer_t *b) 917 1.1 lukem { 918 1.1 lukem ucs4_t c; 919 1.1 lukem ucs2_t *sp, *ep; 920 1.1 lukem 921 1.1 lukem sp = sym; 922 1.1 lukem ep = sym + limit; 923 1.1 lukem 924 1.1 lukem if ((c = *sp++) == '\\') { 925 1.1 lukem 926 1.1 lukem if (sp == ep) { 927 1.1 lukem /* 928 1.1 lukem * The EOS was encountered when expecting the reverse solidus to 929 1.1 lukem * be followed by the character it is escaping. Set an error code 930 1.1 lukem * and return the number of characters consumed up to this point. 931 1.1 lukem */ 932 1.1 lukem b->error = _URE_UNEXPECTED_EOS; 933 1.1 lukem return sp - sym; 934 1.1 lukem } 935 1.1 lukem 936 1.1 lukem c = *sp++; 937 1.1 lukem switch (c) { 938 1.1 lukem case 'p': 939 1.1 lukem case 'P': 940 1.1 lukem symp->type = (c == 'p') ? _URE_CCLASS : _URE_NCCLASS; 941 1.1 lukem sp += _ure_prop_list(sp, ep - sp, &symp->props, b); 942 1.1 lukem break; 943 1.1 lukem case 'a': 944 1.1 lukem symp->type = _URE_CHAR; 945 1.1 lukem symp->sym.chr = 0x07; 946 1.1 lukem break; 947 1.1 lukem case 'b': 948 1.1 lukem symp->type = _URE_CHAR; 949 1.1 lukem symp->sym.chr = 0x08; 950 1.1 lukem break; 951 1.1 lukem case 'f': 952 1.1 lukem symp->type = _URE_CHAR; 953 1.1 lukem symp->sym.chr = 0x0c; 954 1.1 lukem break; 955 1.1 lukem case 'n': 956 1.1 lukem symp->type = _URE_CHAR; 957 1.1 lukem symp->sym.chr = 0x0a; 958 1.1 lukem break; 959 1.1 lukem case 'r': 960 1.1 lukem symp->type = _URE_CHAR; 961 1.1 lukem symp->sym.chr = 0x0d; 962 1.1 lukem break; 963 1.1 lukem case 't': 964 1.1 lukem symp->type = _URE_CHAR; 965 1.1 lukem symp->sym.chr = 0x09; 966 1.1 lukem break; 967 1.1 lukem case 'v': 968 1.1 lukem symp->type = _URE_CHAR; 969 1.1 lukem symp->sym.chr = 0x0b; 970 1.1 lukem break; 971 1.1 lukem case 'x': 972 1.1 lukem case 'X': 973 1.1 lukem case 'u': 974 1.1 lukem case 'U': 975 1.1 lukem /* 976 1.1 lukem * Collect between 1 and 4 digits representing a UCS2 code. Fall 977 1.1 lukem * through to the next case. 978 1.1 lukem */ 979 1.1 lukem if (sp < ep && 980 1.1 lukem ((*sp >= '0' && *sp <= '9') || 981 1.1 lukem (*sp >= 'A' && *sp <= 'F') || 982 1.1 lukem (*sp >= 'a' && *sp <= 'f'))) 983 1.1 lukem sp += _ure_hex(sp, ep - sp, &c); 984 1.1 lukem /* FALLTHROUGH */ 985 1.1 lukem default: 986 1.1 lukem /* 987 1.1 lukem * Simply add an escaped character here. 988 1.1 lukem */ 989 1.1 lukem symp->type = _URE_CHAR; 990 1.1 lukem symp->sym.chr = c; 991 1.1 lukem } 992 1.1 lukem } else if (c == '^' || c == '$') 993 1.1 lukem /* 994 1.1 lukem * Handle the BOL and EOL anchors. This actually consists simply of 995 1.1 lukem * setting a flag that indicates that the user supplied anchor match 996 1.1 lukem * function should be called. This needs to be done instead of simply 997 1.1 lukem * matching line/paragraph separators because beginning-of-text and 998 1.1 lukem * end-of-text tests are needed as well. 999 1.1 lukem */ 1000 1.1 lukem symp->type = (c == '^') ? _URE_BOL_ANCHOR : _URE_EOL_ANCHOR; 1001 1.1 lukem else if (c == '[') 1002 1.1 lukem /* 1003 1.1 lukem * Construct a character class. 1004 1.1 lukem */ 1005 1.1 lukem sp += _ure_cclass(sp, ep - sp, symp, b); 1006 1.1 lukem else if (c == '.') 1007 1.1 lukem symp->type = _URE_ANY_CHAR; 1008 1.1 lukem else { 1009 1.1 lukem symp->type = _URE_CHAR; 1010 1.1 lukem symp->sym.chr = c; 1011 1.1 lukem } 1012 1.1 lukem 1013 1.1 lukem /* 1014 1.1 lukem * If the symbol type happens to be a character and is a high surrogate, 1015 1.1 lukem * then probe forward to see if it is followed by a low surrogate that 1016 1.1 lukem * needs to be added. 1017 1.1 lukem */ 1018 1.1 lukem if (sp < ep && symp->type == _URE_CHAR && 1019 1.1 lukem 0xd800 <= symp->sym.chr && symp->sym.chr <= 0xdbff) { 1020 1.1 lukem 1021 1.1 lukem if (0xdc00 <= *sp && *sp <= 0xdfff) { 1022 1.1 lukem symp->sym.chr = 0x10000 + (((symp->sym.chr & 0x03ff) << 10) | 1023 1.1 lukem (*sp & 0x03ff)); 1024 1.1 lukem sp++; 1025 1.1 lukem } else if (*sp == '\\' && (*(sp + 1) == 'x' || *(sp + 1) == 'X' || 1026 1.1 lukem *(sp + 1) == 'u' || *(sp + 1) == 'U')) { 1027 1.1 lukem sp += _ure_probe_ls(sp + 2, ep - (sp + 2), &c); 1028 1.1 lukem if (0xdc00 <= c && c <= 0xdfff) { 1029 1.1 lukem /* 1030 1.1 lukem * Take into account the \[xu] in front of the hex code. 1031 1.1 lukem */ 1032 1.1 lukem sp += 2; 1033 1.1 lukem symp->sym.chr = 0x10000 + (((symp->sym.chr & 0x03ff) << 10) | 1034 1.1 lukem (c & 0x03ff)); 1035 1.1 lukem } 1036 1.1 lukem } 1037 1.1 lukem } 1038 1.1 lukem 1039 1.1 lukem /* 1040 1.1 lukem * Last, make sure any _URE_CHAR type symbols are changed to lower case if 1041 1.1 lukem * the `casefold' flag is set. 1042 1.1 lukem */ 1043 1.1 lukem if ((b->flags & _URE_DFA_CASEFOLD) && symp->type == _URE_CHAR) 1044 1.1 lukem symp->sym.chr = _ure_tolower(symp->sym.chr); 1045 1.1 lukem 1046 1.1 lukem /* 1047 1.1 lukem * If the symbol constructed is anything other than one of the anchors, 1048 1.1 lukem * make sure the _URE_DFA_BLANKLINE flag is removed. 1049 1.1 lukem */ 1050 1.1 lukem if (symp->type != _URE_BOL_ANCHOR && symp->type != _URE_EOL_ANCHOR) 1051 1.1 lukem b->flags &= ~_URE_DFA_BLANKLINE; 1052 1.1 lukem 1053 1.1 lukem /* 1054 1.1 lukem * Return the number of characters consumed. 1055 1.1 lukem */ 1056 1.1 lukem return sp - sym; 1057 1.1 lukem } 1058 1.1 lukem 1059 1.1 lukem static int 1060 1.1 lukem _ure_sym_neq(_ure_symtab_t *a, _ure_symtab_t *b) 1061 1.1 lukem { 1062 1.1 lukem if (a->type != b->type || a->mods != b->mods || a->props != b->props) 1063 1.1 lukem return 1; 1064 1.1 lukem 1065 1.1 lukem if (a->type == _URE_CCLASS || a->type == _URE_NCCLASS) { 1066 1.1 lukem if (a->sym.ccl.ranges_used != b->sym.ccl.ranges_used) 1067 1.1 lukem return 1; 1068 1.1 lukem if (a->sym.ccl.ranges_used > 0 && 1069 1.1 lukem memcmp((char *) a->sym.ccl.ranges, (char *) b->sym.ccl.ranges, 1070 1.1 lukem sizeof(_ure_range_t) * a->sym.ccl.ranges_used) != 0) 1071 1.1 lukem return 1; 1072 1.1 lukem } else if (a->type == _URE_CHAR && a->sym.chr != b->sym.chr) 1073 1.1 lukem return 1; 1074 1.1 lukem return 0; 1075 1.1 lukem } 1076 1.1 lukem 1077 1.1 lukem /* 1078 1.1 lukem * Construct a symbol, but only keep unique symbols. 1079 1.1 lukem */ 1080 1.1 lukem static ucs2_t 1081 1.1 lukem _ure_make_symbol(ucs2_t *sym, unsigned long limit, unsigned long *consumed, 1082 1.1 lukem _ure_buffer_t *b) 1083 1.1 lukem { 1084 1.1 lukem ucs2_t i; 1085 1.1 lukem _ure_symtab_t *sp, symbol; 1086 1.1 lukem 1087 1.1 lukem /* 1088 1.1 lukem * Build the next symbol so we can test to see if it is already in the 1089 1.1 lukem * symbol table. 1090 1.1 lukem */ 1091 1.1 lukem (void) memset((char *) &symbol, '\0', sizeof(_ure_symtab_t)); 1092 1.1 lukem *consumed = _ure_compile_symbol(sym, limit, &symbol, b); 1093 1.1 lukem 1094 1.1 lukem /* 1095 1.1 lukem * Check to see if the symbol exists. 1096 1.1 lukem */ 1097 1.1 lukem for (i = 0, sp = b->symtab; 1098 1.1 lukem i < b->symtab_used && _ure_sym_neq(&symbol, sp); i++, sp++) ; 1099 1.1 lukem 1100 1.1 lukem if (i < b->symtab_used) { 1101 1.1 lukem /* 1102 1.1 lukem * Free up any ranges used for the symbol. 1103 1.1 lukem */ 1104 1.1 lukem if ((symbol.type == _URE_CCLASS || symbol.type == _URE_NCCLASS) && 1105 1.1 lukem symbol.sym.ccl.ranges_size > 0) 1106 1.1 lukem free((char *) symbol.sym.ccl.ranges); 1107 1.1 lukem 1108 1.1 lukem return b->symtab[i].id; 1109 1.1 lukem } 1110 1.1 lukem 1111 1.1 lukem /* 1112 1.1 lukem * Need to add the new symbol. 1113 1.1 lukem */ 1114 1.1 lukem if (b->symtab_used == b->symtab_size) { 1115 1.1 lukem if (b->symtab_size == 0) 1116 1.1 lukem b->symtab = (_ure_symtab_t *) malloc(sizeof(_ure_symtab_t) << 3); 1117 1.1 lukem else 1118 1.1 lukem b->symtab = (_ure_symtab_t *) 1119 1.1 lukem realloc((char *) b->symtab, 1120 1.1 lukem sizeof(_ure_symtab_t) * (b->symtab_size + 8)); 1121 1.1 lukem sp = b->symtab + b->symtab_size; 1122 1.1 lukem (void) memset((char *) sp, '\0', sizeof(_ure_symtab_t) << 3); 1123 1.1 lukem b->symtab_size += 8; 1124 1.1 lukem } 1125 1.1 lukem 1126 1.1 lukem symbol.id = b->symtab_used++; 1127 1.1 lukem (void) AC_MEMCPY((char *) &b->symtab[symbol.id], (char *) &symbol, 1128 1.1 lukem sizeof(_ure_symtab_t)); 1129 1.1 lukem 1130 1.1 lukem return symbol.id; 1131 1.1 lukem } 1132 1.1 lukem 1133 1.1 lukem /************************************************************************* 1134 1.1 lukem * 1135 1.1 lukem * End symbol parse functions. 1136 1.1 lukem * 1137 1.1 lukem *************************************************************************/ 1138 1.1 lukem 1139 1.1 lukem static ucs2_t 1140 1.1 lukem _ure_make_expr(ucs2_t type, ucs2_t lhs, ucs2_t rhs, _ure_buffer_t *b) 1141 1.1 lukem { 1142 1.1 lukem ucs2_t i; 1143 1.1 lukem 1144 1.1 lukem if (b == 0) 1145 1.1 lukem return _URE_NOOP; 1146 1.1 lukem 1147 1.1 lukem /* 1148 1.1 lukem * Determine if the expression already exists or not. 1149 1.1 lukem */ 1150 1.1 lukem for (i = 0; i < b->expr_used; i++) { 1151 1.1 lukem if (b->expr[i].type == type && b->expr[i].lhs == lhs && 1152 1.1 lukem b->expr[i].rhs == rhs) 1153 1.1 lukem break; 1154 1.1 lukem } 1155 1.1 lukem if (i < b->expr_used) 1156 1.1 lukem return i; 1157 1.1 lukem 1158 1.1 lukem /* 1159 1.1 lukem * Need to add a new expression. 1160 1.1 lukem */ 1161 1.1 lukem if (b->expr_used == b->expr_size) { 1162 1.1 lukem if (b->expr_size == 0) 1163 1.1 lukem b->expr = (_ure_elt_t *) malloc(sizeof(_ure_elt_t) << 3); 1164 1.1 lukem else 1165 1.1 lukem b->expr = (_ure_elt_t *) 1166 1.1 lukem realloc((char *) b->expr, 1167 1.1 lukem sizeof(_ure_elt_t) * (b->expr_size + 8)); 1168 1.1 lukem b->expr_size += 8; 1169 1.1 lukem } 1170 1.1 lukem 1171 1.1 lukem b->expr[b->expr_used].onstack = 0; 1172 1.1 lukem b->expr[b->expr_used].type = type; 1173 1.1 lukem b->expr[b->expr_used].lhs = lhs; 1174 1.1 lukem b->expr[b->expr_used].rhs = rhs; 1175 1.1 lukem 1176 1.1 lukem return b->expr_used++; 1177 1.1 lukem } 1178 1.1 lukem 1179 1.1 lukem static unsigned char spmap[] = { 1180 1.1 lukem 0x00, 0x00, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 1181 1.1 lukem 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 1182 1.1 lukem 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 1183 1.1 lukem }; 1184 1.1 lukem 1185 1.1 lukem #define _ure_isspecial(cc) ((cc) > 0x20 && (cc) < 0x7f && \ 1186 1.1 lukem (spmap[(cc) >> 3] & (1 << ((cc) & 7)))) 1187 1.1 lukem 1188 1.1 lukem /* 1189 1.1 lukem * Convert the regular expression into an NFA in a form that will be easy to 1190 1.1 lukem * reduce to a DFA. The starting state for the reduction will be returned. 1191 1.1 lukem */ 1192 1.1 lukem static ucs2_t 1193 1.1 lukem _ure_re2nfa(ucs2_t *re, unsigned long relen, _ure_buffer_t *b) 1194 1.1 lukem { 1195 1.1 lukem ucs2_t c, state, top, sym, *sp, *ep; 1196 1.1 lukem unsigned long used; 1197 1.1 lukem 1198 1.1 lukem state = _URE_NOOP; 1199 1.1 lukem 1200 1.1 lukem sp = re; 1201 1.1 lukem ep = sp + relen; 1202 1.1 lukem while (b->error == _URE_OK && sp < ep) { 1203 1.1 lukem c = *sp++; 1204 1.1 lukem switch (c) { 1205 1.1 lukem case '(': 1206 1.1 lukem _ure_push(_URE_PAREN, b); 1207 1.1 lukem break; 1208 1.1 lukem case ')': 1209 1.1 lukem /* 1210 1.1 lukem * Check for the case of too many close parentheses. 1211 1.1 lukem */ 1212 1.1 lukem if (_ure_peek(b) == _URE_NOOP) { 1213 1.1 lukem b->error = _URE_UNBALANCED_GROUP; 1214 1.1 lukem break; 1215 1.1 lukem } 1216 1.1 lukem 1217 1.1 lukem while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR) 1218 1.1 lukem /* 1219 1.1 lukem * Make an expression with the AND or OR operator and its right 1220 1.1 lukem * hand side. 1221 1.1 lukem */ 1222 1.1 lukem state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b); 1223 1.1 lukem 1224 1.1 lukem /* 1225 1.1 lukem * Remove the _URE_PAREN off the stack. 1226 1.1 lukem */ 1227 1.1 lukem (void) _ure_pop(b); 1228 1.1 lukem break; 1229 1.1 lukem case '*': 1230 1.1 lukem state = _ure_make_expr(_URE_STAR, state, _URE_NOOP, b); 1231 1.1 lukem break; 1232 1.1 lukem case '+': 1233 1.1 lukem state = _ure_make_expr(_URE_PLUS, state, _URE_NOOP, b); 1234 1.1 lukem break; 1235 1.1 lukem case '?': 1236 1.1 lukem state = _ure_make_expr(_URE_QUEST, state, _URE_NOOP, b); 1237 1.1 lukem break; 1238 1.1 lukem case '|': 1239 1.1 lukem while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR) 1240 1.1 lukem /* 1241 1.1 lukem * Make an expression with the AND or OR operator and its right 1242 1.1 lukem * hand side. 1243 1.1 lukem */ 1244 1.1 lukem state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b); 1245 1.1 lukem 1246 1.1 lukem _ure_push(state, b); 1247 1.1 lukem _ure_push(_URE_OR, b); 1248 1.1 lukem break; 1249 1.1 lukem default: 1250 1.1 lukem sp--; 1251 1.1 lukem sym = _ure_make_symbol(sp, ep - sp, &used, b); 1252 1.1 lukem sp += used; 1253 1.1 lukem state = _ure_make_expr(_URE_SYMBOL, sym, _URE_NOOP, b); 1254 1.1 lukem break; 1255 1.1 lukem } 1256 1.1 lukem 1257 1.1 lukem if (c != '(' && c != '|' && sp < ep && 1258 1.1 lukem (!_ure_isspecial(*sp) || *sp == '(')) { 1259 1.1 lukem _ure_push(state, b); 1260 1.1 lukem _ure_push(_URE_AND, b); 1261 1.1 lukem } 1262 1.1 lukem } 1263 1.1 lukem while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR) 1264 1.1 lukem /* 1265 1.1 lukem * Make an expression with the AND or OR operator and its right 1266 1.1 lukem * hand side. 1267 1.1 lukem */ 1268 1.1 lukem state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b); 1269 1.1 lukem 1270 1.1 lukem if (b->stack.slist_used > 0) 1271 1.1 lukem b->error = _URE_UNBALANCED_GROUP; 1272 1.1 lukem 1273 1.1 lukem return (b->error == _URE_OK) ? state : _URE_NOOP; 1274 1.1 lukem } 1275 1.1 lukem 1276 1.1 lukem static void 1277 1.1 lukem _ure_add_symstate(ucs2_t sym, ucs2_t state, _ure_buffer_t *b) 1278 1.1 lukem { 1279 1.1 lukem ucs2_t i, *stp; 1280 1.1 lukem _ure_symtab_t *sp; 1281 1.1 lukem 1282 1.1 lukem /* 1283 1.1 lukem * Locate the symbol in the symbol table so the state can be added. 1284 1.1 lukem * If the symbol doesn't exist, then a real problem exists. 1285 1.1 lukem */ 1286 1.1 lukem for (i = 0, sp = b->symtab; i < b->symtab_used && sym != sp->id; 1287 1.1 lukem i++, sp++) ; 1288 1.1 lukem 1289 1.1 lukem /* 1290 1.1 lukem * Now find out if the state exists in the symbol's state list. 1291 1.1 lukem */ 1292 1.1 lukem for (i = 0, stp = sp->states.slist; 1293 1.1 lukem i < sp->states.slist_used && state > *stp; i++, stp++) ; 1294 1.1 lukem 1295 1.1 lukem if (i == sp->states.slist_used || state < *stp) { 1296 1.1 lukem /* 1297 1.1 lukem * Need to add the state in order. 1298 1.1 lukem */ 1299 1.1 lukem if (sp->states.slist_used == sp->states.slist_size) { 1300 1.1 lukem if (sp->states.slist_size == 0) 1301 1.1 lukem sp->states.slist = (ucs2_t *) malloc(sizeof(ucs2_t) << 3); 1302 1.1 lukem else 1303 1.1 lukem sp->states.slist = (ucs2_t *) 1304 1.1 lukem realloc((char *) sp->states.slist, 1305 1.1 lukem sizeof(ucs2_t) * (sp->states.slist_size + 8)); 1306 1.1 lukem sp->states.slist_size += 8; 1307 1.1 lukem } 1308 1.1 lukem if (i < sp->states.slist_used) 1309 1.1 lukem (void) _ure_memmove((char *) (sp->states.slist + i + 1), 1310 1.1 lukem (char *) (sp->states.slist + i), 1311 1.1 lukem sizeof(ucs2_t) * (sp->states.slist_used - i)); 1312 1.1 lukem sp->states.slist[i] = state; 1313 1.1 lukem sp->states.slist_used++; 1314 1.1 lukem } 1315 1.1 lukem } 1316 1.1 lukem 1317 1.1 lukem static ucs2_t 1318 1.1 lukem _ure_add_state(ucs2_t nstates, ucs2_t *states, _ure_buffer_t *b) 1319 1.1 lukem { 1320 1.1 lukem ucs2_t i; 1321 1.1 lukem _ure_state_t *sp; 1322 1.1 lukem 1323 1.1 lukem for (i = 0, sp = b->states.states; i < b->states.states_used; i++, sp++) { 1324 1.1 lukem if (sp->st.slist_used == nstates && 1325 1.1 lukem memcmp((char *) states, (char *) sp->st.slist, 1326 1.1 lukem sizeof(ucs2_t) * nstates) == 0) 1327 1.1 lukem break; 1328 1.1 lukem } 1329 1.1 lukem 1330 1.1 lukem if (i == b->states.states_used) { 1331 1.1 lukem /* 1332 1.1 lukem * Need to add a new DFA state (set of NFA states). 1333 1.1 lukem */ 1334 1.1 lukem if (b->states.states_used == b->states.states_size) { 1335 1.1 lukem if (b->states.states_size == 0) 1336 1.1 lukem b->states.states = (_ure_state_t *) 1337 1.1 lukem malloc(sizeof(_ure_state_t) << 3); 1338 1.1 lukem else 1339 1.1 lukem b->states.states = (_ure_state_t *) 1340 1.1 lukem realloc((char *) b->states.states, 1341 1.1 lukem sizeof(_ure_state_t) * (b->states.states_size + 8)); 1342 1.1 lukem sp = b->states.states + b->states.states_size; 1343 1.1 lukem (void) memset((char *) sp, '\0', sizeof(_ure_state_t) << 3); 1344 1.1 lukem b->states.states_size += 8; 1345 1.1 lukem } 1346 1.1 lukem 1347 1.1 lukem sp = b->states.states + b->states.states_used++; 1348 1.1 lukem sp->id = i; 1349 1.1 lukem 1350 1.1 lukem if (sp->st.slist_used + nstates > sp->st.slist_size) { 1351 1.1 lukem if (sp->st.slist_size == 0) 1352 1.1 lukem sp->st.slist = (ucs2_t *) 1353 1.1 lukem malloc(sizeof(ucs2_t) * (sp->st.slist_used + nstates)); 1354 1.1 lukem else 1355 1.1 lukem sp->st.slist = (ucs2_t *) 1356 1.1 lukem realloc((char *) sp->st.slist, 1357 1.1 lukem sizeof(ucs2_t) * (sp->st.slist_used + nstates)); 1358 1.1 lukem sp->st.slist_size = sp->st.slist_used + nstates; 1359 1.1 lukem } 1360 1.1 lukem sp->st.slist_used = nstates; 1361 1.1 lukem (void) AC_MEMCPY((char *) sp->st.slist, (char *) states, 1362 1.1 lukem sizeof(ucs2_t) * nstates); 1363 1.1 lukem } 1364 1.1 lukem 1365 1.1 lukem /* 1366 1.1 lukem * Return the ID of the DFA state representing a group of NFA states. 1367 1.1 lukem */ 1368 1.1 lukem return i; 1369 1.1 lukem } 1370 1.1 lukem 1371 1.1 lukem static void 1372 1.1 lukem _ure_reduce(ucs2_t start, _ure_buffer_t *b) 1373 1.1 lukem { 1374 1.1 lukem ucs2_t i, j, state, eval, syms, rhs; 1375 1.1 lukem ucs2_t s1, s2, ns1, ns2; 1376 1.1 lukem _ure_state_t *sp; 1377 1.1 lukem _ure_symtab_t *smp; 1378 1.1 lukem 1379 1.1 lukem b->reducing = 1; 1380 1.1 lukem 1381 1.1 lukem /* 1382 1.1 lukem * Add the starting state for the reduction. 1383 1.1 lukem */ 1384 1.1 lukem _ure_add_state(1, &start, b); 1385 1.1 lukem 1386 1.1 lukem /* 1387 1.1 lukem * Process each set of NFA states that get created. 1388 1.1 lukem */ 1389 1.1 lukem for (i = 0; i < b->states.states_used; i++) { 1390 1.1 lukem sp = b->states.states + i; 1391 1.1 lukem 1392 1.1 lukem /* 1393 1.1 lukem * Push the current states on the stack. 1394 1.1 lukem */ 1395 1.1 lukem for (j = 0; j < sp->st.slist_used; j++) 1396 1.1 lukem _ure_push(sp->st.slist[j], b); 1397 1.1 lukem 1398 1.1 lukem /* 1399 1.1 lukem * Reduce the NFA states. 1400 1.1 lukem */ 1401 1.1 lukem for (j = sp->accepting = syms = 0; j < b->stack.slist_used; j++) { 1402 1.1 lukem state = b->stack.slist[j]; 1403 1.1 lukem eval = 1; 1404 1.1 lukem 1405 1.1 lukem /* 1406 1.1 lukem * This inner loop is the iterative equivalent of recursively 1407 1.1 lukem * reducing subexpressions generated as a result of a reduction. 1408 1.1 lukem */ 1409 1.1 lukem while (eval) { 1410 1.1 lukem switch (b->expr[state].type) { 1411 1.1 lukem case _URE_SYMBOL: 1412 1.1 lukem ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b); 1413 1.1 lukem _ure_add_symstate(b->expr[state].lhs, ns1, b); 1414 1.1 lukem syms++; 1415 1.1 lukem eval = 0; 1416 1.1 lukem break; 1417 1.1 lukem case _URE_ONE: 1418 1.1 lukem sp->accepting = 1; 1419 1.1 lukem eval = 0; 1420 1.1 lukem break; 1421 1.1 lukem case _URE_QUEST: 1422 1.1 lukem s1 = b->expr[state].lhs; 1423 1.1 lukem ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b); 1424 1.1 lukem state = _ure_make_expr(_URE_OR, ns1, s1, b); 1425 1.1 lukem break; 1426 1.1 lukem case _URE_PLUS: 1427 1.1 lukem s1 = b->expr[state].lhs; 1428 1.1 lukem ns1 = _ure_make_expr(_URE_STAR, s1, _URE_NOOP, b); 1429 1.1 lukem state = _ure_make_expr(_URE_AND, s1, ns1, b); 1430 1.1 lukem break; 1431 1.1 lukem case _URE_STAR: 1432 1.1 lukem s1 = b->expr[state].lhs; 1433 1.1 lukem ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b); 1434 1.1 lukem ns2 = _ure_make_expr(_URE_PLUS, s1, _URE_NOOP, b); 1435 1.1 lukem state = _ure_make_expr(_URE_OR, ns1, ns2, b); 1436 1.1 lukem break; 1437 1.1 lukem case _URE_OR: 1438 1.1 lukem s1 = b->expr[state].lhs; 1439 1.1 lukem s2 = b->expr[state].rhs; 1440 1.1 lukem _ure_push(s1, b); 1441 1.1 lukem _ure_push(s2, b); 1442 1.1 lukem eval = 0; 1443 1.1 lukem break; 1444 1.1 lukem case _URE_AND: 1445 1.1 lukem s1 = b->expr[state].lhs; 1446 1.1 lukem s2 = b->expr[state].rhs; 1447 1.1 lukem switch (b->expr[s1].type) { 1448 1.1 lukem case _URE_SYMBOL: 1449 1.1 lukem _ure_add_symstate(b->expr[s1].lhs, s2, b); 1450 1.1 lukem syms++; 1451 1.1 lukem eval = 0; 1452 1.1 lukem break; 1453 1.1 lukem case _URE_ONE: 1454 1.1 lukem state = s2; 1455 1.1 lukem break; 1456 1.1 lukem case _URE_QUEST: 1457 1.1 lukem ns1 = b->expr[s1].lhs; 1458 1.1 lukem ns2 = _ure_make_expr(_URE_AND, ns1, s2, b); 1459 1.1 lukem state = _ure_make_expr(_URE_OR, s2, ns2, b); 1460 1.1 lukem break; 1461 1.1 lukem case _URE_PLUS: 1462 1.1 lukem ns1 = b->expr[s1].lhs; 1463 1.1 lukem ns2 = _ure_make_expr(_URE_OR, s2, state, b); 1464 1.1 lukem state = _ure_make_expr(_URE_AND, ns1, ns2, b); 1465 1.1 lukem break; 1466 1.1 lukem case _URE_STAR: 1467 1.1 lukem ns1 = b->expr[s1].lhs; 1468 1.1 lukem ns2 = _ure_make_expr(_URE_AND, ns1, state, b); 1469 1.1 lukem state = _ure_make_expr(_URE_OR, s2, ns2, b); 1470 1.1 lukem break; 1471 1.1 lukem case _URE_OR: 1472 1.1 lukem ns1 = b->expr[s1].lhs; 1473 1.1 lukem ns2 = b->expr[s1].rhs; 1474 1.1 lukem ns1 = _ure_make_expr(_URE_AND, ns1, s2, b); 1475 1.1 lukem ns2 = _ure_make_expr(_URE_AND, ns2, s2, b); 1476 1.1 lukem state = _ure_make_expr(_URE_OR, ns1, ns2, b); 1477 1.1 lukem break; 1478 1.1 lukem case _URE_AND: 1479 1.1 lukem ns1 = b->expr[s1].lhs; 1480 1.1 lukem ns2 = b->expr[s1].rhs; 1481 1.1 lukem ns2 = _ure_make_expr(_URE_AND, ns2, s2, b); 1482 1.1 lukem state = _ure_make_expr(_URE_AND, ns1, ns2, b); 1483 1.1 lukem break; 1484 1.1 lukem } 1485 1.1 lukem } 1486 1.1 lukem } 1487 1.1 lukem } 1488 1.1 lukem 1489 1.1 lukem /* 1490 1.1 lukem * Clear the state stack. 1491 1.1 lukem */ 1492 1.1 lukem while (_ure_pop(b) != _URE_NOOP) ; 1493 1.1 lukem 1494 1.1 lukem /* 1495 1.1 lukem * Reset the state pointer because the reduction may have moved it 1496 1.1 lukem * during a reallocation. 1497 1.1 lukem */ 1498 1.1 lukem sp = b->states.states + i; 1499 1.1 lukem 1500 1.1 lukem /* 1501 1.1 lukem * Generate the DFA states for the symbols collected during the 1502 1.1 lukem * current reduction. 1503 1.1 lukem */ 1504 1.1 lukem if (sp->trans_used + syms > sp->trans_size) { 1505 1.1 lukem if (sp->trans_size == 0) 1506 1.1 lukem sp->trans = (_ure_elt_t *) 1507 1.1 lukem malloc(sizeof(_ure_elt_t) * (sp->trans_used + syms)); 1508 1.1 lukem else 1509 1.1 lukem sp->trans = (_ure_elt_t *) 1510 1.1 lukem realloc((char *) sp->trans, 1511 1.1 lukem sizeof(_ure_elt_t) * (sp->trans_used + syms)); 1512 1.1 lukem sp->trans_size = sp->trans_used + syms; 1513 1.1 lukem } 1514 1.1 lukem 1515 1.1 lukem /* 1516 1.1 lukem * Go through the symbol table and generate the DFA state transitions 1517 1.1 lukem * for each symbol that has collected NFA states. 1518 1.1 lukem */ 1519 1.1 lukem for (j = syms = 0, smp = b->symtab; j < b->symtab_used; j++, smp++) { 1520 1.1 lukem sp = b->states.states + i; 1521 1.1 lukem 1522 1.1 lukem if (smp->states.slist_used > 0) { 1523 1.1 lukem sp->trans[syms].lhs = smp->id; 1524 1.1 lukem rhs = _ure_add_state(smp->states.slist_used, 1525 1.1 lukem smp->states.slist, b); 1526 1.1 lukem /* 1527 1.1 lukem * Reset the state pointer in case the reallocation moves it 1528 1.1 lukem * in memory. 1529 1.1 lukem */ 1530 1.1 lukem sp = b->states.states + i; 1531 1.1 lukem sp->trans[syms].rhs = rhs; 1532 1.1 lukem 1533 1.1 lukem smp->states.slist_used = 0; 1534 1.1 lukem syms++; 1535 1.1 lukem } 1536 1.1 lukem } 1537 1.1 lukem 1538 1.1 lukem /* 1539 1.1 lukem * Set the number of transitions actually used. 1540 1.1 lukem */ 1541 1.1 lukem sp->trans_used = syms; 1542 1.1 lukem } 1543 1.1 lukem b->reducing = 0; 1544 1.1 lukem } 1545 1.1 lukem 1546 1.1 lukem static void 1547 1.1 lukem _ure_add_equiv(ucs2_t l, ucs2_t r, _ure_buffer_t *b) 1548 1.1 lukem { 1549 1.1 lukem ucs2_t tmp; 1550 1.1 lukem 1551 1.1 lukem l = b->states.states[l].id; 1552 1.1 lukem r = b->states.states[r].id; 1553 1.1 lukem 1554 1.1 lukem if (l == r) 1555 1.1 lukem return; 1556 1.1 lukem 1557 1.1 lukem if (l > r) { 1558 1.1 lukem tmp = l; 1559 1.1 lukem l = r; 1560 1.1 lukem r = tmp; 1561 1.1 lukem } 1562 1.1 lukem 1563 1.1 lukem /* 1564 1.1 lukem * Check to see if the equivalence pair already exists. 1565 1.1 lukem */ 1566 1.1 lukem for (tmp = 0; tmp < b->equiv_used && 1567 1.1 lukem (b->equiv[tmp].l != l || b->equiv[tmp].r != r); 1568 1.1 lukem tmp++) ; 1569 1.1 lukem 1570 1.1 lukem if (tmp < b->equiv_used) 1571 1.1 lukem return; 1572 1.1 lukem 1573 1.1 lukem if (b->equiv_used == b->equiv_size) { 1574 1.1 lukem if (b->equiv_size == 0) 1575 1.1 lukem b->equiv = (_ure_equiv_t *) malloc(sizeof(_ure_equiv_t) << 3); 1576 1.1 lukem else 1577 1.1 lukem b->equiv = (_ure_equiv_t *) realloc((char *) b->equiv, 1578 1.1 lukem sizeof(_ure_equiv_t) * 1579 1.1 lukem (b->equiv_size + 8)); 1580 1.1 lukem b->equiv_size += 8; 1581 1.1 lukem } 1582 1.1 lukem b->equiv[b->equiv_used].l = l; 1583 1.1 lukem b->equiv[b->equiv_used].r = r; 1584 1.1 lukem b->equiv_used++; 1585 1.1 lukem } 1586 1.1 lukem 1587 1.1 lukem /* 1588 1.1 lukem * Merge the DFA states that are equivalent. 1589 1.1 lukem */ 1590 1.1 lukem static void 1591 1.1 lukem _ure_merge_equiv(_ure_buffer_t *b) 1592 1.1 lukem { 1593 1.1 lukem ucs2_t i, j, k, eq, done; 1594 1.1 lukem _ure_state_t *sp1, *sp2, *ls, *rs; 1595 1.1 lukem 1596 1.1 lukem for (i = 0; i < b->states.states_used; i++) { 1597 1.1 lukem sp1 = b->states.states + i; 1598 1.1 lukem if (sp1->id != i) 1599 1.1 lukem continue; 1600 1.1 lukem for (j = 0; j < i; j++) { 1601 1.1 lukem sp2 = b->states.states + j; 1602 1.1 lukem if (sp2->id != j) 1603 1.1 lukem continue; 1604 1.1 lukem b->equiv_used = 0; 1605 1.1 lukem _ure_add_equiv(i, j, b); 1606 1.1 lukem for (eq = 0, done = 0; eq < b->equiv_used; eq++) { 1607 1.1 lukem ls = b->states.states + b->equiv[eq].l; 1608 1.1 lukem rs = b->states.states + b->equiv[eq].r; 1609 1.1 lukem if (ls->accepting != rs->accepting || 1610 1.1 lukem ls->trans_used != rs->trans_used) { 1611 1.1 lukem done = 1; 1612 1.1 lukem break; 1613 1.1 lukem } 1614 1.1 lukem for (k = 0; k < ls->trans_used && 1615 1.1 lukem ls->trans[k].lhs == rs->trans[k].lhs; k++) ; 1616 1.1 lukem if (k < ls->trans_used) { 1617 1.1 lukem done = 1; 1618 1.1 lukem break; 1619 1.1 lukem } 1620 1.1 lukem 1621 1.1 lukem for (k = 0; k < ls->trans_used; k++) 1622 1.1 lukem _ure_add_equiv(ls->trans[k].rhs, rs->trans[k].rhs, b); 1623 1.1 lukem } 1624 1.1 lukem if (done == 0) 1625 1.1 lukem break; 1626 1.1 lukem } 1627 1.1 lukem for (eq = 0; j < i && eq < b->equiv_used; eq++) 1628 1.1 lukem b->states.states[b->equiv[eq].r].id = 1629 1.1 lukem b->states.states[b->equiv[eq].l].id; 1630 1.1 lukem } 1631 1.1 lukem 1632 1.1 lukem /* 1633 1.1 lukem * Renumber the states appropriately. 1634 1.1 lukem */ 1635 1.1 lukem for (i = eq = 0, sp1 = b->states.states; i < b->states.states_used; 1636 1.1 lukem sp1++, i++) 1637 1.1 lukem sp1->id = (sp1->id == i) ? eq++ : b->states.states[sp1->id].id; 1638 1.1 lukem } 1639 1.1 lukem 1640 1.1 lukem /************************************************************************* 1641 1.1 lukem * 1642 1.1 lukem * API. 1643 1.1 lukem * 1644 1.1 lukem *************************************************************************/ 1645 1.1 lukem 1646 1.1 lukem ure_buffer_t 1647 1.1 lukem ure_buffer_create(void) 1648 1.1 lukem { 1649 1.1 lukem ure_buffer_t b; 1650 1.1 lukem 1651 1.1 lukem b = (ure_buffer_t) calloc(1, sizeof(_ure_buffer_t)); 1652 1.1 lukem 1653 1.1 lukem return b; 1654 1.1 lukem } 1655 1.1 lukem 1656 1.1 lukem void 1657 1.1 lukem ure_buffer_free(ure_buffer_t buf) 1658 1.1 lukem { 1659 1.1 lukem unsigned long i; 1660 1.1 lukem 1661 1.1 lukem if (buf == 0) 1662 1.1 lukem return; 1663 1.1 lukem 1664 1.1 lukem if (buf->stack.slist_size > 0) 1665 1.1 lukem free((char *) buf->stack.slist); 1666 1.1 lukem 1667 1.1 lukem if (buf->expr_size > 0) 1668 1.1 lukem free((char *) buf->expr); 1669 1.1 lukem 1670 1.1 lukem for (i = 0; i < buf->symtab_size; i++) { 1671 1.1 lukem if (buf->symtab[i].states.slist_size > 0) 1672 1.1 lukem free((char *) buf->symtab[i].states.slist); 1673 1.1 lukem } 1674 1.1 lukem 1675 1.1 lukem if (buf->symtab_size > 0) 1676 1.1 lukem free((char *) buf->symtab); 1677 1.1 lukem 1678 1.1 lukem for (i = 0; i < buf->states.states_size; i++) { 1679 1.1 lukem if (buf->states.states[i].trans_size > 0) 1680 1.1 lukem free((char *) buf->states.states[i].trans); 1681 1.1 lukem if (buf->states.states[i].st.slist_size > 0) 1682 1.1 lukem free((char *) buf->states.states[i].st.slist); 1683 1.1 lukem } 1684 1.1 lukem 1685 1.1 lukem if (buf->states.states_size > 0) 1686 1.1 lukem free((char *) buf->states.states); 1687 1.1 lukem 1688 1.1 lukem if (buf->equiv_size > 0) 1689 1.1 lukem free((char *) buf->equiv); 1690 1.1 lukem 1691 1.1 lukem free((char *) buf); 1692 1.1 lukem } 1693 1.1 lukem 1694 1.1 lukem ure_dfa_t 1695 1.1 lukem ure_compile(ucs2_t *re, unsigned long relen, int casefold, ure_buffer_t buf) 1696 1.1 lukem { 1697 1.1 lukem ucs2_t i, j, state; 1698 1.1 lukem _ure_state_t *sp; 1699 1.1 lukem _ure_dstate_t *dsp; 1700 1.1 lukem _ure_trans_t *tp; 1701 1.1 lukem ure_dfa_t dfa; 1702 1.1 lukem 1703 1.1 lukem if (re == 0 || *re == 0 || relen == 0 || buf == 0) 1704 1.1 lukem return 0; 1705 1.1 lukem 1706 1.1 lukem /* 1707 1.1 lukem * Reset the various fields of the compilation buffer. Default the flags 1708 1.3 christos * to indicate the presence of the "^$" pattern. If any other pattern 1709 1.1 lukem * occurs, then this flag will be removed. This is done to catch this 1710 1.1 lukem * special pattern and handle it specially when matching. 1711 1.1 lukem */ 1712 1.1 lukem buf->flags = _URE_DFA_BLANKLINE | ((casefold) ? _URE_DFA_CASEFOLD : 0); 1713 1.1 lukem buf->reducing = 0; 1714 1.1 lukem buf->stack.slist_used = 0; 1715 1.1 lukem buf->expr_used = 0; 1716 1.1 lukem 1717 1.1 lukem for (i = 0; i < buf->symtab_used; i++) 1718 1.1 lukem buf->symtab[i].states.slist_used = 0; 1719 1.1 lukem buf->symtab_used = 0; 1720 1.1 lukem 1721 1.1 lukem for (i = 0; i < buf->states.states_used; i++) { 1722 1.1 lukem buf->states.states[i].st.slist_used = 0; 1723 1.1 lukem buf->states.states[i].trans_used = 0; 1724 1.1 lukem } 1725 1.1 lukem buf->states.states_used = 0; 1726 1.1 lukem 1727 1.1 lukem /* 1728 1.3 christos * Construct the NFA. If this stage returns a 0, then an error occurred or 1729 1.1 lukem * an empty expression was passed. 1730 1.1 lukem */ 1731 1.1 lukem if ((state = _ure_re2nfa(re, relen, buf)) == _URE_NOOP) 1732 1.1 lukem return 0; 1733 1.1 lukem 1734 1.1 lukem /* 1735 1.1 lukem * Do the expression reduction to get the initial DFA. 1736 1.1 lukem */ 1737 1.1 lukem _ure_reduce(state, buf); 1738 1.1 lukem 1739 1.1 lukem /* 1740 1.1 lukem * Merge all the equivalent DFA states. 1741 1.1 lukem */ 1742 1.1 lukem _ure_merge_equiv(buf); 1743 1.1 lukem 1744 1.1 lukem /* 1745 1.1 lukem * Construct the minimal DFA. 1746 1.1 lukem */ 1747 1.1 lukem dfa = (ure_dfa_t) malloc(sizeof(_ure_dfa_t)); 1748 1.1 lukem (void) memset((char *) dfa, '\0', sizeof(_ure_dfa_t)); 1749 1.1 lukem 1750 1.1 lukem dfa->flags = buf->flags & (_URE_DFA_CASEFOLD|_URE_DFA_BLANKLINE); 1751 1.1 lukem 1752 1.1 lukem /* 1753 1.1 lukem * Free up the NFA state groups and transfer the symbols from the buffer 1754 1.1 lukem * to the DFA. 1755 1.1 lukem */ 1756 1.1 lukem for (i = 0; i < buf->symtab_size; i++) { 1757 1.1 lukem if (buf->symtab[i].states.slist_size > 0) 1758 1.1 lukem free((char *) buf->symtab[i].states.slist); 1759 1.1 lukem } 1760 1.1 lukem dfa->syms = buf->symtab; 1761 1.1 lukem dfa->nsyms = buf->symtab_used; 1762 1.1 lukem 1763 1.1 lukem buf->symtab_used = buf->symtab_size = 0; 1764 1.1 lukem 1765 1.1 lukem /* 1766 1.1 lukem * Collect the total number of states and transitions needed for the DFA. 1767 1.1 lukem */ 1768 1.1 lukem for (i = state = 0, sp = buf->states.states; i < buf->states.states_used; 1769 1.1 lukem i++, sp++) { 1770 1.1 lukem if (sp->id == state) { 1771 1.1 lukem dfa->nstates++; 1772 1.1 lukem dfa->ntrans += sp->trans_used; 1773 1.1 lukem state++; 1774 1.1 lukem } 1775 1.1 lukem } 1776 1.1 lukem 1777 1.1 lukem /* 1778 1.1 lukem * Allocate enough space for the states and transitions. 1779 1.1 lukem */ 1780 1.1 lukem dfa->states = (_ure_dstate_t *) malloc(sizeof(_ure_dstate_t) * 1781 1.1 lukem dfa->nstates); 1782 1.1 lukem dfa->trans = (_ure_trans_t *) malloc(sizeof(_ure_trans_t) * dfa->ntrans); 1783 1.1 lukem 1784 1.1 lukem /* 1785 1.1 lukem * Actually transfer the DFA states from the buffer. 1786 1.1 lukem */ 1787 1.1 lukem dsp = dfa->states; 1788 1.1 lukem tp = dfa->trans; 1789 1.1 lukem for (i = state = 0, sp = buf->states.states; i < buf->states.states_used; 1790 1.1 lukem i++, sp++) { 1791 1.1 lukem if (sp->id == state) { 1792 1.1 lukem dsp->trans = tp; 1793 1.1 lukem dsp->ntrans = sp->trans_used; 1794 1.1 lukem dsp->accepting = sp->accepting; 1795 1.1 lukem 1796 1.1 lukem /* 1797 1.1 lukem * Add the transitions for the state. 1798 1.1 lukem */ 1799 1.1 lukem for (j = 0; j < dsp->ntrans; j++, tp++) { 1800 1.1 lukem tp->symbol = sp->trans[j].lhs; 1801 1.1 lukem tp->next_state = buf->states.states[sp->trans[j].rhs].id; 1802 1.1 lukem } 1803 1.1 lukem 1804 1.1 lukem dsp++; 1805 1.1 lukem state++; 1806 1.1 lukem } 1807 1.1 lukem } 1808 1.1 lukem 1809 1.1 lukem return dfa; 1810 1.1 lukem } 1811 1.1 lukem 1812 1.1 lukem void 1813 1.1 lukem ure_dfa_free(ure_dfa_t dfa) 1814 1.1 lukem { 1815 1.1 lukem ucs2_t i; 1816 1.1 lukem 1817 1.1 lukem if (dfa == 0) 1818 1.1 lukem return; 1819 1.1 lukem 1820 1.1 lukem for (i = 0; i < dfa->nsyms; i++) { 1821 1.1 lukem if ((dfa->syms[i].type == _URE_CCLASS || 1822 1.1 lukem dfa->syms[i].type == _URE_NCCLASS) && 1823 1.1 lukem dfa->syms[i].sym.ccl.ranges_size > 0) 1824 1.1 lukem free((char *) dfa->syms[i].sym.ccl.ranges); 1825 1.1 lukem } 1826 1.1 lukem if (dfa->nsyms > 0) 1827 1.1 lukem free((char *) dfa->syms); 1828 1.1 lukem 1829 1.1 lukem if (dfa->nstates > 0) 1830 1.1 lukem free((char *) dfa->states); 1831 1.1 lukem if (dfa->ntrans > 0) 1832 1.1 lukem free((char *) dfa->trans); 1833 1.1 lukem free((char *) dfa); 1834 1.1 lukem } 1835 1.1 lukem 1836 1.1 lukem void 1837 1.1 lukem ure_write_dfa(ure_dfa_t dfa, FILE *out) 1838 1.1 lukem { 1839 1.1 lukem ucs2_t i, j, k, h, l; 1840 1.1 lukem _ure_dstate_t *sp; 1841 1.1 lukem _ure_symtab_t *sym; 1842 1.1 lukem _ure_range_t *rp; 1843 1.1 lukem 1844 1.1 lukem if (dfa == 0 || out == 0) 1845 1.1 lukem return; 1846 1.1 lukem 1847 1.1 lukem /* 1848 1.1 lukem * Write all the different character classes. 1849 1.1 lukem */ 1850 1.1 lukem for (i = 0, sym = dfa->syms; i < dfa->nsyms; i++, sym++) { 1851 1.1 lukem if (sym->type == _URE_CCLASS || sym->type == _URE_NCCLASS) { 1852 1.1 lukem fprintf(out, "C%hd = ", sym->id); 1853 1.1 lukem if (sym->sym.ccl.ranges_used > 0) { 1854 1.1 lukem putc('[', out); 1855 1.1 lukem if (sym->type == _URE_NCCLASS) 1856 1.1 lukem putc('^', out); 1857 1.1 lukem } 1858 1.1 lukem if (sym->props != 0) { 1859 1.1 lukem if (sym->type == _URE_NCCLASS) 1860 1.1 lukem fprintf(out, "\\P"); 1861 1.1 lukem else 1862 1.1 lukem fprintf(out, "\\p"); 1863 1.1 lukem for (k = h = 0; k < 32; k++) { 1864 1.1 lukem if (sym->props & (1 << k)) { 1865 1.1 lukem if (h != 0) 1866 1.1 lukem putc(',', out); 1867 1.3 christos fprintf(out, "%d", k + 1); 1868 1.1 lukem h = 1; 1869 1.1 lukem } 1870 1.1 lukem } 1871 1.1 lukem } 1872 1.1 lukem /* 1873 1.1 lukem * Dump the ranges. 1874 1.1 lukem */ 1875 1.1 lukem for (k = 0, rp = sym->sym.ccl.ranges; 1876 1.1 lukem k < sym->sym.ccl.ranges_used; k++, rp++) { 1877 1.1 lukem /* 1878 1.1 lukem * Check for UTF16 characters. 1879 1.1 lukem */ 1880 1.1 lukem if (0x10000 <= rp->min_code && 1881 1.1 lukem rp->min_code <= 0x10ffff) { 1882 1.1 lukem h = (ucs2_t) (((rp->min_code - 0x10000) >> 10) + 0xd800); 1883 1.1 lukem l = (ucs2_t) (((rp->min_code - 0x10000) & 1023) + 0xdc00); 1884 1.1 lukem fprintf(out, "\\x%04hX\\x%04hX", h, l); 1885 1.1 lukem } else 1886 1.1 lukem fprintf(out, "\\x%04lX", rp->min_code & 0xffff); 1887 1.1 lukem if (rp->max_code != rp->min_code) { 1888 1.1 lukem putc('-', out); 1889 1.1 lukem if (rp->max_code >= 0x10000 && 1890 1.1 lukem rp->max_code <= 0x10ffff) { 1891 1.1 lukem h = (ucs2_t) (((rp->max_code - 0x10000) >> 10) + 0xd800); 1892 1.1 lukem l = (ucs2_t) (((rp->max_code - 0x10000) & 1023) + 0xdc00); 1893 1.1 lukem fprintf(out, "\\x%04hX\\x%04hX", h, l); 1894 1.1 lukem } else 1895 1.1 lukem fprintf(out, "\\x%04lX", rp->max_code & 0xffff); 1896 1.1 lukem } 1897 1.1 lukem } 1898 1.1 lukem if (sym->sym.ccl.ranges_used > 0) 1899 1.1 lukem putc(']', out); 1900 1.1 lukem putc('\n', out); 1901 1.1 lukem } 1902 1.1 lukem } 1903 1.1 lukem 1904 1.1 lukem for (i = 0, sp = dfa->states; i < dfa->nstates; i++, sp++) { 1905 1.1 lukem fprintf(out, "S%hd = ", i); 1906 1.1 lukem if (sp->accepting) { 1907 1.1 lukem fprintf(out, "1 "); 1908 1.1 lukem if (sp->ntrans) 1909 1.1 lukem fprintf(out, "| "); 1910 1.1 lukem } 1911 1.1 lukem for (j = 0; j < sp->ntrans; j++) { 1912 1.1 lukem if (j > 0) 1913 1.1 lukem fprintf(out, "| "); 1914 1.1 lukem 1915 1.1 lukem sym = dfa->syms + sp->trans[j].symbol; 1916 1.1 lukem switch (sym->type) { 1917 1.1 lukem case _URE_CHAR: 1918 1.1 lukem if (0x10000 <= sym->sym.chr && sym->sym.chr <= 0x10ffff) { 1919 1.1 lukem /* 1920 1.1 lukem * Take care of UTF16 characters. 1921 1.1 lukem */ 1922 1.1 lukem h = (ucs2_t) (((sym->sym.chr - 0x10000) >> 10) + 0xd800); 1923 1.1 lukem l = (ucs2_t) (((sym->sym.chr - 0x10000) & 1023) + 0xdc00); 1924 1.1 lukem fprintf(out, "\\x%04hX\\x%04hX ", h, l); 1925 1.1 lukem } else 1926 1.1 lukem fprintf(out, "\\x%04lX ", sym->sym.chr & 0xffff); 1927 1.1 lukem break; 1928 1.1 lukem case _URE_ANY_CHAR: 1929 1.1 lukem fprintf(out, "<any> "); 1930 1.1 lukem break; 1931 1.1 lukem case _URE_BOL_ANCHOR: 1932 1.1 lukem fprintf(out, "<bol-anchor> "); 1933 1.1 lukem break; 1934 1.1 lukem case _URE_EOL_ANCHOR: 1935 1.1 lukem fprintf(out, "<eol-anchor> "); 1936 1.1 lukem break; 1937 1.1 lukem case _URE_CCLASS: 1938 1.1 lukem case _URE_NCCLASS: 1939 1.1 lukem fprintf(out, "[C%hd] ", sym->id); 1940 1.1 lukem break; 1941 1.1 lukem } 1942 1.1 lukem fprintf(out, "S%hd", sp->trans[j].next_state); 1943 1.1 lukem if (j + 1 < sp->ntrans) 1944 1.1 lukem putc(' ', out); 1945 1.1 lukem } 1946 1.1 lukem putc('\n', out); 1947 1.1 lukem } 1948 1.1 lukem } 1949 1.1 lukem 1950 1.1 lukem #define _ure_issep(cc) ((cc) == '\n' || (cc) == '\r' || (cc) == 0x2028 ||\ 1951 1.1 lukem (cc) == 0x2029) 1952 1.1 lukem 1953 1.1 lukem int 1954 1.1 lukem ure_exec(ure_dfa_t dfa, int flags, ucs2_t *text, unsigned long textlen, 1955 1.1 lukem unsigned long *match_start, unsigned long *match_end) 1956 1.1 lukem { 1957 1.1 lukem int i, j, matched, found, skip; 1958 1.1 lukem unsigned long ms, me; 1959 1.1 lukem ucs4_t c; 1960 1.1 lukem ucs2_t *sp, *ep, *lp; 1961 1.1 lukem _ure_dstate_t *stp; 1962 1.1 lukem _ure_symtab_t *sym; 1963 1.1 lukem _ure_range_t *rp; 1964 1.1 lukem 1965 1.1 lukem if (dfa == 0 || text == 0) 1966 1.1 lukem return 0; 1967 1.1 lukem 1968 1.1 lukem /* 1969 1.1 lukem * Handle the special case of an empty string matching the "^$" pattern. 1970 1.1 lukem */ 1971 1.1 lukem if (textlen == 0 && (dfa->flags & _URE_DFA_BLANKLINE)) { 1972 1.1 lukem *match_start = *match_end = 0; 1973 1.1 lukem return 1; 1974 1.1 lukem } 1975 1.1 lukem 1976 1.1 lukem sp = text; 1977 1.1 lukem ep = sp + textlen; 1978 1.1 lukem 1979 1.1 lukem ms = me = ~0; 1980 1.1 lukem 1981 1.1 lukem stp = dfa->states; 1982 1.1 lukem 1983 1.1 lukem for (found = skip = 0; found == 0 && sp < ep; ) { 1984 1.1 lukem lp = sp; 1985 1.1 lukem c = *sp++; 1986 1.1 lukem 1987 1.1 lukem /* 1988 1.1 lukem * Check to see if this is a high surrogate that should be 1989 1.1 lukem * combined with a following low surrogate. 1990 1.1 lukem */ 1991 1.1 lukem if (sp < ep && 0xd800 <= c && c <= 0xdbff && 1992 1.1 lukem 0xdc00 <= *sp && *sp <= 0xdfff) 1993 1.1 lukem c = 0x10000 + (((c & 0x03ff) << 10) | (*sp++ & 0x03ff)); 1994 1.1 lukem 1995 1.1 lukem /* 1996 1.1 lukem * Determine if the character is non-spacing and should be skipped. 1997 1.1 lukem */ 1998 1.1 lukem if (_ure_matches_properties(_URE_NONSPACING, c) && 1999 1.1 lukem (flags & URE_IGNORE_NONSPACING)) { 2000 1.1 lukem sp++; 2001 1.1 lukem continue; 2002 1.1 lukem } 2003 1.1 lukem 2004 1.1 lukem if (dfa->flags & _URE_DFA_CASEFOLD) 2005 1.1 lukem c = _ure_tolower(c); 2006 1.1 lukem 2007 1.1 lukem /* 2008 1.1 lukem * See if one of the transitions matches. 2009 1.1 lukem */ 2010 1.1 lukem for (i = 0, matched = 0; matched == 0 && i < stp->ntrans; i++) { 2011 1.1 lukem sym = dfa->syms + stp->trans[i].symbol; 2012 1.1 lukem switch (sym->type) { 2013 1.1 lukem case _URE_ANY_CHAR: 2014 1.1 lukem if ((flags & URE_DOT_MATCHES_SEPARATORS) || 2015 1.1 lukem !_ure_issep(c)) 2016 1.1 lukem matched = 1; 2017 1.1 lukem break; 2018 1.1 lukem case _URE_CHAR: 2019 1.1 lukem if (c == sym->sym.chr) 2020 1.1 lukem matched = 1; 2021 1.1 lukem break; 2022 1.1 lukem case _URE_BOL_ANCHOR: 2023 1.1 lukem if (lp == text) { 2024 1.1 lukem sp = lp; 2025 1.1 lukem matched = 1; 2026 1.1 lukem } else if (_ure_issep(c)) { 2027 1.1 lukem if (c == '\r' && sp < ep && *sp == '\n') 2028 1.1 lukem sp++; 2029 1.1 lukem lp = sp; 2030 1.1 lukem matched = 1; 2031 1.1 lukem } 2032 1.1 lukem break; 2033 1.1 lukem case _URE_EOL_ANCHOR: 2034 1.1 lukem if (_ure_issep(c)) { 2035 1.1 lukem /* 2036 1.1 lukem * Put the pointer back before the separator so the match 2037 1.1 lukem * end position will be correct. This case will also 2038 1.1 lukem * cause the `sp' pointer to be advanced over the current 2039 1.1 lukem * separator once the match end point has been recorded. 2040 1.1 lukem */ 2041 1.1 lukem sp = lp; 2042 1.1 lukem matched = 1; 2043 1.1 lukem } 2044 1.1 lukem break; 2045 1.1 lukem case _URE_CCLASS: 2046 1.1 lukem case _URE_NCCLASS: 2047 1.1 lukem if (sym->props != 0) 2048 1.1 lukem matched = _ure_matches_properties(sym->props, c); 2049 1.1 lukem for (j = 0, rp = sym->sym.ccl.ranges; 2050 1.1 lukem j < sym->sym.ccl.ranges_used; j++, rp++) { 2051 1.1 lukem if (rp->min_code <= c && c <= rp->max_code) 2052 1.1 lukem matched = 1; 2053 1.1 lukem } 2054 1.1 lukem if (sym->type == _URE_NCCLASS) 2055 1.1 lukem matched = !matched; 2056 1.1 lukem break; 2057 1.1 lukem } 2058 1.1 lukem 2059 1.1 lukem if (matched) { 2060 1.1 lukem if (ms == ~0UL) 2061 1.1 lukem ms = lp - text; 2062 1.1 lukem else 2063 1.1 lukem me = sp - text; 2064 1.1 lukem stp = dfa->states + stp->trans[i].next_state; 2065 1.1 lukem 2066 1.1 lukem /* 2067 1.1 lukem * If the match was an EOL anchor, adjust the pointer past the 2068 1.1 lukem * separator that caused the match. The correct match 2069 1.1 lukem * position has been recorded already. 2070 1.1 lukem */ 2071 1.1 lukem if (sym->type == _URE_EOL_ANCHOR) { 2072 1.1 lukem /* 2073 1.1 lukem * Skip the character that caused the match. 2074 1.1 lukem */ 2075 1.1 lukem sp++; 2076 1.1 lukem 2077 1.1 lukem /* 2078 1.1 lukem * Handle the infamous CRLF situation. 2079 1.1 lukem */ 2080 1.1 lukem if (sp < ep && c == '\r' && *sp == '\n') 2081 1.1 lukem sp++; 2082 1.1 lukem } 2083 1.1 lukem } 2084 1.1 lukem } 2085 1.1 lukem 2086 1.1 lukem if (matched == 0) { 2087 1.1 lukem if (stp->accepting == 0) { 2088 1.1 lukem /* 2089 1.1 lukem * If the last state was not accepting, then reset 2090 1.1 lukem * and start over. 2091 1.1 lukem */ 2092 1.1 lukem stp = dfa->states; 2093 1.1 lukem ms = me = ~0; 2094 1.1 lukem } else 2095 1.1 lukem /* 2096 1.1 lukem * The last state was accepting, so terminate the matching 2097 1.1 lukem * loop to avoid more work. 2098 1.1 lukem */ 2099 1.1 lukem found = 1; 2100 1.1 lukem } else if (sp == ep) { 2101 1.1 lukem if (!stp->accepting) { 2102 1.1 lukem /* 2103 1.1 lukem * This ugly hack is to make sure the end-of-line anchors 2104 1.1 lukem * match when the source text hits the end. This is only done 2105 1.1 lukem * if the last subexpression matches. 2106 1.1 lukem */ 2107 1.1 lukem for (i = 0; found == 0 && i < stp->ntrans; i++) { 2108 1.1 lukem sym = dfa->syms + stp->trans[i].symbol; 2109 1.1 lukem if (sym->type ==_URE_EOL_ANCHOR) { 2110 1.1 lukem stp = dfa->states + stp->trans[i].next_state; 2111 1.1 lukem if (stp->accepting) { 2112 1.1 lukem me = sp - text; 2113 1.1 lukem found = 1; 2114 1.1 lukem } else 2115 1.1 lukem break; 2116 1.1 lukem } 2117 1.1 lukem } 2118 1.1 lukem } else { 2119 1.1 lukem /* 2120 1.1 lukem * Make sure any conditions that match all the way to the end 2121 1.1 lukem * of the string match. 2122 1.1 lukem */ 2123 1.1 lukem found = 1; 2124 1.1 lukem me = sp - text; 2125 1.1 lukem } 2126 1.1 lukem } 2127 1.1 lukem } 2128 1.1 lukem 2129 1.1 lukem if (found == 0) 2130 1.1 lukem ms = me = ~0; 2131 1.1 lukem 2132 1.1 lukem *match_start = ms; 2133 1.1 lukem *match_end = me; 2134 1.1 lukem 2135 1.1 lukem return (ms != ~0UL) ? 1 : 0; 2136 1.1 lukem } 2137