ure.c revision 1.1.1.6.6.1 1 1.1.1.6.6.1 martin /* $NetBSD: ure.c,v 1.1.1.6.6.1 2019/08/10 06:17:15 martin Exp $ */
2 1.1.1.2 lukem
3 1.1.1.4 tron /* $OpenLDAP$ */
4 1.1 lukem /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
5 1.1 lukem *
6 1.1.1.6.6.1 martin * Copyright 1998-2019 The OpenLDAP Foundation.
7 1.1 lukem * All rights reserved.
8 1.1 lukem *
9 1.1 lukem * Redistribution and use in source and binary forms, with or without
10 1.1 lukem * modification, are permitted only as authorized by the OpenLDAP
11 1.1 lukem * Public License.
12 1.1 lukem *
13 1.1 lukem * A copy of this license is available in file LICENSE in the
14 1.1 lukem * top-level directory of the distribution or, alternatively, at
15 1.1 lukem * <http://www.OpenLDAP.org/license.html>.
16 1.1 lukem */
17 1.1 lukem /* Copyright 1997, 1998, 1999 Computing Research Labs,
18 1.1 lukem * New Mexico State University
19 1.1 lukem *
20 1.1 lukem * Permission is hereby granted, free of charge, to any person obtaining a
21 1.1 lukem * copy of this software and associated documentation files (the "Software"),
22 1.1 lukem * to deal in the Software without restriction, including without limitation
23 1.1 lukem * the rights to use, copy, modify, merge, publish, distribute, sublicense,
24 1.1 lukem * and/or sell copies of the Software, and to permit persons to whom the
25 1.1 lukem * Software is furnished to do so, subject to the following conditions:
26 1.1 lukem *
27 1.1 lukem * The above copyright notice and this permission notice shall be included in
28 1.1 lukem * all copies or substantial portions of the Software.
29 1.1 lukem *
30 1.1 lukem * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
31 1.1 lukem * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
32 1.1 lukem * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
33 1.1 lukem * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
34 1.1 lukem * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
35 1.1 lukem * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
36 1.1 lukem * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
37 1.1 lukem */
38 1.1.1.4 tron /* Id: ure.c,v 1.2 1999/09/21 15:47:43 mleisher Exp " */
39 1.1 lukem
40 1.1.1.5 christos #include <sys/cdefs.h>
41 1.1.1.6.6.1 martin __RCSID("$NetBSD: ure.c,v 1.1.1.6.6.1 2019/08/10 06:17:15 martin Exp $");
42 1.1.1.5 christos
43 1.1 lukem #include "portable.h"
44 1.1 lukem
45 1.1 lukem #include <ac/stdlib.h>
46 1.1 lukem #include <ac/string.h>
47 1.1 lukem #include <ac/unistd.h>
48 1.1 lukem
49 1.1 lukem #include "ure.h"
50 1.1 lukem
51 1.1 lukem /*
52 1.1 lukem * Flags used internally in the DFA.
53 1.1 lukem */
54 1.1 lukem #define _URE_DFA_CASEFOLD 0x01
55 1.1 lukem #define _URE_DFA_BLANKLINE 0x02
56 1.1 lukem
57 1.1 lukem static unsigned long cclass_flags[] = {
58 1.1 lukem 0,
59 1.1 lukem _URE_NONSPACING,
60 1.1 lukem _URE_COMBINING,
61 1.1 lukem _URE_NUMDIGIT,
62 1.1 lukem _URE_NUMOTHER,
63 1.1 lukem _URE_SPACESEP,
64 1.1 lukem _URE_LINESEP,
65 1.1 lukem _URE_PARASEP,
66 1.1 lukem _URE_CNTRL,
67 1.1 lukem _URE_PUA,
68 1.1 lukem _URE_UPPER,
69 1.1 lukem _URE_LOWER,
70 1.1 lukem _URE_TITLE,
71 1.1 lukem _URE_MODIFIER,
72 1.1 lukem _URE_OTHERLETTER,
73 1.1 lukem _URE_DASHPUNCT,
74 1.1 lukem _URE_OPENPUNCT,
75 1.1 lukem _URE_CLOSEPUNCT,
76 1.1 lukem _URE_OTHERPUNCT,
77 1.1 lukem _URE_MATHSYM,
78 1.1 lukem _URE_CURRENCYSYM,
79 1.1 lukem _URE_OTHERSYM,
80 1.1 lukem _URE_LTR,
81 1.1 lukem _URE_RTL,
82 1.1 lukem _URE_EURONUM,
83 1.1 lukem _URE_EURONUMSEP,
84 1.1 lukem _URE_EURONUMTERM,
85 1.1 lukem _URE_ARABNUM,
86 1.1 lukem _URE_COMMONSEP,
87 1.1 lukem _URE_BLOCKSEP,
88 1.1 lukem _URE_SEGMENTSEP,
89 1.1 lukem _URE_WHITESPACE,
90 1.1 lukem _URE_OTHERNEUT,
91 1.1 lukem };
92 1.1 lukem
93 1.1 lukem /*
94 1.1 lukem * Symbol types for the DFA.
95 1.1 lukem */
96 1.1 lukem #define _URE_ANY_CHAR 1
97 1.1 lukem #define _URE_CHAR 2
98 1.1 lukem #define _URE_CCLASS 3
99 1.1 lukem #define _URE_NCCLASS 4
100 1.1 lukem #define _URE_BOL_ANCHOR 5
101 1.1 lukem #define _URE_EOL_ANCHOR 6
102 1.1 lukem
103 1.1 lukem /*
104 1.1 lukem * Op codes for converting the NFA to a DFA.
105 1.1 lukem */
106 1.1 lukem #define _URE_SYMBOL 10
107 1.1 lukem #define _URE_PAREN 11
108 1.1 lukem #define _URE_QUEST 12
109 1.1 lukem #define _URE_STAR 13
110 1.1 lukem #define _URE_PLUS 14
111 1.1 lukem #define _URE_ONE 15
112 1.1 lukem #define _URE_AND 16
113 1.1 lukem #define _URE_OR 17
114 1.1 lukem
115 1.1 lukem #define _URE_NOOP 0xffff
116 1.1 lukem
117 1.1 lukem #define _URE_REGSTART 0x8000
118 1.1 lukem #define _URE_REGEND 0x4000
119 1.1 lukem
120 1.1 lukem /*
121 1.1 lukem * Structure used to handle a compacted range of characters.
122 1.1 lukem */
123 1.1 lukem typedef struct {
124 1.1 lukem ucs4_t min_code;
125 1.1 lukem ucs4_t max_code;
126 1.1 lukem } _ure_range_t;
127 1.1 lukem
128 1.1 lukem typedef struct {
129 1.1 lukem _ure_range_t *ranges;
130 1.1 lukem ucs2_t ranges_used;
131 1.1 lukem ucs2_t ranges_size;
132 1.1 lukem } _ure_ccl_t;
133 1.1 lukem
134 1.1 lukem typedef union {
135 1.1 lukem ucs4_t chr;
136 1.1 lukem _ure_ccl_t ccl;
137 1.1 lukem } _ure_sym_t;
138 1.1 lukem
139 1.1 lukem /*
140 1.1 lukem * This is a general element structure used for expressions and stack
141 1.1 lukem * elements.
142 1.1 lukem */
143 1.1 lukem typedef struct {
144 1.1 lukem ucs2_t reg;
145 1.1 lukem ucs2_t onstack;
146 1.1 lukem ucs2_t type;
147 1.1 lukem ucs2_t lhs;
148 1.1 lukem ucs2_t rhs;
149 1.1 lukem } _ure_elt_t;
150 1.1 lukem
151 1.1 lukem /*
152 1.1 lukem * This is a structure used to track a list or a stack of states.
153 1.1 lukem */
154 1.1 lukem typedef struct {
155 1.1 lukem ucs2_t *slist;
156 1.1 lukem ucs2_t slist_size;
157 1.1 lukem ucs2_t slist_used;
158 1.1 lukem } _ure_stlist_t;
159 1.1 lukem
160 1.1 lukem /*
161 1.1 lukem * Structure to track the list of unique states for a symbol
162 1.1 lukem * during reduction.
163 1.1 lukem */
164 1.1 lukem typedef struct {
165 1.1 lukem ucs2_t id;
166 1.1 lukem ucs2_t type;
167 1.1 lukem unsigned long mods;
168 1.1 lukem unsigned long props;
169 1.1 lukem _ure_sym_t sym;
170 1.1 lukem _ure_stlist_t states;
171 1.1 lukem } _ure_symtab_t;
172 1.1 lukem
173 1.1 lukem /*
174 1.1 lukem * Structure to hold a single state.
175 1.1 lukem */
176 1.1 lukem typedef struct {
177 1.1 lukem ucs2_t id;
178 1.1 lukem ucs2_t accepting;
179 1.1 lukem ucs2_t pad;
180 1.1 lukem _ure_stlist_t st;
181 1.1 lukem _ure_elt_t *trans;
182 1.1 lukem ucs2_t trans_size;
183 1.1 lukem ucs2_t trans_used;
184 1.1 lukem } _ure_state_t;
185 1.1 lukem
186 1.1 lukem /*
187 1.1 lukem * Structure used for keeping lists of states.
188 1.1 lukem */
189 1.1 lukem typedef struct {
190 1.1 lukem _ure_state_t *states;
191 1.1 lukem ucs2_t states_size;
192 1.1 lukem ucs2_t states_used;
193 1.1 lukem } _ure_statetable_t;
194 1.1 lukem
195 1.1 lukem /*
196 1.1 lukem * Structure to track pairs of DFA states when equivalent states are
197 1.1 lukem * merged.
198 1.1 lukem */
199 1.1 lukem typedef struct {
200 1.1 lukem ucs2_t l;
201 1.1 lukem ucs2_t r;
202 1.1 lukem } _ure_equiv_t;
203 1.1 lukem
204 1.1 lukem /*
205 1.1 lukem * Structure used for constructing the NFA and reducing to a minimal DFA.
206 1.1 lukem */
207 1.1 lukem typedef struct _ure_buffer_t {
208 1.1 lukem int reducing;
209 1.1 lukem int error;
210 1.1 lukem unsigned long flags;
211 1.1 lukem
212 1.1 lukem _ure_stlist_t stack;
213 1.1 lukem
214 1.1 lukem /*
215 1.1 lukem * Table of unique symbols encountered.
216 1.1 lukem */
217 1.1 lukem _ure_symtab_t *symtab;
218 1.1 lukem ucs2_t symtab_size;
219 1.1 lukem ucs2_t symtab_used;
220 1.1 lukem
221 1.1 lukem /*
222 1.1 lukem * Tracks the unique expressions generated for the NFA and when the NFA is
223 1.1 lukem * reduced.
224 1.1 lukem */
225 1.1 lukem _ure_elt_t *expr;
226 1.1 lukem ucs2_t expr_used;
227 1.1 lukem ucs2_t expr_size;
228 1.1 lukem
229 1.1 lukem /*
230 1.1 lukem * The reduced table of unique groups of NFA states.
231 1.1 lukem */
232 1.1 lukem _ure_statetable_t states;
233 1.1 lukem
234 1.1 lukem /*
235 1.1 lukem * Tracks states when equivalent states are merged.
236 1.1 lukem */
237 1.1 lukem _ure_equiv_t *equiv;
238 1.1 lukem ucs2_t equiv_used;
239 1.1 lukem ucs2_t equiv_size;
240 1.1 lukem } _ure_buffer_t;
241 1.1 lukem
242 1.1 lukem typedef struct {
243 1.1 lukem ucs2_t symbol;
244 1.1 lukem ucs2_t next_state;
245 1.1 lukem } _ure_trans_t;
246 1.1 lukem
247 1.1 lukem typedef struct {
248 1.1 lukem ucs2_t accepting;
249 1.1 lukem ucs2_t ntrans;
250 1.1 lukem _ure_trans_t *trans;
251 1.1 lukem } _ure_dstate_t;
252 1.1 lukem
253 1.1 lukem typedef struct _ure_dfa_t {
254 1.1 lukem unsigned long flags;
255 1.1 lukem
256 1.1 lukem _ure_symtab_t *syms;
257 1.1 lukem ucs2_t nsyms;
258 1.1 lukem
259 1.1 lukem _ure_dstate_t *states;
260 1.1 lukem ucs2_t nstates;
261 1.1 lukem
262 1.1 lukem _ure_trans_t *trans;
263 1.1 lukem ucs2_t ntrans;
264 1.1 lukem } _ure_dfa_t;
265 1.1 lukem
266 1.1 lukem /*************************************************************************
267 1.1 lukem *
268 1.1 lukem * Functions.
269 1.1 lukem *
270 1.1 lukem *************************************************************************/
271 1.1 lukem
272 1.1 lukem static void
273 1.1 lukem _ure_memmove(char *dest, char *src, unsigned long bytes)
274 1.1 lukem {
275 1.1 lukem long i, j;
276 1.1 lukem
277 1.1 lukem i = (long) bytes;
278 1.1 lukem j = i & 7;
279 1.1 lukem i = (i + 7) >> 3;
280 1.1 lukem
281 1.1 lukem /*
282 1.1 lukem * Do a memmove using Ye Olde Duff's Device for efficiency.
283 1.1 lukem */
284 1.1 lukem if (src < dest) {
285 1.1 lukem src += bytes;
286 1.1 lukem dest += bytes;
287 1.1 lukem
288 1.1 lukem switch (j) {
289 1.1 lukem case 0: do {
290 1.1 lukem *--dest = *--src;
291 1.1 lukem case 7: *--dest = *--src;
292 1.1 lukem case 6: *--dest = *--src;
293 1.1 lukem case 5: *--dest = *--src;
294 1.1 lukem case 4: *--dest = *--src;
295 1.1 lukem case 3: *--dest = *--src;
296 1.1 lukem case 2: *--dest = *--src;
297 1.1 lukem case 1: *--dest = *--src;
298 1.1 lukem } while (--i > 0);
299 1.1 lukem }
300 1.1 lukem } else if (src > dest) {
301 1.1 lukem switch (j) {
302 1.1 lukem case 0: do {
303 1.1 lukem *dest++ = *src++;
304 1.1 lukem case 7: *dest++ = *src++;
305 1.1 lukem case 6: *dest++ = *src++;
306 1.1 lukem case 5: *dest++ = *src++;
307 1.1 lukem case 4: *dest++ = *src++;
308 1.1 lukem case 3: *dest++ = *src++;
309 1.1 lukem case 2: *dest++ = *src++;
310 1.1 lukem case 1: *dest++ = *src++;
311 1.1 lukem } while (--i > 0);
312 1.1 lukem }
313 1.1 lukem }
314 1.1 lukem }
315 1.1 lukem
316 1.1 lukem static void
317 1.1 lukem _ure_push(ucs2_t v, _ure_buffer_t *b)
318 1.1 lukem {
319 1.1 lukem _ure_stlist_t *s;
320 1.1 lukem
321 1.1 lukem if (b == 0)
322 1.1 lukem return;
323 1.1 lukem
324 1.1 lukem /*
325 1.1 lukem * If the `reducing' parameter is non-zero, check to see if the value
326 1.1 lukem * passed is already on the stack.
327 1.1 lukem */
328 1.1 lukem if (b->reducing != 0 && b->expr[v].onstack != 0)
329 1.1 lukem return;
330 1.1 lukem
331 1.1 lukem s = &b->stack;
332 1.1 lukem if (s->slist_used == s->slist_size) {
333 1.1 lukem if (s->slist_size == 0)
334 1.1 lukem s->slist = (ucs2_t *) malloc(sizeof(ucs2_t) << 3);
335 1.1 lukem else
336 1.1 lukem s->slist = (ucs2_t *) realloc((char *) s->slist,
337 1.1 lukem sizeof(ucs2_t) * (s->slist_size + 8));
338 1.1 lukem s->slist_size += 8;
339 1.1 lukem }
340 1.1 lukem s->slist[s->slist_used++] = v;
341 1.1 lukem
342 1.1 lukem /*
343 1.1 lukem * If the `reducing' parameter is non-zero, flag the element as being on
344 1.1 lukem * the stack.
345 1.1 lukem */
346 1.1 lukem if (b->reducing != 0)
347 1.1 lukem b->expr[v].onstack = 1;
348 1.1 lukem }
349 1.1 lukem
350 1.1 lukem static ucs2_t
351 1.1 lukem _ure_peek(_ure_buffer_t *b)
352 1.1 lukem {
353 1.1 lukem if (b == 0 || b->stack.slist_used == 0)
354 1.1 lukem return _URE_NOOP;
355 1.1 lukem
356 1.1 lukem return b->stack.slist[b->stack.slist_used - 1];
357 1.1 lukem }
358 1.1 lukem
359 1.1 lukem static ucs2_t
360 1.1 lukem _ure_pop(_ure_buffer_t *b)
361 1.1 lukem {
362 1.1 lukem ucs2_t v;
363 1.1 lukem
364 1.1 lukem if (b == 0 || b->stack.slist_used == 0)
365 1.1 lukem return _URE_NOOP;
366 1.1 lukem
367 1.1 lukem v = b->stack.slist[--b->stack.slist_used];
368 1.1 lukem if (b->reducing)
369 1.1 lukem b->expr[v].onstack = 0;
370 1.1 lukem
371 1.1 lukem return v;
372 1.1 lukem }
373 1.1 lukem
374 1.1 lukem /*************************************************************************
375 1.1 lukem *
376 1.1 lukem * Start symbol parse functions.
377 1.1 lukem *
378 1.1 lukem *************************************************************************/
379 1.1 lukem
380 1.1 lukem /*
381 1.1 lukem * Parse a comma-separated list of integers that represent character
382 1.1 lukem * properties. Combine them into a mask that is returned in the `mask'
383 1.1 lukem * variable, and return the number of characters consumed.
384 1.1 lukem */
385 1.1 lukem static unsigned long
386 1.1 lukem _ure_prop_list(ucs2_t *pp, unsigned long limit, unsigned long *mask,
387 1.1 lukem _ure_buffer_t *b)
388 1.1 lukem {
389 1.1 lukem unsigned long n, m;
390 1.1 lukem ucs2_t *sp, *ep;
391 1.1 lukem
392 1.1 lukem sp = pp;
393 1.1 lukem ep = sp + limit;
394 1.1 lukem
395 1.1 lukem for (m = n = 0; b->error == _URE_OK && sp < ep; sp++) {
396 1.1 lukem if (*sp == ',') {
397 1.1 lukem /*
398 1.1 lukem * Encountered a comma, so select the next character property flag
399 1.1 lukem * and reset the number.
400 1.1 lukem */
401 1.1 lukem m |= cclass_flags[n];
402 1.1 lukem n = 0;
403 1.1 lukem } else if (*sp >= '0' && *sp <= '9')
404 1.1 lukem /*
405 1.1 lukem * Encountered a digit, so start or continue building the cardinal
406 1.1 lukem * that represents the character property flag.
407 1.1 lukem */
408 1.1 lukem n = (n * 10) + (*sp - '0');
409 1.1 lukem else
410 1.1 lukem /*
411 1.1 lukem * Encountered something that is not part of the property list.
412 1.1 lukem * Indicate that we are done.
413 1.1 lukem */
414 1.1 lukem break;
415 1.1 lukem
416 1.1 lukem /*
417 1.1 lukem * If a property number greater than 32 occurs, then there is a
418 1.1 lukem * problem. Most likely a missing comma separator.
419 1.1 lukem */
420 1.1 lukem if (n > 32)
421 1.1 lukem b->error = _URE_INVALID_PROPERTY;
422 1.1 lukem }
423 1.1 lukem
424 1.1.1.5 christos if (b->error == _URE_OK && n != 0)
425 1.1 lukem m |= cclass_flags[n];
426 1.1 lukem
427 1.1 lukem /*
428 1.1 lukem * Set the mask that represents the group of character properties.
429 1.1 lukem */
430 1.1 lukem *mask = m;
431 1.1 lukem
432 1.1 lukem /*
433 1.1 lukem * Return the number of characters consumed.
434 1.1 lukem */
435 1.1 lukem return sp - pp;
436 1.1 lukem }
437 1.1 lukem
438 1.1 lukem /*
439 1.1 lukem * Collect a hex number with 1 to 4 digits and return the number
440 1.1 lukem * of characters used.
441 1.1 lukem */
442 1.1 lukem static unsigned long
443 1.1 lukem _ure_hex(ucs2_t *np, unsigned long limit, ucs4_t *n)
444 1.1 lukem {
445 1.1 lukem ucs2_t i;
446 1.1 lukem ucs2_t *sp, *ep;
447 1.1 lukem ucs4_t nn;
448 1.1 lukem
449 1.1 lukem sp = np;
450 1.1 lukem ep = sp + limit;
451 1.1 lukem
452 1.1 lukem for (nn = 0, i = 0; i < 4 && sp < ep; i++, sp++) {
453 1.1 lukem if (*sp >= '0' && *sp <= '9')
454 1.1 lukem nn = (nn << 4) + (*sp - '0');
455 1.1 lukem else if (*sp >= 'A' && *sp <= 'F')
456 1.1 lukem nn = (nn << 4) + ((*sp - 'A') + 10);
457 1.1 lukem else if (*sp >= 'a' && *sp <= 'f')
458 1.1 lukem nn = (nn << 4) + ((*sp - 'a') + 10);
459 1.1 lukem else
460 1.1 lukem /*
461 1.1 lukem * Encountered something that is not a hex digit.
462 1.1 lukem */
463 1.1 lukem break;
464 1.1 lukem }
465 1.1 lukem
466 1.1 lukem /*
467 1.1 lukem * Assign the character code collected and return the number of
468 1.1 lukem * characters used.
469 1.1 lukem */
470 1.1 lukem *n = nn;
471 1.1 lukem
472 1.1 lukem return sp - np;
473 1.1 lukem }
474 1.1 lukem
475 1.1 lukem /*
476 1.1 lukem * Insert a range into a character class, removing duplicates and ordering
477 1.1 lukem * them in increasing range-start order.
478 1.1 lukem */
479 1.1 lukem static void
480 1.1 lukem _ure_add_range(_ure_ccl_t *ccl, _ure_range_t *r, _ure_buffer_t *b)
481 1.1 lukem {
482 1.1 lukem ucs2_t i;
483 1.1 lukem ucs4_t tmp;
484 1.1 lukem _ure_range_t *rp;
485 1.1 lukem
486 1.1 lukem /*
487 1.1 lukem * If the `casefold' flag is set, then make sure both endpoints of the
488 1.1 lukem * range are converted to lower case.
489 1.1 lukem */
490 1.1 lukem if (b->flags & _URE_DFA_CASEFOLD) {
491 1.1 lukem r->min_code = _ure_tolower(r->min_code);
492 1.1 lukem r->max_code = _ure_tolower(r->max_code);
493 1.1 lukem }
494 1.1 lukem
495 1.1 lukem /*
496 1.1 lukem * Swap the range endpoints if they are not in increasing order.
497 1.1 lukem */
498 1.1 lukem if (r->min_code > r->max_code) {
499 1.1 lukem tmp = r->min_code;
500 1.1 lukem r->min_code = r->max_code;
501 1.1 lukem r->max_code = tmp;
502 1.1 lukem }
503 1.1 lukem
504 1.1 lukem for (i = 0, rp = ccl->ranges;
505 1.1 lukem i < ccl->ranges_used && r->min_code < rp->min_code; i++, rp++) ;
506 1.1 lukem
507 1.1 lukem /*
508 1.1 lukem * Check for a duplicate.
509 1.1 lukem */
510 1.1 lukem if (i < ccl->ranges_used &&
511 1.1 lukem r->min_code == rp->min_code && r->max_code == rp->max_code)
512 1.1 lukem return;
513 1.1 lukem
514 1.1 lukem if (ccl->ranges_used == ccl->ranges_size) {
515 1.1 lukem if (ccl->ranges_size == 0)
516 1.1 lukem ccl->ranges = (_ure_range_t *) malloc(sizeof(_ure_range_t) << 3);
517 1.1 lukem else
518 1.1 lukem ccl->ranges = (_ure_range_t *)
519 1.1 lukem realloc((char *) ccl->ranges,
520 1.1 lukem sizeof(_ure_range_t) * (ccl->ranges_size + 8));
521 1.1 lukem ccl->ranges_size += 8;
522 1.1 lukem }
523 1.1 lukem
524 1.1 lukem rp = ccl->ranges + ccl->ranges_used;
525 1.1 lukem
526 1.1 lukem if (i < ccl->ranges_used)
527 1.1 lukem _ure_memmove((char *) (rp + 1), (char *) rp,
528 1.1 lukem sizeof(_ure_range_t) * (ccl->ranges_used - i));
529 1.1 lukem
530 1.1 lukem ccl->ranges_used++;
531 1.1 lukem rp->min_code = r->min_code;
532 1.1 lukem rp->max_code = r->max_code;
533 1.1 lukem }
534 1.1 lukem
535 1.1 lukem #define _URE_ALPHA_MASK (_URE_UPPER|_URE_LOWER|_URE_OTHERLETTER|\
536 1.1 lukem _URE_MODIFIER|_URE_TITLE|_URE_NONSPACING|_URE_COMBINING)
537 1.1 lukem #define _URE_ALNUM_MASK (_URE_ALPHA_MASK|_URE_NUMDIGIT)
538 1.1 lukem #define _URE_PUNCT_MASK (_URE_DASHPUNCT|_URE_OPENPUNCT|_URE_CLOSEPUNCT|\
539 1.1 lukem _URE_OTHERPUNCT)
540 1.1 lukem #define _URE_GRAPH_MASK (_URE_NUMDIGIT|_URE_NUMOTHER|_URE_ALPHA_MASK|\
541 1.1 lukem _URE_MATHSYM|_URE_CURRENCYSYM|_URE_OTHERSYM)
542 1.1 lukem #define _URE_PRINT_MASK (_URE_GRAPH_MASK|_URE_SPACESEP)
543 1.1 lukem #define _URE_SPACE_MASK (_URE_SPACESEP|_URE_LINESEP|_URE_PARASEP)
544 1.1 lukem
545 1.1 lukem typedef void (*_ure_cclsetup_t)(
546 1.1 lukem _ure_symtab_t *sym,
547 1.1 lukem unsigned long mask,
548 1.1 lukem _ure_buffer_t *b
549 1.1 lukem );
550 1.1 lukem
551 1.1 lukem typedef struct {
552 1.1 lukem ucs2_t key;
553 1.1 lukem unsigned long len;
554 1.1 lukem unsigned long next;
555 1.1 lukem _ure_cclsetup_t func;
556 1.1 lukem unsigned long mask;
557 1.1 lukem } _ure_trie_t;
558 1.1 lukem
559 1.1 lukem static void
560 1.1 lukem _ure_ccl_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b)
561 1.1 lukem {
562 1.1 lukem sym->props |= mask;
563 1.1 lukem }
564 1.1 lukem
565 1.1 lukem static void
566 1.1 lukem _ure_space_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b)
567 1.1 lukem {
568 1.1 lukem _ure_range_t range;
569 1.1 lukem
570 1.1 lukem sym->props |= mask;
571 1.1 lukem
572 1.1 lukem /*
573 1.1 lukem * Add the additional characters needed for handling isspace().
574 1.1 lukem */
575 1.1 lukem range.min_code = range.max_code = '\t';
576 1.1 lukem _ure_add_range(&sym->sym.ccl, &range, b);
577 1.1 lukem range.min_code = range.max_code = '\r';
578 1.1 lukem _ure_add_range(&sym->sym.ccl, &range, b);
579 1.1 lukem range.min_code = range.max_code = '\n';
580 1.1 lukem _ure_add_range(&sym->sym.ccl, &range, b);
581 1.1 lukem range.min_code = range.max_code = '\f';
582 1.1 lukem _ure_add_range(&sym->sym.ccl, &range, b);
583 1.1 lukem range.min_code = range.max_code = 0xfeff;
584 1.1 lukem _ure_add_range(&sym->sym.ccl, &range, b);
585 1.1 lukem }
586 1.1 lukem
587 1.1 lukem static void
588 1.1 lukem _ure_xdigit_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b)
589 1.1 lukem {
590 1.1 lukem _ure_range_t range;
591 1.1 lukem
592 1.1 lukem /*
593 1.1 lukem * Add the additional characters needed for handling isxdigit().
594 1.1 lukem */
595 1.1 lukem range.min_code = '0';
596 1.1 lukem range.max_code = '9';
597 1.1 lukem _ure_add_range(&sym->sym.ccl, &range, b);
598 1.1 lukem range.min_code = 'A';
599 1.1 lukem range.max_code = 'F';
600 1.1 lukem _ure_add_range(&sym->sym.ccl, &range, b);
601 1.1 lukem range.min_code = 'a';
602 1.1 lukem range.max_code = 'f';
603 1.1 lukem _ure_add_range(&sym->sym.ccl, &range, b);
604 1.1 lukem }
605 1.1 lukem
606 1.1 lukem static _ure_trie_t cclass_trie[] = {
607 1.1 lukem {0x003a, 1, 1, 0, 0},
608 1.1 lukem {0x0061, 9, 10, 0, 0},
609 1.1 lukem {0x0063, 8, 19, 0, 0},
610 1.1 lukem {0x0064, 7, 24, 0, 0},
611 1.1 lukem {0x0067, 6, 29, 0, 0},
612 1.1 lukem {0x006c, 5, 34, 0, 0},
613 1.1 lukem {0x0070, 4, 39, 0, 0},
614 1.1 lukem {0x0073, 3, 49, 0, 0},
615 1.1 lukem {0x0075, 2, 54, 0, 0},
616 1.1 lukem {0x0078, 1, 59, 0, 0},
617 1.1 lukem {0x006c, 1, 11, 0, 0},
618 1.1 lukem {0x006e, 2, 13, 0, 0},
619 1.1 lukem {0x0070, 1, 16, 0, 0},
620 1.1 lukem {0x0075, 1, 14, 0, 0},
621 1.1 lukem {0x006d, 1, 15, 0, 0},
622 1.1 lukem {0x003a, 1, 16, _ure_ccl_setup, _URE_ALNUM_MASK},
623 1.1 lukem {0x0068, 1, 17, 0, 0},
624 1.1 lukem {0x0061, 1, 18, 0, 0},
625 1.1 lukem {0x003a, 1, 19, _ure_ccl_setup, _URE_ALPHA_MASK},
626 1.1 lukem {0x006e, 1, 20, 0, 0},
627 1.1 lukem {0x0074, 1, 21, 0, 0},
628 1.1 lukem {0x0072, 1, 22, 0, 0},
629 1.1 lukem {0x006c, 1, 23, 0, 0},
630 1.1 lukem {0x003a, 1, 24, _ure_ccl_setup, _URE_CNTRL},
631 1.1 lukem {0x0069, 1, 25, 0, 0},
632 1.1 lukem {0x0067, 1, 26, 0, 0},
633 1.1 lukem {0x0069, 1, 27, 0, 0},
634 1.1 lukem {0x0074, 1, 28, 0, 0},
635 1.1 lukem {0x003a, 1, 29, _ure_ccl_setup, _URE_NUMDIGIT},
636 1.1 lukem {0x0072, 1, 30, 0, 0},
637 1.1 lukem {0x0061, 1, 31, 0, 0},
638 1.1 lukem {0x0070, 1, 32, 0, 0},
639 1.1 lukem {0x0068, 1, 33, 0, 0},
640 1.1 lukem {0x003a, 1, 34, _ure_ccl_setup, _URE_GRAPH_MASK},
641 1.1 lukem {0x006f, 1, 35, 0, 0},
642 1.1 lukem {0x0077, 1, 36, 0, 0},
643 1.1 lukem {0x0065, 1, 37, 0, 0},
644 1.1 lukem {0x0072, 1, 38, 0, 0},
645 1.1 lukem {0x003a, 1, 39, _ure_ccl_setup, _URE_LOWER},
646 1.1 lukem {0x0072, 2, 41, 0, 0},
647 1.1 lukem {0x0075, 1, 45, 0, 0},
648 1.1 lukem {0x0069, 1, 42, 0, 0},
649 1.1 lukem {0x006e, 1, 43, 0, 0},
650 1.1 lukem {0x0074, 1, 44, 0, 0},
651 1.1 lukem {0x003a, 1, 45, _ure_ccl_setup, _URE_PRINT_MASK},
652 1.1 lukem {0x006e, 1, 46, 0, 0},
653 1.1 lukem {0x0063, 1, 47, 0, 0},
654 1.1 lukem {0x0074, 1, 48, 0, 0},
655 1.1 lukem {0x003a, 1, 49, _ure_ccl_setup, _URE_PUNCT_MASK},
656 1.1 lukem {0x0070, 1, 50, 0, 0},
657 1.1 lukem {0x0061, 1, 51, 0, 0},
658 1.1 lukem {0x0063, 1, 52, 0, 0},
659 1.1 lukem {0x0065, 1, 53, 0, 0},
660 1.1 lukem {0x003a, 1, 54, _ure_space_setup, _URE_SPACE_MASK},
661 1.1 lukem {0x0070, 1, 55, 0, 0},
662 1.1 lukem {0x0070, 1, 56, 0, 0},
663 1.1 lukem {0x0065, 1, 57, 0, 0},
664 1.1 lukem {0x0072, 1, 58, 0, 0},
665 1.1 lukem {0x003a, 1, 59, _ure_ccl_setup, _URE_UPPER},
666 1.1 lukem {0x0064, 1, 60, 0, 0},
667 1.1 lukem {0x0069, 1, 61, 0, 0},
668 1.1 lukem {0x0067, 1, 62, 0, 0},
669 1.1 lukem {0x0069, 1, 63, 0, 0},
670 1.1 lukem {0x0074, 1, 64, 0, 0},
671 1.1 lukem {0x003a, 1, 65, _ure_xdigit_setup, 0},
672 1.1 lukem };
673 1.1 lukem
674 1.1 lukem /*
675 1.1 lukem * Probe for one of the POSIX colon delimited character classes in the static
676 1.1 lukem * trie.
677 1.1 lukem */
678 1.1 lukem static unsigned long
679 1.1 lukem _ure_posix_ccl(ucs2_t *cp, unsigned long limit, _ure_symtab_t *sym,
680 1.1 lukem _ure_buffer_t *b)
681 1.1 lukem {
682 1.1 lukem int i;
683 1.1 lukem unsigned long n;
684 1.1 lukem _ure_trie_t *tp;
685 1.1 lukem ucs2_t *sp, *ep;
686 1.1 lukem
687 1.1 lukem /*
688 1.1 lukem * If the number of characters left is less than 7, then this cannot be
689 1.1 lukem * interpreted as one of the colon delimited classes.
690 1.1 lukem */
691 1.1 lukem if (limit < 7)
692 1.1 lukem return 0;
693 1.1 lukem
694 1.1 lukem sp = cp;
695 1.1 lukem ep = sp + limit;
696 1.1 lukem tp = cclass_trie;
697 1.1 lukem for (i = 0; sp < ep && i < 8; i++, sp++) {
698 1.1 lukem n = tp->len;
699 1.1 lukem
700 1.1 lukem for (; n > 0 && tp->key != *sp; tp++, n--) ;
701 1.1 lukem
702 1.1 lukem if (n == 0)
703 1.1 lukem return 0;
704 1.1 lukem
705 1.1 lukem if (*sp == ':' && (i == 6 || i == 7)) {
706 1.1 lukem sp++;
707 1.1 lukem break;
708 1.1 lukem }
709 1.1 lukem if (sp + 1 < ep)
710 1.1 lukem tp = cclass_trie + tp->next;
711 1.1 lukem }
712 1.1 lukem if (tp->func == 0)
713 1.1 lukem return 0;
714 1.1 lukem
715 1.1 lukem (*tp->func)(sym, tp->mask, b);
716 1.1 lukem
717 1.1 lukem return sp - cp;
718 1.1 lukem }
719 1.1 lukem
720 1.1 lukem /*
721 1.1 lukem * Construct a list of ranges and return the number of characters consumed.
722 1.1 lukem */
723 1.1 lukem static unsigned long
724 1.1 lukem _ure_cclass(ucs2_t *cp, unsigned long limit, _ure_symtab_t *symp,
725 1.1 lukem _ure_buffer_t *b)
726 1.1 lukem {
727 1.1 lukem int range_end;
728 1.1 lukem unsigned long n;
729 1.1 lukem ucs2_t *sp, *ep;
730 1.1 lukem ucs4_t c, last;
731 1.1 lukem _ure_ccl_t *cclp;
732 1.1 lukem _ure_range_t range;
733 1.1 lukem
734 1.1 lukem sp = cp;
735 1.1 lukem ep = sp + limit;
736 1.1 lukem
737 1.1 lukem if (*sp == '^') {
738 1.1 lukem symp->type = _URE_NCCLASS;
739 1.1 lukem sp++;
740 1.1 lukem } else
741 1.1 lukem symp->type = _URE_CCLASS;
742 1.1 lukem
743 1.1 lukem for (last = 0, range_end = 0;
744 1.1 lukem b->error == _URE_OK && sp < ep && *sp != ']'; ) {
745 1.1 lukem c = *sp++;
746 1.1 lukem if (c == '\\') {
747 1.1 lukem if (sp == ep) {
748 1.1 lukem /*
749 1.1 lukem * The EOS was encountered when expecting the reverse solidus
750 1.1 lukem * to be followed by the character it is escaping. Set an
751 1.1 lukem * error code and return the number of characters consumed up
752 1.1 lukem * to this point.
753 1.1 lukem */
754 1.1 lukem b->error = _URE_UNEXPECTED_EOS;
755 1.1 lukem return sp - cp;
756 1.1 lukem }
757 1.1 lukem
758 1.1 lukem c = *sp++;
759 1.1 lukem switch (c) {
760 1.1 lukem case 'a':
761 1.1 lukem c = 0x07;
762 1.1 lukem break;
763 1.1 lukem case 'b':
764 1.1 lukem c = 0x08;
765 1.1 lukem break;
766 1.1 lukem case 'f':
767 1.1 lukem c = 0x0c;
768 1.1 lukem break;
769 1.1 lukem case 'n':
770 1.1 lukem c = 0x0a;
771 1.1 lukem break;
772 1.1 lukem case 'r':
773 1.1 lukem c = 0x0d;
774 1.1 lukem break;
775 1.1 lukem case 't':
776 1.1 lukem c = 0x09;
777 1.1 lukem break;
778 1.1 lukem case 'v':
779 1.1 lukem c = 0x0b;
780 1.1 lukem break;
781 1.1 lukem case 'p':
782 1.1 lukem case 'P':
783 1.1 lukem sp += _ure_prop_list(sp, ep - sp, &symp->props, b);
784 1.1 lukem /*
785 1.1 lukem * Invert the bit mask of the properties if this is a negated
786 1.1 lukem * character class or if 'P' is used to specify a list of
787 1.1 lukem * character properties that should *not* match in a
788 1.1 lukem * character class.
789 1.1 lukem */
790 1.1 lukem if (c == 'P')
791 1.1 lukem symp->props = ~symp->props;
792 1.1 lukem continue;
793 1.1 lukem break;
794 1.1 lukem case 'x':
795 1.1 lukem case 'X':
796 1.1 lukem case 'u':
797 1.1 lukem case 'U':
798 1.1 lukem if (sp < ep &&
799 1.1 lukem ((*sp >= '0' && *sp <= '9') ||
800 1.1 lukem (*sp >= 'A' && *sp <= 'F') ||
801 1.1 lukem (*sp >= 'a' && *sp <= 'f')))
802 1.1 lukem sp += _ure_hex(sp, ep - sp, &c);
803 1.1 lukem }
804 1.1 lukem } else if (c == ':') {
805 1.1 lukem /*
806 1.1 lukem * Probe for a POSIX colon delimited character class.
807 1.1 lukem */
808 1.1 lukem sp--;
809 1.1 lukem if ((n = _ure_posix_ccl(sp, ep - sp, symp, b)) == 0)
810 1.1 lukem sp++;
811 1.1 lukem else {
812 1.1 lukem sp += n;
813 1.1 lukem continue;
814 1.1 lukem }
815 1.1 lukem }
816 1.1 lukem
817 1.1 lukem cclp = &symp->sym.ccl;
818 1.1 lukem
819 1.1 lukem /*
820 1.1 lukem * Check to see if the current character is a low surrogate that needs
821 1.1 lukem * to be combined with a preceding high surrogate.
822 1.1 lukem */
823 1.1 lukem if (last != 0) {
824 1.1 lukem if (c >= 0xdc00 && c <= 0xdfff)
825 1.1 lukem /*
826 1.1 lukem * Construct the UTF16 character code.
827 1.1 lukem */
828 1.1 lukem c = 0x10000 + (((last & 0x03ff) << 10) | (c & 0x03ff));
829 1.1 lukem else {
830 1.1 lukem /*
831 1.1 lukem * Add the isolated high surrogate to the range.
832 1.1 lukem */
833 1.1 lukem if (range_end == 1)
834 1.1 lukem range.max_code = last & 0xffff;
835 1.1 lukem else
836 1.1 lukem range.min_code = range.max_code = last & 0xffff;
837 1.1 lukem
838 1.1 lukem _ure_add_range(cclp, &range, b);
839 1.1 lukem range_end = 0;
840 1.1 lukem }
841 1.1 lukem }
842 1.1 lukem
843 1.1 lukem /*
844 1.1 lukem * Clear the last character code.
845 1.1 lukem */
846 1.1 lukem last = 0;
847 1.1 lukem
848 1.1 lukem /*
849 1.1 lukem * This slightly awkward code handles the different cases needed to
850 1.1 lukem * construct a range.
851 1.1 lukem */
852 1.1 lukem if (c >= 0xd800 && c <= 0xdbff) {
853 1.1 lukem /*
854 1.1 lukem * If the high surrogate is followed by a range indicator, simply
855 1.1 lukem * add it as the range start. Otherwise, save it in case the next
856 1.1 lukem * character is a low surrogate.
857 1.1 lukem */
858 1.1 lukem if (*sp == '-') {
859 1.1 lukem sp++;
860 1.1 lukem range.min_code = c;
861 1.1 lukem range_end = 1;
862 1.1 lukem } else
863 1.1 lukem last = c;
864 1.1 lukem } else if (range_end == 1) {
865 1.1 lukem range.max_code = c;
866 1.1 lukem _ure_add_range(cclp, &range, b);
867 1.1 lukem range_end = 0;
868 1.1 lukem } else {
869 1.1 lukem range.min_code = range.max_code = c;
870 1.1 lukem if (*sp == '-') {
871 1.1 lukem sp++;
872 1.1 lukem range_end = 1;
873 1.1 lukem } else
874 1.1 lukem _ure_add_range(cclp, &range, b);
875 1.1 lukem }
876 1.1 lukem }
877 1.1 lukem
878 1.1 lukem if (sp < ep && *sp == ']')
879 1.1 lukem sp++;
880 1.1 lukem else
881 1.1 lukem /*
882 1.1 lukem * The parse was not terminated by the character class close symbol
883 1.1 lukem * (']'), so set an error code.
884 1.1 lukem */
885 1.1 lukem b->error = _URE_CCLASS_OPEN;
886 1.1 lukem
887 1.1 lukem return sp - cp;
888 1.1 lukem }
889 1.1 lukem
890 1.1 lukem /*
891 1.1 lukem * Probe for a low surrogate hex code.
892 1.1 lukem */
893 1.1 lukem static unsigned long
894 1.1 lukem _ure_probe_ls(ucs2_t *ls, unsigned long limit, ucs4_t *c)
895 1.1 lukem {
896 1.1 lukem ucs4_t i, code;
897 1.1 lukem ucs2_t *sp, *ep;
898 1.1 lukem
899 1.1 lukem for (i = code = 0, sp = ls, ep = sp + limit; i < 4 && sp < ep; sp++) {
900 1.1 lukem if (*sp >= '0' && *sp <= '9')
901 1.1 lukem code = (code << 4) + (*sp - '0');
902 1.1 lukem else if (*sp >= 'A' && *sp <= 'F')
903 1.1 lukem code = (code << 4) + ((*sp - 'A') + 10);
904 1.1 lukem else if (*sp >= 'a' && *sp <= 'f')
905 1.1 lukem code = (code << 4) + ((*sp - 'a') + 10);
906 1.1 lukem else
907 1.1 lukem break;
908 1.1 lukem }
909 1.1 lukem
910 1.1 lukem *c = code;
911 1.1 lukem return (0xdc00 <= code && code <= 0xdfff) ? sp - ls : 0;
912 1.1 lukem }
913 1.1 lukem
914 1.1 lukem static unsigned long
915 1.1 lukem _ure_compile_symbol(ucs2_t *sym, unsigned long limit, _ure_symtab_t *symp,
916 1.1 lukem _ure_buffer_t *b)
917 1.1 lukem {
918 1.1 lukem ucs4_t c;
919 1.1 lukem ucs2_t *sp, *ep;
920 1.1 lukem
921 1.1 lukem sp = sym;
922 1.1 lukem ep = sym + limit;
923 1.1 lukem
924 1.1 lukem if ((c = *sp++) == '\\') {
925 1.1 lukem
926 1.1 lukem if (sp == ep) {
927 1.1 lukem /*
928 1.1 lukem * The EOS was encountered when expecting the reverse solidus to
929 1.1 lukem * be followed by the character it is escaping. Set an error code
930 1.1 lukem * and return the number of characters consumed up to this point.
931 1.1 lukem */
932 1.1 lukem b->error = _URE_UNEXPECTED_EOS;
933 1.1 lukem return sp - sym;
934 1.1 lukem }
935 1.1 lukem
936 1.1 lukem c = *sp++;
937 1.1 lukem switch (c) {
938 1.1 lukem case 'p':
939 1.1 lukem case 'P':
940 1.1 lukem symp->type = (c == 'p') ? _URE_CCLASS : _URE_NCCLASS;
941 1.1 lukem sp += _ure_prop_list(sp, ep - sp, &symp->props, b);
942 1.1 lukem break;
943 1.1 lukem case 'a':
944 1.1 lukem symp->type = _URE_CHAR;
945 1.1 lukem symp->sym.chr = 0x07;
946 1.1 lukem break;
947 1.1 lukem case 'b':
948 1.1 lukem symp->type = _URE_CHAR;
949 1.1 lukem symp->sym.chr = 0x08;
950 1.1 lukem break;
951 1.1 lukem case 'f':
952 1.1 lukem symp->type = _URE_CHAR;
953 1.1 lukem symp->sym.chr = 0x0c;
954 1.1 lukem break;
955 1.1 lukem case 'n':
956 1.1 lukem symp->type = _URE_CHAR;
957 1.1 lukem symp->sym.chr = 0x0a;
958 1.1 lukem break;
959 1.1 lukem case 'r':
960 1.1 lukem symp->type = _URE_CHAR;
961 1.1 lukem symp->sym.chr = 0x0d;
962 1.1 lukem break;
963 1.1 lukem case 't':
964 1.1 lukem symp->type = _URE_CHAR;
965 1.1 lukem symp->sym.chr = 0x09;
966 1.1 lukem break;
967 1.1 lukem case 'v':
968 1.1 lukem symp->type = _URE_CHAR;
969 1.1 lukem symp->sym.chr = 0x0b;
970 1.1 lukem break;
971 1.1 lukem case 'x':
972 1.1 lukem case 'X':
973 1.1 lukem case 'u':
974 1.1 lukem case 'U':
975 1.1 lukem /*
976 1.1 lukem * Collect between 1 and 4 digits representing a UCS2 code. Fall
977 1.1 lukem * through to the next case.
978 1.1 lukem */
979 1.1 lukem if (sp < ep &&
980 1.1 lukem ((*sp >= '0' && *sp <= '9') ||
981 1.1 lukem (*sp >= 'A' && *sp <= 'F') ||
982 1.1 lukem (*sp >= 'a' && *sp <= 'f')))
983 1.1 lukem sp += _ure_hex(sp, ep - sp, &c);
984 1.1 lukem /* FALLTHROUGH */
985 1.1 lukem default:
986 1.1 lukem /*
987 1.1 lukem * Simply add an escaped character here.
988 1.1 lukem */
989 1.1 lukem symp->type = _URE_CHAR;
990 1.1 lukem symp->sym.chr = c;
991 1.1 lukem }
992 1.1 lukem } else if (c == '^' || c == '$')
993 1.1 lukem /*
994 1.1 lukem * Handle the BOL and EOL anchors. This actually consists simply of
995 1.1 lukem * setting a flag that indicates that the user supplied anchor match
996 1.1 lukem * function should be called. This needs to be done instead of simply
997 1.1 lukem * matching line/paragraph separators because beginning-of-text and
998 1.1 lukem * end-of-text tests are needed as well.
999 1.1 lukem */
1000 1.1 lukem symp->type = (c == '^') ? _URE_BOL_ANCHOR : _URE_EOL_ANCHOR;
1001 1.1 lukem else if (c == '[')
1002 1.1 lukem /*
1003 1.1 lukem * Construct a character class.
1004 1.1 lukem */
1005 1.1 lukem sp += _ure_cclass(sp, ep - sp, symp, b);
1006 1.1 lukem else if (c == '.')
1007 1.1 lukem symp->type = _URE_ANY_CHAR;
1008 1.1 lukem else {
1009 1.1 lukem symp->type = _URE_CHAR;
1010 1.1 lukem symp->sym.chr = c;
1011 1.1 lukem }
1012 1.1 lukem
1013 1.1 lukem /*
1014 1.1 lukem * If the symbol type happens to be a character and is a high surrogate,
1015 1.1 lukem * then probe forward to see if it is followed by a low surrogate that
1016 1.1 lukem * needs to be added.
1017 1.1 lukem */
1018 1.1 lukem if (sp < ep && symp->type == _URE_CHAR &&
1019 1.1 lukem 0xd800 <= symp->sym.chr && symp->sym.chr <= 0xdbff) {
1020 1.1 lukem
1021 1.1 lukem if (0xdc00 <= *sp && *sp <= 0xdfff) {
1022 1.1 lukem symp->sym.chr = 0x10000 + (((symp->sym.chr & 0x03ff) << 10) |
1023 1.1 lukem (*sp & 0x03ff));
1024 1.1 lukem sp++;
1025 1.1 lukem } else if (*sp == '\\' && (*(sp + 1) == 'x' || *(sp + 1) == 'X' ||
1026 1.1 lukem *(sp + 1) == 'u' || *(sp + 1) == 'U')) {
1027 1.1 lukem sp += _ure_probe_ls(sp + 2, ep - (sp + 2), &c);
1028 1.1 lukem if (0xdc00 <= c && c <= 0xdfff) {
1029 1.1 lukem /*
1030 1.1 lukem * Take into account the \[xu] in front of the hex code.
1031 1.1 lukem */
1032 1.1 lukem sp += 2;
1033 1.1 lukem symp->sym.chr = 0x10000 + (((symp->sym.chr & 0x03ff) << 10) |
1034 1.1 lukem (c & 0x03ff));
1035 1.1 lukem }
1036 1.1 lukem }
1037 1.1 lukem }
1038 1.1 lukem
1039 1.1 lukem /*
1040 1.1 lukem * Last, make sure any _URE_CHAR type symbols are changed to lower case if
1041 1.1 lukem * the `casefold' flag is set.
1042 1.1 lukem */
1043 1.1 lukem if ((b->flags & _URE_DFA_CASEFOLD) && symp->type == _URE_CHAR)
1044 1.1 lukem symp->sym.chr = _ure_tolower(symp->sym.chr);
1045 1.1 lukem
1046 1.1 lukem /*
1047 1.1 lukem * If the symbol constructed is anything other than one of the anchors,
1048 1.1 lukem * make sure the _URE_DFA_BLANKLINE flag is removed.
1049 1.1 lukem */
1050 1.1 lukem if (symp->type != _URE_BOL_ANCHOR && symp->type != _URE_EOL_ANCHOR)
1051 1.1 lukem b->flags &= ~_URE_DFA_BLANKLINE;
1052 1.1 lukem
1053 1.1 lukem /*
1054 1.1 lukem * Return the number of characters consumed.
1055 1.1 lukem */
1056 1.1 lukem return sp - sym;
1057 1.1 lukem }
1058 1.1 lukem
1059 1.1 lukem static int
1060 1.1 lukem _ure_sym_neq(_ure_symtab_t *a, _ure_symtab_t *b)
1061 1.1 lukem {
1062 1.1 lukem if (a->type != b->type || a->mods != b->mods || a->props != b->props)
1063 1.1 lukem return 1;
1064 1.1 lukem
1065 1.1 lukem if (a->type == _URE_CCLASS || a->type == _URE_NCCLASS) {
1066 1.1 lukem if (a->sym.ccl.ranges_used != b->sym.ccl.ranges_used)
1067 1.1 lukem return 1;
1068 1.1 lukem if (a->sym.ccl.ranges_used > 0 &&
1069 1.1 lukem memcmp((char *) a->sym.ccl.ranges, (char *) b->sym.ccl.ranges,
1070 1.1 lukem sizeof(_ure_range_t) * a->sym.ccl.ranges_used) != 0)
1071 1.1 lukem return 1;
1072 1.1 lukem } else if (a->type == _URE_CHAR && a->sym.chr != b->sym.chr)
1073 1.1 lukem return 1;
1074 1.1 lukem return 0;
1075 1.1 lukem }
1076 1.1 lukem
1077 1.1 lukem /*
1078 1.1 lukem * Construct a symbol, but only keep unique symbols.
1079 1.1 lukem */
1080 1.1 lukem static ucs2_t
1081 1.1 lukem _ure_make_symbol(ucs2_t *sym, unsigned long limit, unsigned long *consumed,
1082 1.1 lukem _ure_buffer_t *b)
1083 1.1 lukem {
1084 1.1 lukem ucs2_t i;
1085 1.1 lukem _ure_symtab_t *sp, symbol;
1086 1.1 lukem
1087 1.1 lukem /*
1088 1.1 lukem * Build the next symbol so we can test to see if it is already in the
1089 1.1 lukem * symbol table.
1090 1.1 lukem */
1091 1.1 lukem (void) memset((char *) &symbol, '\0', sizeof(_ure_symtab_t));
1092 1.1 lukem *consumed = _ure_compile_symbol(sym, limit, &symbol, b);
1093 1.1 lukem
1094 1.1 lukem /*
1095 1.1 lukem * Check to see if the symbol exists.
1096 1.1 lukem */
1097 1.1 lukem for (i = 0, sp = b->symtab;
1098 1.1 lukem i < b->symtab_used && _ure_sym_neq(&symbol, sp); i++, sp++) ;
1099 1.1 lukem
1100 1.1 lukem if (i < b->symtab_used) {
1101 1.1 lukem /*
1102 1.1 lukem * Free up any ranges used for the symbol.
1103 1.1 lukem */
1104 1.1 lukem if ((symbol.type == _URE_CCLASS || symbol.type == _URE_NCCLASS) &&
1105 1.1 lukem symbol.sym.ccl.ranges_size > 0)
1106 1.1 lukem free((char *) symbol.sym.ccl.ranges);
1107 1.1 lukem
1108 1.1 lukem return b->symtab[i].id;
1109 1.1 lukem }
1110 1.1 lukem
1111 1.1 lukem /*
1112 1.1 lukem * Need to add the new symbol.
1113 1.1 lukem */
1114 1.1 lukem if (b->symtab_used == b->symtab_size) {
1115 1.1 lukem if (b->symtab_size == 0)
1116 1.1 lukem b->symtab = (_ure_symtab_t *) malloc(sizeof(_ure_symtab_t) << 3);
1117 1.1 lukem else
1118 1.1 lukem b->symtab = (_ure_symtab_t *)
1119 1.1 lukem realloc((char *) b->symtab,
1120 1.1 lukem sizeof(_ure_symtab_t) * (b->symtab_size + 8));
1121 1.1 lukem sp = b->symtab + b->symtab_size;
1122 1.1 lukem (void) memset((char *) sp, '\0', sizeof(_ure_symtab_t) << 3);
1123 1.1 lukem b->symtab_size += 8;
1124 1.1 lukem }
1125 1.1 lukem
1126 1.1 lukem symbol.id = b->symtab_used++;
1127 1.1 lukem (void) AC_MEMCPY((char *) &b->symtab[symbol.id], (char *) &symbol,
1128 1.1 lukem sizeof(_ure_symtab_t));
1129 1.1 lukem
1130 1.1 lukem return symbol.id;
1131 1.1 lukem }
1132 1.1 lukem
1133 1.1 lukem /*************************************************************************
1134 1.1 lukem *
1135 1.1 lukem * End symbol parse functions.
1136 1.1 lukem *
1137 1.1 lukem *************************************************************************/
1138 1.1 lukem
1139 1.1 lukem static ucs2_t
1140 1.1 lukem _ure_make_expr(ucs2_t type, ucs2_t lhs, ucs2_t rhs, _ure_buffer_t *b)
1141 1.1 lukem {
1142 1.1 lukem ucs2_t i;
1143 1.1 lukem
1144 1.1 lukem if (b == 0)
1145 1.1 lukem return _URE_NOOP;
1146 1.1 lukem
1147 1.1 lukem /*
1148 1.1 lukem * Determine if the expression already exists or not.
1149 1.1 lukem */
1150 1.1 lukem for (i = 0; i < b->expr_used; i++) {
1151 1.1 lukem if (b->expr[i].type == type && b->expr[i].lhs == lhs &&
1152 1.1 lukem b->expr[i].rhs == rhs)
1153 1.1 lukem break;
1154 1.1 lukem }
1155 1.1 lukem if (i < b->expr_used)
1156 1.1 lukem return i;
1157 1.1 lukem
1158 1.1 lukem /*
1159 1.1 lukem * Need to add a new expression.
1160 1.1 lukem */
1161 1.1 lukem if (b->expr_used == b->expr_size) {
1162 1.1 lukem if (b->expr_size == 0)
1163 1.1 lukem b->expr = (_ure_elt_t *) malloc(sizeof(_ure_elt_t) << 3);
1164 1.1 lukem else
1165 1.1 lukem b->expr = (_ure_elt_t *)
1166 1.1 lukem realloc((char *) b->expr,
1167 1.1 lukem sizeof(_ure_elt_t) * (b->expr_size + 8));
1168 1.1 lukem b->expr_size += 8;
1169 1.1 lukem }
1170 1.1 lukem
1171 1.1 lukem b->expr[b->expr_used].onstack = 0;
1172 1.1 lukem b->expr[b->expr_used].type = type;
1173 1.1 lukem b->expr[b->expr_used].lhs = lhs;
1174 1.1 lukem b->expr[b->expr_used].rhs = rhs;
1175 1.1 lukem
1176 1.1 lukem return b->expr_used++;
1177 1.1 lukem }
1178 1.1 lukem
1179 1.1 lukem static unsigned char spmap[] = {
1180 1.1 lukem 0x00, 0x00, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00,
1181 1.1 lukem 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1182 1.1 lukem 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1183 1.1 lukem };
1184 1.1 lukem
1185 1.1 lukem #define _ure_isspecial(cc) ((cc) > 0x20 && (cc) < 0x7f && \
1186 1.1 lukem (spmap[(cc) >> 3] & (1 << ((cc) & 7))))
1187 1.1 lukem
1188 1.1 lukem /*
1189 1.1 lukem * Convert the regular expression into an NFA in a form that will be easy to
1190 1.1 lukem * reduce to a DFA. The starting state for the reduction will be returned.
1191 1.1 lukem */
1192 1.1 lukem static ucs2_t
1193 1.1 lukem _ure_re2nfa(ucs2_t *re, unsigned long relen, _ure_buffer_t *b)
1194 1.1 lukem {
1195 1.1 lukem ucs2_t c, state, top, sym, *sp, *ep;
1196 1.1 lukem unsigned long used;
1197 1.1 lukem
1198 1.1 lukem state = _URE_NOOP;
1199 1.1 lukem
1200 1.1 lukem sp = re;
1201 1.1 lukem ep = sp + relen;
1202 1.1 lukem while (b->error == _URE_OK && sp < ep) {
1203 1.1 lukem c = *sp++;
1204 1.1 lukem switch (c) {
1205 1.1 lukem case '(':
1206 1.1 lukem _ure_push(_URE_PAREN, b);
1207 1.1 lukem break;
1208 1.1 lukem case ')':
1209 1.1 lukem /*
1210 1.1 lukem * Check for the case of too many close parentheses.
1211 1.1 lukem */
1212 1.1 lukem if (_ure_peek(b) == _URE_NOOP) {
1213 1.1 lukem b->error = _URE_UNBALANCED_GROUP;
1214 1.1 lukem break;
1215 1.1 lukem }
1216 1.1 lukem
1217 1.1 lukem while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR)
1218 1.1 lukem /*
1219 1.1 lukem * Make an expression with the AND or OR operator and its right
1220 1.1 lukem * hand side.
1221 1.1 lukem */
1222 1.1 lukem state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b);
1223 1.1 lukem
1224 1.1 lukem /*
1225 1.1 lukem * Remove the _URE_PAREN off the stack.
1226 1.1 lukem */
1227 1.1 lukem (void) _ure_pop(b);
1228 1.1 lukem break;
1229 1.1 lukem case '*':
1230 1.1 lukem state = _ure_make_expr(_URE_STAR, state, _URE_NOOP, b);
1231 1.1 lukem break;
1232 1.1 lukem case '+':
1233 1.1 lukem state = _ure_make_expr(_URE_PLUS, state, _URE_NOOP, b);
1234 1.1 lukem break;
1235 1.1 lukem case '?':
1236 1.1 lukem state = _ure_make_expr(_URE_QUEST, state, _URE_NOOP, b);
1237 1.1 lukem break;
1238 1.1 lukem case '|':
1239 1.1 lukem while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR)
1240 1.1 lukem /*
1241 1.1 lukem * Make an expression with the AND or OR operator and its right
1242 1.1 lukem * hand side.
1243 1.1 lukem */
1244 1.1 lukem state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b);
1245 1.1 lukem
1246 1.1 lukem _ure_push(state, b);
1247 1.1 lukem _ure_push(_URE_OR, b);
1248 1.1 lukem break;
1249 1.1 lukem default:
1250 1.1 lukem sp--;
1251 1.1 lukem sym = _ure_make_symbol(sp, ep - sp, &used, b);
1252 1.1 lukem sp += used;
1253 1.1 lukem state = _ure_make_expr(_URE_SYMBOL, sym, _URE_NOOP, b);
1254 1.1 lukem break;
1255 1.1 lukem }
1256 1.1 lukem
1257 1.1 lukem if (c != '(' && c != '|' && sp < ep &&
1258 1.1 lukem (!_ure_isspecial(*sp) || *sp == '(')) {
1259 1.1 lukem _ure_push(state, b);
1260 1.1 lukem _ure_push(_URE_AND, b);
1261 1.1 lukem }
1262 1.1 lukem }
1263 1.1 lukem while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR)
1264 1.1 lukem /*
1265 1.1 lukem * Make an expression with the AND or OR operator and its right
1266 1.1 lukem * hand side.
1267 1.1 lukem */
1268 1.1 lukem state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b);
1269 1.1 lukem
1270 1.1 lukem if (b->stack.slist_used > 0)
1271 1.1 lukem b->error = _URE_UNBALANCED_GROUP;
1272 1.1 lukem
1273 1.1 lukem return (b->error == _URE_OK) ? state : _URE_NOOP;
1274 1.1 lukem }
1275 1.1 lukem
1276 1.1 lukem static void
1277 1.1 lukem _ure_add_symstate(ucs2_t sym, ucs2_t state, _ure_buffer_t *b)
1278 1.1 lukem {
1279 1.1 lukem ucs2_t i, *stp;
1280 1.1 lukem _ure_symtab_t *sp;
1281 1.1 lukem
1282 1.1 lukem /*
1283 1.1 lukem * Locate the symbol in the symbol table so the state can be added.
1284 1.1 lukem * If the symbol doesn't exist, then a real problem exists.
1285 1.1 lukem */
1286 1.1 lukem for (i = 0, sp = b->symtab; i < b->symtab_used && sym != sp->id;
1287 1.1 lukem i++, sp++) ;
1288 1.1 lukem
1289 1.1 lukem /*
1290 1.1 lukem * Now find out if the state exists in the symbol's state list.
1291 1.1 lukem */
1292 1.1 lukem for (i = 0, stp = sp->states.slist;
1293 1.1 lukem i < sp->states.slist_used && state > *stp; i++, stp++) ;
1294 1.1 lukem
1295 1.1 lukem if (i == sp->states.slist_used || state < *stp) {
1296 1.1 lukem /*
1297 1.1 lukem * Need to add the state in order.
1298 1.1 lukem */
1299 1.1 lukem if (sp->states.slist_used == sp->states.slist_size) {
1300 1.1 lukem if (sp->states.slist_size == 0)
1301 1.1 lukem sp->states.slist = (ucs2_t *) malloc(sizeof(ucs2_t) << 3);
1302 1.1 lukem else
1303 1.1 lukem sp->states.slist = (ucs2_t *)
1304 1.1 lukem realloc((char *) sp->states.slist,
1305 1.1 lukem sizeof(ucs2_t) * (sp->states.slist_size + 8));
1306 1.1 lukem sp->states.slist_size += 8;
1307 1.1 lukem }
1308 1.1 lukem if (i < sp->states.slist_used)
1309 1.1 lukem (void) _ure_memmove((char *) (sp->states.slist + i + 1),
1310 1.1 lukem (char *) (sp->states.slist + i),
1311 1.1 lukem sizeof(ucs2_t) * (sp->states.slist_used - i));
1312 1.1 lukem sp->states.slist[i] = state;
1313 1.1 lukem sp->states.slist_used++;
1314 1.1 lukem }
1315 1.1 lukem }
1316 1.1 lukem
1317 1.1 lukem static ucs2_t
1318 1.1 lukem _ure_add_state(ucs2_t nstates, ucs2_t *states, _ure_buffer_t *b)
1319 1.1 lukem {
1320 1.1 lukem ucs2_t i;
1321 1.1 lukem _ure_state_t *sp;
1322 1.1 lukem
1323 1.1 lukem for (i = 0, sp = b->states.states; i < b->states.states_used; i++, sp++) {
1324 1.1 lukem if (sp->st.slist_used == nstates &&
1325 1.1 lukem memcmp((char *) states, (char *) sp->st.slist,
1326 1.1 lukem sizeof(ucs2_t) * nstates) == 0)
1327 1.1 lukem break;
1328 1.1 lukem }
1329 1.1 lukem
1330 1.1 lukem if (i == b->states.states_used) {
1331 1.1 lukem /*
1332 1.1 lukem * Need to add a new DFA state (set of NFA states).
1333 1.1 lukem */
1334 1.1 lukem if (b->states.states_used == b->states.states_size) {
1335 1.1 lukem if (b->states.states_size == 0)
1336 1.1 lukem b->states.states = (_ure_state_t *)
1337 1.1 lukem malloc(sizeof(_ure_state_t) << 3);
1338 1.1 lukem else
1339 1.1 lukem b->states.states = (_ure_state_t *)
1340 1.1 lukem realloc((char *) b->states.states,
1341 1.1 lukem sizeof(_ure_state_t) * (b->states.states_size + 8));
1342 1.1 lukem sp = b->states.states + b->states.states_size;
1343 1.1 lukem (void) memset((char *) sp, '\0', sizeof(_ure_state_t) << 3);
1344 1.1 lukem b->states.states_size += 8;
1345 1.1 lukem }
1346 1.1 lukem
1347 1.1 lukem sp = b->states.states + b->states.states_used++;
1348 1.1 lukem sp->id = i;
1349 1.1 lukem
1350 1.1 lukem if (sp->st.slist_used + nstates > sp->st.slist_size) {
1351 1.1 lukem if (sp->st.slist_size == 0)
1352 1.1 lukem sp->st.slist = (ucs2_t *)
1353 1.1 lukem malloc(sizeof(ucs2_t) * (sp->st.slist_used + nstates));
1354 1.1 lukem else
1355 1.1 lukem sp->st.slist = (ucs2_t *)
1356 1.1 lukem realloc((char *) sp->st.slist,
1357 1.1 lukem sizeof(ucs2_t) * (sp->st.slist_used + nstates));
1358 1.1 lukem sp->st.slist_size = sp->st.slist_used + nstates;
1359 1.1 lukem }
1360 1.1 lukem sp->st.slist_used = nstates;
1361 1.1 lukem (void) AC_MEMCPY((char *) sp->st.slist, (char *) states,
1362 1.1 lukem sizeof(ucs2_t) * nstates);
1363 1.1 lukem }
1364 1.1 lukem
1365 1.1 lukem /*
1366 1.1 lukem * Return the ID of the DFA state representing a group of NFA states.
1367 1.1 lukem */
1368 1.1 lukem return i;
1369 1.1 lukem }
1370 1.1 lukem
1371 1.1 lukem static void
1372 1.1 lukem _ure_reduce(ucs2_t start, _ure_buffer_t *b)
1373 1.1 lukem {
1374 1.1 lukem ucs2_t i, j, state, eval, syms, rhs;
1375 1.1 lukem ucs2_t s1, s2, ns1, ns2;
1376 1.1 lukem _ure_state_t *sp;
1377 1.1 lukem _ure_symtab_t *smp;
1378 1.1 lukem
1379 1.1 lukem b->reducing = 1;
1380 1.1 lukem
1381 1.1 lukem /*
1382 1.1 lukem * Add the starting state for the reduction.
1383 1.1 lukem */
1384 1.1 lukem _ure_add_state(1, &start, b);
1385 1.1 lukem
1386 1.1 lukem /*
1387 1.1 lukem * Process each set of NFA states that get created.
1388 1.1 lukem */
1389 1.1 lukem for (i = 0; i < b->states.states_used; i++) {
1390 1.1 lukem sp = b->states.states + i;
1391 1.1 lukem
1392 1.1 lukem /*
1393 1.1 lukem * Push the current states on the stack.
1394 1.1 lukem */
1395 1.1 lukem for (j = 0; j < sp->st.slist_used; j++)
1396 1.1 lukem _ure_push(sp->st.slist[j], b);
1397 1.1 lukem
1398 1.1 lukem /*
1399 1.1 lukem * Reduce the NFA states.
1400 1.1 lukem */
1401 1.1 lukem for (j = sp->accepting = syms = 0; j < b->stack.slist_used; j++) {
1402 1.1 lukem state = b->stack.slist[j];
1403 1.1 lukem eval = 1;
1404 1.1 lukem
1405 1.1 lukem /*
1406 1.1 lukem * This inner loop is the iterative equivalent of recursively
1407 1.1 lukem * reducing subexpressions generated as a result of a reduction.
1408 1.1 lukem */
1409 1.1 lukem while (eval) {
1410 1.1 lukem switch (b->expr[state].type) {
1411 1.1 lukem case _URE_SYMBOL:
1412 1.1 lukem ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b);
1413 1.1 lukem _ure_add_symstate(b->expr[state].lhs, ns1, b);
1414 1.1 lukem syms++;
1415 1.1 lukem eval = 0;
1416 1.1 lukem break;
1417 1.1 lukem case _URE_ONE:
1418 1.1 lukem sp->accepting = 1;
1419 1.1 lukem eval = 0;
1420 1.1 lukem break;
1421 1.1 lukem case _URE_QUEST:
1422 1.1 lukem s1 = b->expr[state].lhs;
1423 1.1 lukem ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b);
1424 1.1 lukem state = _ure_make_expr(_URE_OR, ns1, s1, b);
1425 1.1 lukem break;
1426 1.1 lukem case _URE_PLUS:
1427 1.1 lukem s1 = b->expr[state].lhs;
1428 1.1 lukem ns1 = _ure_make_expr(_URE_STAR, s1, _URE_NOOP, b);
1429 1.1 lukem state = _ure_make_expr(_URE_AND, s1, ns1, b);
1430 1.1 lukem break;
1431 1.1 lukem case _URE_STAR:
1432 1.1 lukem s1 = b->expr[state].lhs;
1433 1.1 lukem ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b);
1434 1.1 lukem ns2 = _ure_make_expr(_URE_PLUS, s1, _URE_NOOP, b);
1435 1.1 lukem state = _ure_make_expr(_URE_OR, ns1, ns2, b);
1436 1.1 lukem break;
1437 1.1 lukem case _URE_OR:
1438 1.1 lukem s1 = b->expr[state].lhs;
1439 1.1 lukem s2 = b->expr[state].rhs;
1440 1.1 lukem _ure_push(s1, b);
1441 1.1 lukem _ure_push(s2, b);
1442 1.1 lukem eval = 0;
1443 1.1 lukem break;
1444 1.1 lukem case _URE_AND:
1445 1.1 lukem s1 = b->expr[state].lhs;
1446 1.1 lukem s2 = b->expr[state].rhs;
1447 1.1 lukem switch (b->expr[s1].type) {
1448 1.1 lukem case _URE_SYMBOL:
1449 1.1 lukem _ure_add_symstate(b->expr[s1].lhs, s2, b);
1450 1.1 lukem syms++;
1451 1.1 lukem eval = 0;
1452 1.1 lukem break;
1453 1.1 lukem case _URE_ONE:
1454 1.1 lukem state = s2;
1455 1.1 lukem break;
1456 1.1 lukem case _URE_QUEST:
1457 1.1 lukem ns1 = b->expr[s1].lhs;
1458 1.1 lukem ns2 = _ure_make_expr(_URE_AND, ns1, s2, b);
1459 1.1 lukem state = _ure_make_expr(_URE_OR, s2, ns2, b);
1460 1.1 lukem break;
1461 1.1 lukem case _URE_PLUS:
1462 1.1 lukem ns1 = b->expr[s1].lhs;
1463 1.1 lukem ns2 = _ure_make_expr(_URE_OR, s2, state, b);
1464 1.1 lukem state = _ure_make_expr(_URE_AND, ns1, ns2, b);
1465 1.1 lukem break;
1466 1.1 lukem case _URE_STAR:
1467 1.1 lukem ns1 = b->expr[s1].lhs;
1468 1.1 lukem ns2 = _ure_make_expr(_URE_AND, ns1, state, b);
1469 1.1 lukem state = _ure_make_expr(_URE_OR, s2, ns2, b);
1470 1.1 lukem break;
1471 1.1 lukem case _URE_OR:
1472 1.1 lukem ns1 = b->expr[s1].lhs;
1473 1.1 lukem ns2 = b->expr[s1].rhs;
1474 1.1 lukem ns1 = _ure_make_expr(_URE_AND, ns1, s2, b);
1475 1.1 lukem ns2 = _ure_make_expr(_URE_AND, ns2, s2, b);
1476 1.1 lukem state = _ure_make_expr(_URE_OR, ns1, ns2, b);
1477 1.1 lukem break;
1478 1.1 lukem case _URE_AND:
1479 1.1 lukem ns1 = b->expr[s1].lhs;
1480 1.1 lukem ns2 = b->expr[s1].rhs;
1481 1.1 lukem ns2 = _ure_make_expr(_URE_AND, ns2, s2, b);
1482 1.1 lukem state = _ure_make_expr(_URE_AND, ns1, ns2, b);
1483 1.1 lukem break;
1484 1.1 lukem }
1485 1.1 lukem }
1486 1.1 lukem }
1487 1.1 lukem }
1488 1.1 lukem
1489 1.1 lukem /*
1490 1.1 lukem * Clear the state stack.
1491 1.1 lukem */
1492 1.1 lukem while (_ure_pop(b) != _URE_NOOP) ;
1493 1.1 lukem
1494 1.1 lukem /*
1495 1.1 lukem * Reset the state pointer because the reduction may have moved it
1496 1.1 lukem * during a reallocation.
1497 1.1 lukem */
1498 1.1 lukem sp = b->states.states + i;
1499 1.1 lukem
1500 1.1 lukem /*
1501 1.1 lukem * Generate the DFA states for the symbols collected during the
1502 1.1 lukem * current reduction.
1503 1.1 lukem */
1504 1.1 lukem if (sp->trans_used + syms > sp->trans_size) {
1505 1.1 lukem if (sp->trans_size == 0)
1506 1.1 lukem sp->trans = (_ure_elt_t *)
1507 1.1 lukem malloc(sizeof(_ure_elt_t) * (sp->trans_used + syms));
1508 1.1 lukem else
1509 1.1 lukem sp->trans = (_ure_elt_t *)
1510 1.1 lukem realloc((char *) sp->trans,
1511 1.1 lukem sizeof(_ure_elt_t) * (sp->trans_used + syms));
1512 1.1 lukem sp->trans_size = sp->trans_used + syms;
1513 1.1 lukem }
1514 1.1 lukem
1515 1.1 lukem /*
1516 1.1 lukem * Go through the symbol table and generate the DFA state transitions
1517 1.1 lukem * for each symbol that has collected NFA states.
1518 1.1 lukem */
1519 1.1 lukem for (j = syms = 0, smp = b->symtab; j < b->symtab_used; j++, smp++) {
1520 1.1 lukem sp = b->states.states + i;
1521 1.1 lukem
1522 1.1 lukem if (smp->states.slist_used > 0) {
1523 1.1 lukem sp->trans[syms].lhs = smp->id;
1524 1.1 lukem rhs = _ure_add_state(smp->states.slist_used,
1525 1.1 lukem smp->states.slist, b);
1526 1.1 lukem /*
1527 1.1 lukem * Reset the state pointer in case the reallocation moves it
1528 1.1 lukem * in memory.
1529 1.1 lukem */
1530 1.1 lukem sp = b->states.states + i;
1531 1.1 lukem sp->trans[syms].rhs = rhs;
1532 1.1 lukem
1533 1.1 lukem smp->states.slist_used = 0;
1534 1.1 lukem syms++;
1535 1.1 lukem }
1536 1.1 lukem }
1537 1.1 lukem
1538 1.1 lukem /*
1539 1.1 lukem * Set the number of transitions actually used.
1540 1.1 lukem */
1541 1.1 lukem sp->trans_used = syms;
1542 1.1 lukem }
1543 1.1 lukem b->reducing = 0;
1544 1.1 lukem }
1545 1.1 lukem
1546 1.1 lukem static void
1547 1.1 lukem _ure_add_equiv(ucs2_t l, ucs2_t r, _ure_buffer_t *b)
1548 1.1 lukem {
1549 1.1 lukem ucs2_t tmp;
1550 1.1 lukem
1551 1.1 lukem l = b->states.states[l].id;
1552 1.1 lukem r = b->states.states[r].id;
1553 1.1 lukem
1554 1.1 lukem if (l == r)
1555 1.1 lukem return;
1556 1.1 lukem
1557 1.1 lukem if (l > r) {
1558 1.1 lukem tmp = l;
1559 1.1 lukem l = r;
1560 1.1 lukem r = tmp;
1561 1.1 lukem }
1562 1.1 lukem
1563 1.1 lukem /*
1564 1.1 lukem * Check to see if the equivalence pair already exists.
1565 1.1 lukem */
1566 1.1 lukem for (tmp = 0; tmp < b->equiv_used &&
1567 1.1 lukem (b->equiv[tmp].l != l || b->equiv[tmp].r != r);
1568 1.1 lukem tmp++) ;
1569 1.1 lukem
1570 1.1 lukem if (tmp < b->equiv_used)
1571 1.1 lukem return;
1572 1.1 lukem
1573 1.1 lukem if (b->equiv_used == b->equiv_size) {
1574 1.1 lukem if (b->equiv_size == 0)
1575 1.1 lukem b->equiv = (_ure_equiv_t *) malloc(sizeof(_ure_equiv_t) << 3);
1576 1.1 lukem else
1577 1.1 lukem b->equiv = (_ure_equiv_t *) realloc((char *) b->equiv,
1578 1.1 lukem sizeof(_ure_equiv_t) *
1579 1.1 lukem (b->equiv_size + 8));
1580 1.1 lukem b->equiv_size += 8;
1581 1.1 lukem }
1582 1.1 lukem b->equiv[b->equiv_used].l = l;
1583 1.1 lukem b->equiv[b->equiv_used].r = r;
1584 1.1 lukem b->equiv_used++;
1585 1.1 lukem }
1586 1.1 lukem
1587 1.1 lukem /*
1588 1.1 lukem * Merge the DFA states that are equivalent.
1589 1.1 lukem */
1590 1.1 lukem static void
1591 1.1 lukem _ure_merge_equiv(_ure_buffer_t *b)
1592 1.1 lukem {
1593 1.1 lukem ucs2_t i, j, k, eq, done;
1594 1.1 lukem _ure_state_t *sp1, *sp2, *ls, *rs;
1595 1.1 lukem
1596 1.1 lukem for (i = 0; i < b->states.states_used; i++) {
1597 1.1 lukem sp1 = b->states.states + i;
1598 1.1 lukem if (sp1->id != i)
1599 1.1 lukem continue;
1600 1.1 lukem for (j = 0; j < i; j++) {
1601 1.1 lukem sp2 = b->states.states + j;
1602 1.1 lukem if (sp2->id != j)
1603 1.1 lukem continue;
1604 1.1 lukem b->equiv_used = 0;
1605 1.1 lukem _ure_add_equiv(i, j, b);
1606 1.1 lukem for (eq = 0, done = 0; eq < b->equiv_used; eq++) {
1607 1.1 lukem ls = b->states.states + b->equiv[eq].l;
1608 1.1 lukem rs = b->states.states + b->equiv[eq].r;
1609 1.1 lukem if (ls->accepting != rs->accepting ||
1610 1.1 lukem ls->trans_used != rs->trans_used) {
1611 1.1 lukem done = 1;
1612 1.1 lukem break;
1613 1.1 lukem }
1614 1.1 lukem for (k = 0; k < ls->trans_used &&
1615 1.1 lukem ls->trans[k].lhs == rs->trans[k].lhs; k++) ;
1616 1.1 lukem if (k < ls->trans_used) {
1617 1.1 lukem done = 1;
1618 1.1 lukem break;
1619 1.1 lukem }
1620 1.1 lukem
1621 1.1 lukem for (k = 0; k < ls->trans_used; k++)
1622 1.1 lukem _ure_add_equiv(ls->trans[k].rhs, rs->trans[k].rhs, b);
1623 1.1 lukem }
1624 1.1 lukem if (done == 0)
1625 1.1 lukem break;
1626 1.1 lukem }
1627 1.1 lukem for (eq = 0; j < i && eq < b->equiv_used; eq++)
1628 1.1 lukem b->states.states[b->equiv[eq].r].id =
1629 1.1 lukem b->states.states[b->equiv[eq].l].id;
1630 1.1 lukem }
1631 1.1 lukem
1632 1.1 lukem /*
1633 1.1 lukem * Renumber the states appropriately.
1634 1.1 lukem */
1635 1.1 lukem for (i = eq = 0, sp1 = b->states.states; i < b->states.states_used;
1636 1.1 lukem sp1++, i++)
1637 1.1 lukem sp1->id = (sp1->id == i) ? eq++ : b->states.states[sp1->id].id;
1638 1.1 lukem }
1639 1.1 lukem
1640 1.1 lukem /*************************************************************************
1641 1.1 lukem *
1642 1.1 lukem * API.
1643 1.1 lukem *
1644 1.1 lukem *************************************************************************/
1645 1.1 lukem
1646 1.1 lukem ure_buffer_t
1647 1.1 lukem ure_buffer_create(void)
1648 1.1 lukem {
1649 1.1 lukem ure_buffer_t b;
1650 1.1 lukem
1651 1.1 lukem b = (ure_buffer_t) calloc(1, sizeof(_ure_buffer_t));
1652 1.1 lukem
1653 1.1 lukem return b;
1654 1.1 lukem }
1655 1.1 lukem
1656 1.1 lukem void
1657 1.1 lukem ure_buffer_free(ure_buffer_t buf)
1658 1.1 lukem {
1659 1.1 lukem unsigned long i;
1660 1.1 lukem
1661 1.1 lukem if (buf == 0)
1662 1.1 lukem return;
1663 1.1 lukem
1664 1.1 lukem if (buf->stack.slist_size > 0)
1665 1.1 lukem free((char *) buf->stack.slist);
1666 1.1 lukem
1667 1.1 lukem if (buf->expr_size > 0)
1668 1.1 lukem free((char *) buf->expr);
1669 1.1 lukem
1670 1.1 lukem for (i = 0; i < buf->symtab_size; i++) {
1671 1.1 lukem if (buf->symtab[i].states.slist_size > 0)
1672 1.1 lukem free((char *) buf->symtab[i].states.slist);
1673 1.1 lukem }
1674 1.1 lukem
1675 1.1 lukem if (buf->symtab_size > 0)
1676 1.1 lukem free((char *) buf->symtab);
1677 1.1 lukem
1678 1.1 lukem for (i = 0; i < buf->states.states_size; i++) {
1679 1.1 lukem if (buf->states.states[i].trans_size > 0)
1680 1.1 lukem free((char *) buf->states.states[i].trans);
1681 1.1 lukem if (buf->states.states[i].st.slist_size > 0)
1682 1.1 lukem free((char *) buf->states.states[i].st.slist);
1683 1.1 lukem }
1684 1.1 lukem
1685 1.1 lukem if (buf->states.states_size > 0)
1686 1.1 lukem free((char *) buf->states.states);
1687 1.1 lukem
1688 1.1 lukem if (buf->equiv_size > 0)
1689 1.1 lukem free((char *) buf->equiv);
1690 1.1 lukem
1691 1.1 lukem free((char *) buf);
1692 1.1 lukem }
1693 1.1 lukem
1694 1.1 lukem ure_dfa_t
1695 1.1 lukem ure_compile(ucs2_t *re, unsigned long relen, int casefold, ure_buffer_t buf)
1696 1.1 lukem {
1697 1.1 lukem ucs2_t i, j, state;
1698 1.1 lukem _ure_state_t *sp;
1699 1.1 lukem _ure_dstate_t *dsp;
1700 1.1 lukem _ure_trans_t *tp;
1701 1.1 lukem ure_dfa_t dfa;
1702 1.1 lukem
1703 1.1 lukem if (re == 0 || *re == 0 || relen == 0 || buf == 0)
1704 1.1 lukem return 0;
1705 1.1 lukem
1706 1.1 lukem /*
1707 1.1 lukem * Reset the various fields of the compilation buffer. Default the flags
1708 1.1 lukem * to indicate the presense of the "^$" pattern. If any other pattern
1709 1.1 lukem * occurs, then this flag will be removed. This is done to catch this
1710 1.1 lukem * special pattern and handle it specially when matching.
1711 1.1 lukem */
1712 1.1 lukem buf->flags = _URE_DFA_BLANKLINE | ((casefold) ? _URE_DFA_CASEFOLD : 0);
1713 1.1 lukem buf->reducing = 0;
1714 1.1 lukem buf->stack.slist_used = 0;
1715 1.1 lukem buf->expr_used = 0;
1716 1.1 lukem
1717 1.1 lukem for (i = 0; i < buf->symtab_used; i++)
1718 1.1 lukem buf->symtab[i].states.slist_used = 0;
1719 1.1 lukem buf->symtab_used = 0;
1720 1.1 lukem
1721 1.1 lukem for (i = 0; i < buf->states.states_used; i++) {
1722 1.1 lukem buf->states.states[i].st.slist_used = 0;
1723 1.1 lukem buf->states.states[i].trans_used = 0;
1724 1.1 lukem }
1725 1.1 lukem buf->states.states_used = 0;
1726 1.1 lukem
1727 1.1 lukem /*
1728 1.1 lukem * Construct the NFA. If this stage returns a 0, then an error occured or
1729 1.1 lukem * an empty expression was passed.
1730 1.1 lukem */
1731 1.1 lukem if ((state = _ure_re2nfa(re, relen, buf)) == _URE_NOOP)
1732 1.1 lukem return 0;
1733 1.1 lukem
1734 1.1 lukem /*
1735 1.1 lukem * Do the expression reduction to get the initial DFA.
1736 1.1 lukem */
1737 1.1 lukem _ure_reduce(state, buf);
1738 1.1 lukem
1739 1.1 lukem /*
1740 1.1 lukem * Merge all the equivalent DFA states.
1741 1.1 lukem */
1742 1.1 lukem _ure_merge_equiv(buf);
1743 1.1 lukem
1744 1.1 lukem /*
1745 1.1 lukem * Construct the minimal DFA.
1746 1.1 lukem */
1747 1.1 lukem dfa = (ure_dfa_t) malloc(sizeof(_ure_dfa_t));
1748 1.1 lukem (void) memset((char *) dfa, '\0', sizeof(_ure_dfa_t));
1749 1.1 lukem
1750 1.1 lukem dfa->flags = buf->flags & (_URE_DFA_CASEFOLD|_URE_DFA_BLANKLINE);
1751 1.1 lukem
1752 1.1 lukem /*
1753 1.1 lukem * Free up the NFA state groups and transfer the symbols from the buffer
1754 1.1 lukem * to the DFA.
1755 1.1 lukem */
1756 1.1 lukem for (i = 0; i < buf->symtab_size; i++) {
1757 1.1 lukem if (buf->symtab[i].states.slist_size > 0)
1758 1.1 lukem free((char *) buf->symtab[i].states.slist);
1759 1.1 lukem }
1760 1.1 lukem dfa->syms = buf->symtab;
1761 1.1 lukem dfa->nsyms = buf->symtab_used;
1762 1.1 lukem
1763 1.1 lukem buf->symtab_used = buf->symtab_size = 0;
1764 1.1 lukem
1765 1.1 lukem /*
1766 1.1 lukem * Collect the total number of states and transitions needed for the DFA.
1767 1.1 lukem */
1768 1.1 lukem for (i = state = 0, sp = buf->states.states; i < buf->states.states_used;
1769 1.1 lukem i++, sp++) {
1770 1.1 lukem if (sp->id == state) {
1771 1.1 lukem dfa->nstates++;
1772 1.1 lukem dfa->ntrans += sp->trans_used;
1773 1.1 lukem state++;
1774 1.1 lukem }
1775 1.1 lukem }
1776 1.1 lukem
1777 1.1 lukem /*
1778 1.1 lukem * Allocate enough space for the states and transitions.
1779 1.1 lukem */
1780 1.1 lukem dfa->states = (_ure_dstate_t *) malloc(sizeof(_ure_dstate_t) *
1781 1.1 lukem dfa->nstates);
1782 1.1 lukem dfa->trans = (_ure_trans_t *) malloc(sizeof(_ure_trans_t) * dfa->ntrans);
1783 1.1 lukem
1784 1.1 lukem /*
1785 1.1 lukem * Actually transfer the DFA states from the buffer.
1786 1.1 lukem */
1787 1.1 lukem dsp = dfa->states;
1788 1.1 lukem tp = dfa->trans;
1789 1.1 lukem for (i = state = 0, sp = buf->states.states; i < buf->states.states_used;
1790 1.1 lukem i++, sp++) {
1791 1.1 lukem if (sp->id == state) {
1792 1.1 lukem dsp->trans = tp;
1793 1.1 lukem dsp->ntrans = sp->trans_used;
1794 1.1 lukem dsp->accepting = sp->accepting;
1795 1.1 lukem
1796 1.1 lukem /*
1797 1.1 lukem * Add the transitions for the state.
1798 1.1 lukem */
1799 1.1 lukem for (j = 0; j < dsp->ntrans; j++, tp++) {
1800 1.1 lukem tp->symbol = sp->trans[j].lhs;
1801 1.1 lukem tp->next_state = buf->states.states[sp->trans[j].rhs].id;
1802 1.1 lukem }
1803 1.1 lukem
1804 1.1 lukem dsp++;
1805 1.1 lukem state++;
1806 1.1 lukem }
1807 1.1 lukem }
1808 1.1 lukem
1809 1.1 lukem return dfa;
1810 1.1 lukem }
1811 1.1 lukem
1812 1.1 lukem void
1813 1.1 lukem ure_dfa_free(ure_dfa_t dfa)
1814 1.1 lukem {
1815 1.1 lukem ucs2_t i;
1816 1.1 lukem
1817 1.1 lukem if (dfa == 0)
1818 1.1 lukem return;
1819 1.1 lukem
1820 1.1 lukem for (i = 0; i < dfa->nsyms; i++) {
1821 1.1 lukem if ((dfa->syms[i].type == _URE_CCLASS ||
1822 1.1 lukem dfa->syms[i].type == _URE_NCCLASS) &&
1823 1.1 lukem dfa->syms[i].sym.ccl.ranges_size > 0)
1824 1.1 lukem free((char *) dfa->syms[i].sym.ccl.ranges);
1825 1.1 lukem }
1826 1.1 lukem if (dfa->nsyms > 0)
1827 1.1 lukem free((char *) dfa->syms);
1828 1.1 lukem
1829 1.1 lukem if (dfa->nstates > 0)
1830 1.1 lukem free((char *) dfa->states);
1831 1.1 lukem if (dfa->ntrans > 0)
1832 1.1 lukem free((char *) dfa->trans);
1833 1.1 lukem free((char *) dfa);
1834 1.1 lukem }
1835 1.1 lukem
1836 1.1 lukem void
1837 1.1 lukem ure_write_dfa(ure_dfa_t dfa, FILE *out)
1838 1.1 lukem {
1839 1.1 lukem ucs2_t i, j, k, h, l;
1840 1.1 lukem _ure_dstate_t *sp;
1841 1.1 lukem _ure_symtab_t *sym;
1842 1.1 lukem _ure_range_t *rp;
1843 1.1 lukem
1844 1.1 lukem if (dfa == 0 || out == 0)
1845 1.1 lukem return;
1846 1.1 lukem
1847 1.1 lukem /*
1848 1.1 lukem * Write all the different character classes.
1849 1.1 lukem */
1850 1.1 lukem for (i = 0, sym = dfa->syms; i < dfa->nsyms; i++, sym++) {
1851 1.1 lukem if (sym->type == _URE_CCLASS || sym->type == _URE_NCCLASS) {
1852 1.1 lukem fprintf(out, "C%hd = ", sym->id);
1853 1.1 lukem if (sym->sym.ccl.ranges_used > 0) {
1854 1.1 lukem putc('[', out);
1855 1.1 lukem if (sym->type == _URE_NCCLASS)
1856 1.1 lukem putc('^', out);
1857 1.1 lukem }
1858 1.1 lukem if (sym->props != 0) {
1859 1.1 lukem if (sym->type == _URE_NCCLASS)
1860 1.1 lukem fprintf(out, "\\P");
1861 1.1 lukem else
1862 1.1 lukem fprintf(out, "\\p");
1863 1.1 lukem for (k = h = 0; k < 32; k++) {
1864 1.1 lukem if (sym->props & (1 << k)) {
1865 1.1 lukem if (h != 0)
1866 1.1 lukem putc(',', out);
1867 1.1 lukem fprintf(out, "%hd", k + 1);
1868 1.1 lukem h = 1;
1869 1.1 lukem }
1870 1.1 lukem }
1871 1.1 lukem }
1872 1.1 lukem /*
1873 1.1 lukem * Dump the ranges.
1874 1.1 lukem */
1875 1.1 lukem for (k = 0, rp = sym->sym.ccl.ranges;
1876 1.1 lukem k < sym->sym.ccl.ranges_used; k++, rp++) {
1877 1.1 lukem /*
1878 1.1 lukem * Check for UTF16 characters.
1879 1.1 lukem */
1880 1.1 lukem if (0x10000 <= rp->min_code &&
1881 1.1 lukem rp->min_code <= 0x10ffff) {
1882 1.1 lukem h = (ucs2_t) (((rp->min_code - 0x10000) >> 10) + 0xd800);
1883 1.1 lukem l = (ucs2_t) (((rp->min_code - 0x10000) & 1023) + 0xdc00);
1884 1.1 lukem fprintf(out, "\\x%04hX\\x%04hX", h, l);
1885 1.1 lukem } else
1886 1.1 lukem fprintf(out, "\\x%04lX", rp->min_code & 0xffff);
1887 1.1 lukem if (rp->max_code != rp->min_code) {
1888 1.1 lukem putc('-', out);
1889 1.1 lukem if (rp->max_code >= 0x10000 &&
1890 1.1 lukem rp->max_code <= 0x10ffff) {
1891 1.1 lukem h = (ucs2_t) (((rp->max_code - 0x10000) >> 10) + 0xd800);
1892 1.1 lukem l = (ucs2_t) (((rp->max_code - 0x10000) & 1023) + 0xdc00);
1893 1.1 lukem fprintf(out, "\\x%04hX\\x%04hX", h, l);
1894 1.1 lukem } else
1895 1.1 lukem fprintf(out, "\\x%04lX", rp->max_code & 0xffff);
1896 1.1 lukem }
1897 1.1 lukem }
1898 1.1 lukem if (sym->sym.ccl.ranges_used > 0)
1899 1.1 lukem putc(']', out);
1900 1.1 lukem putc('\n', out);
1901 1.1 lukem }
1902 1.1 lukem }
1903 1.1 lukem
1904 1.1 lukem for (i = 0, sp = dfa->states; i < dfa->nstates; i++, sp++) {
1905 1.1 lukem fprintf(out, "S%hd = ", i);
1906 1.1 lukem if (sp->accepting) {
1907 1.1 lukem fprintf(out, "1 ");
1908 1.1 lukem if (sp->ntrans)
1909 1.1 lukem fprintf(out, "| ");
1910 1.1 lukem }
1911 1.1 lukem for (j = 0; j < sp->ntrans; j++) {
1912 1.1 lukem if (j > 0)
1913 1.1 lukem fprintf(out, "| ");
1914 1.1 lukem
1915 1.1 lukem sym = dfa->syms + sp->trans[j].symbol;
1916 1.1 lukem switch (sym->type) {
1917 1.1 lukem case _URE_CHAR:
1918 1.1 lukem if (0x10000 <= sym->sym.chr && sym->sym.chr <= 0x10ffff) {
1919 1.1 lukem /*
1920 1.1 lukem * Take care of UTF16 characters.
1921 1.1 lukem */
1922 1.1 lukem h = (ucs2_t) (((sym->sym.chr - 0x10000) >> 10) + 0xd800);
1923 1.1 lukem l = (ucs2_t) (((sym->sym.chr - 0x10000) & 1023) + 0xdc00);
1924 1.1 lukem fprintf(out, "\\x%04hX\\x%04hX ", h, l);
1925 1.1 lukem } else
1926 1.1 lukem fprintf(out, "\\x%04lX ", sym->sym.chr & 0xffff);
1927 1.1 lukem break;
1928 1.1 lukem case _URE_ANY_CHAR:
1929 1.1 lukem fprintf(out, "<any> ");
1930 1.1 lukem break;
1931 1.1 lukem case _URE_BOL_ANCHOR:
1932 1.1 lukem fprintf(out, "<bol-anchor> ");
1933 1.1 lukem break;
1934 1.1 lukem case _URE_EOL_ANCHOR:
1935 1.1 lukem fprintf(out, "<eol-anchor> ");
1936 1.1 lukem break;
1937 1.1 lukem case _URE_CCLASS:
1938 1.1 lukem case _URE_NCCLASS:
1939 1.1 lukem fprintf(out, "[C%hd] ", sym->id);
1940 1.1 lukem break;
1941 1.1 lukem }
1942 1.1 lukem fprintf(out, "S%hd", sp->trans[j].next_state);
1943 1.1 lukem if (j + 1 < sp->ntrans)
1944 1.1 lukem putc(' ', out);
1945 1.1 lukem }
1946 1.1 lukem putc('\n', out);
1947 1.1 lukem }
1948 1.1 lukem }
1949 1.1 lukem
1950 1.1 lukem #define _ure_issep(cc) ((cc) == '\n' || (cc) == '\r' || (cc) == 0x2028 ||\
1951 1.1 lukem (cc) == 0x2029)
1952 1.1 lukem
1953 1.1 lukem int
1954 1.1 lukem ure_exec(ure_dfa_t dfa, int flags, ucs2_t *text, unsigned long textlen,
1955 1.1 lukem unsigned long *match_start, unsigned long *match_end)
1956 1.1 lukem {
1957 1.1 lukem int i, j, matched, found, skip;
1958 1.1 lukem unsigned long ms, me;
1959 1.1 lukem ucs4_t c;
1960 1.1 lukem ucs2_t *sp, *ep, *lp;
1961 1.1 lukem _ure_dstate_t *stp;
1962 1.1 lukem _ure_symtab_t *sym;
1963 1.1 lukem _ure_range_t *rp;
1964 1.1 lukem
1965 1.1 lukem if (dfa == 0 || text == 0)
1966 1.1 lukem return 0;
1967 1.1 lukem
1968 1.1 lukem /*
1969 1.1 lukem * Handle the special case of an empty string matching the "^$" pattern.
1970 1.1 lukem */
1971 1.1 lukem if (textlen == 0 && (dfa->flags & _URE_DFA_BLANKLINE)) {
1972 1.1 lukem *match_start = *match_end = 0;
1973 1.1 lukem return 1;
1974 1.1 lukem }
1975 1.1 lukem
1976 1.1 lukem sp = text;
1977 1.1 lukem ep = sp + textlen;
1978 1.1 lukem
1979 1.1 lukem ms = me = ~0;
1980 1.1 lukem
1981 1.1 lukem stp = dfa->states;
1982 1.1 lukem
1983 1.1 lukem for (found = skip = 0; found == 0 && sp < ep; ) {
1984 1.1 lukem lp = sp;
1985 1.1 lukem c = *sp++;
1986 1.1 lukem
1987 1.1 lukem /*
1988 1.1 lukem * Check to see if this is a high surrogate that should be
1989 1.1 lukem * combined with a following low surrogate.
1990 1.1 lukem */
1991 1.1 lukem if (sp < ep && 0xd800 <= c && c <= 0xdbff &&
1992 1.1 lukem 0xdc00 <= *sp && *sp <= 0xdfff)
1993 1.1 lukem c = 0x10000 + (((c & 0x03ff) << 10) | (*sp++ & 0x03ff));
1994 1.1 lukem
1995 1.1 lukem /*
1996 1.1 lukem * Determine if the character is non-spacing and should be skipped.
1997 1.1 lukem */
1998 1.1 lukem if (_ure_matches_properties(_URE_NONSPACING, c) &&
1999 1.1 lukem (flags & URE_IGNORE_NONSPACING)) {
2000 1.1 lukem sp++;
2001 1.1 lukem continue;
2002 1.1 lukem }
2003 1.1 lukem
2004 1.1 lukem if (dfa->flags & _URE_DFA_CASEFOLD)
2005 1.1 lukem c = _ure_tolower(c);
2006 1.1 lukem
2007 1.1 lukem /*
2008 1.1 lukem * See if one of the transitions matches.
2009 1.1 lukem */
2010 1.1 lukem for (i = 0, matched = 0; matched == 0 && i < stp->ntrans; i++) {
2011 1.1 lukem sym = dfa->syms + stp->trans[i].symbol;
2012 1.1 lukem switch (sym->type) {
2013 1.1 lukem case _URE_ANY_CHAR:
2014 1.1 lukem if ((flags & URE_DOT_MATCHES_SEPARATORS) ||
2015 1.1 lukem !_ure_issep(c))
2016 1.1 lukem matched = 1;
2017 1.1 lukem break;
2018 1.1 lukem case _URE_CHAR:
2019 1.1 lukem if (c == sym->sym.chr)
2020 1.1 lukem matched = 1;
2021 1.1 lukem break;
2022 1.1 lukem case _URE_BOL_ANCHOR:
2023 1.1 lukem if (lp == text) {
2024 1.1 lukem sp = lp;
2025 1.1 lukem matched = 1;
2026 1.1 lukem } else if (_ure_issep(c)) {
2027 1.1 lukem if (c == '\r' && sp < ep && *sp == '\n')
2028 1.1 lukem sp++;
2029 1.1 lukem lp = sp;
2030 1.1 lukem matched = 1;
2031 1.1 lukem }
2032 1.1 lukem break;
2033 1.1 lukem case _URE_EOL_ANCHOR:
2034 1.1 lukem if (_ure_issep(c)) {
2035 1.1 lukem /*
2036 1.1 lukem * Put the pointer back before the separator so the match
2037 1.1 lukem * end position will be correct. This case will also
2038 1.1 lukem * cause the `sp' pointer to be advanced over the current
2039 1.1 lukem * separator once the match end point has been recorded.
2040 1.1 lukem */
2041 1.1 lukem sp = lp;
2042 1.1 lukem matched = 1;
2043 1.1 lukem }
2044 1.1 lukem break;
2045 1.1 lukem case _URE_CCLASS:
2046 1.1 lukem case _URE_NCCLASS:
2047 1.1 lukem if (sym->props != 0)
2048 1.1 lukem matched = _ure_matches_properties(sym->props, c);
2049 1.1 lukem for (j = 0, rp = sym->sym.ccl.ranges;
2050 1.1 lukem j < sym->sym.ccl.ranges_used; j++, rp++) {
2051 1.1 lukem if (rp->min_code <= c && c <= rp->max_code)
2052 1.1 lukem matched = 1;
2053 1.1 lukem }
2054 1.1 lukem if (sym->type == _URE_NCCLASS)
2055 1.1 lukem matched = !matched;
2056 1.1 lukem break;
2057 1.1 lukem }
2058 1.1 lukem
2059 1.1 lukem if (matched) {
2060 1.1 lukem if (ms == ~0UL)
2061 1.1 lukem ms = lp - text;
2062 1.1 lukem else
2063 1.1 lukem me = sp - text;
2064 1.1 lukem stp = dfa->states + stp->trans[i].next_state;
2065 1.1 lukem
2066 1.1 lukem /*
2067 1.1 lukem * If the match was an EOL anchor, adjust the pointer past the
2068 1.1 lukem * separator that caused the match. The correct match
2069 1.1 lukem * position has been recorded already.
2070 1.1 lukem */
2071 1.1 lukem if (sym->type == _URE_EOL_ANCHOR) {
2072 1.1 lukem /*
2073 1.1 lukem * Skip the character that caused the match.
2074 1.1 lukem */
2075 1.1 lukem sp++;
2076 1.1 lukem
2077 1.1 lukem /*
2078 1.1 lukem * Handle the infamous CRLF situation.
2079 1.1 lukem */
2080 1.1 lukem if (sp < ep && c == '\r' && *sp == '\n')
2081 1.1 lukem sp++;
2082 1.1 lukem }
2083 1.1 lukem }
2084 1.1 lukem }
2085 1.1 lukem
2086 1.1 lukem if (matched == 0) {
2087 1.1 lukem if (stp->accepting == 0) {
2088 1.1 lukem /*
2089 1.1 lukem * If the last state was not accepting, then reset
2090 1.1 lukem * and start over.
2091 1.1 lukem */
2092 1.1 lukem stp = dfa->states;
2093 1.1 lukem ms = me = ~0;
2094 1.1 lukem } else
2095 1.1 lukem /*
2096 1.1 lukem * The last state was accepting, so terminate the matching
2097 1.1 lukem * loop to avoid more work.
2098 1.1 lukem */
2099 1.1 lukem found = 1;
2100 1.1 lukem } else if (sp == ep) {
2101 1.1 lukem if (!stp->accepting) {
2102 1.1 lukem /*
2103 1.1 lukem * This ugly hack is to make sure the end-of-line anchors
2104 1.1 lukem * match when the source text hits the end. This is only done
2105 1.1 lukem * if the last subexpression matches.
2106 1.1 lukem */
2107 1.1 lukem for (i = 0; found == 0 && i < stp->ntrans; i++) {
2108 1.1 lukem sym = dfa->syms + stp->trans[i].symbol;
2109 1.1 lukem if (sym->type ==_URE_EOL_ANCHOR) {
2110 1.1 lukem stp = dfa->states + stp->trans[i].next_state;
2111 1.1 lukem if (stp->accepting) {
2112 1.1 lukem me = sp - text;
2113 1.1 lukem found = 1;
2114 1.1 lukem } else
2115 1.1 lukem break;
2116 1.1 lukem }
2117 1.1 lukem }
2118 1.1 lukem } else {
2119 1.1 lukem /*
2120 1.1 lukem * Make sure any conditions that match all the way to the end
2121 1.1 lukem * of the string match.
2122 1.1 lukem */
2123 1.1 lukem found = 1;
2124 1.1 lukem me = sp - text;
2125 1.1 lukem }
2126 1.1 lukem }
2127 1.1 lukem }
2128 1.1 lukem
2129 1.1 lukem if (found == 0)
2130 1.1 lukem ms = me = ~0;
2131 1.1 lukem
2132 1.1 lukem *match_start = ms;
2133 1.1 lukem *match_end = me;
2134 1.1 lukem
2135 1.1 lukem return (ms != ~0UL) ? 1 : 0;
2136 1.1 lukem }
2137