ure.c revision 1.1.1.3.12.1 1 1.1.1.2 lukem /* $NetBSD: ure.c,v 1.1.1.3.12.1 2014/08/19 23:52:00 tls Exp $ */
2 1.1.1.2 lukem
3 1.1.1.3.12.1 tls /* $OpenLDAP$ */
4 1.1 lukem /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
5 1.1 lukem *
6 1.1.1.3.12.1 tls * Copyright 1998-2014 The OpenLDAP Foundation.
7 1.1 lukem * All rights reserved.
8 1.1 lukem *
9 1.1 lukem * Redistribution and use in source and binary forms, with or without
10 1.1 lukem * modification, are permitted only as authorized by the OpenLDAP
11 1.1 lukem * Public License.
12 1.1 lukem *
13 1.1 lukem * A copy of this license is available in file LICENSE in the
14 1.1 lukem * top-level directory of the distribution or, alternatively, at
15 1.1 lukem * <http://www.OpenLDAP.org/license.html>.
16 1.1 lukem */
17 1.1 lukem /* Copyright 1997, 1998, 1999 Computing Research Labs,
18 1.1 lukem * New Mexico State University
19 1.1 lukem *
20 1.1 lukem * Permission is hereby granted, free of charge, to any person obtaining a
21 1.1 lukem * copy of this software and associated documentation files (the "Software"),
22 1.1 lukem * to deal in the Software without restriction, including without limitation
23 1.1 lukem * the rights to use, copy, modify, merge, publish, distribute, sublicense,
24 1.1 lukem * and/or sell copies of the Software, and to permit persons to whom the
25 1.1 lukem * Software is furnished to do so, subject to the following conditions:
26 1.1 lukem *
27 1.1 lukem * The above copyright notice and this permission notice shall be included in
28 1.1 lukem * all copies or substantial portions of the Software.
29 1.1 lukem *
30 1.1 lukem * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
31 1.1 lukem * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
32 1.1 lukem * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
33 1.1 lukem * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
34 1.1 lukem * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
35 1.1 lukem * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
36 1.1 lukem * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
37 1.1 lukem */
38 1.1.1.3.12.1 tls /* Id: ure.c,v 1.2 1999/09/21 15:47:43 mleisher Exp " */
39 1.1 lukem
40 1.1 lukem #include "portable.h"
41 1.1 lukem
42 1.1 lukem #include <ac/stdlib.h>
43 1.1 lukem #include <ac/string.h>
44 1.1 lukem #include <ac/unistd.h>
45 1.1 lukem
46 1.1 lukem #include "ure.h"
47 1.1 lukem
48 1.1 lukem /*
49 1.1 lukem * Flags used internally in the DFA.
50 1.1 lukem */
51 1.1 lukem #define _URE_DFA_CASEFOLD 0x01
52 1.1 lukem #define _URE_DFA_BLANKLINE 0x02
53 1.1 lukem
54 1.1 lukem static unsigned long cclass_flags[] = {
55 1.1 lukem 0,
56 1.1 lukem _URE_NONSPACING,
57 1.1 lukem _URE_COMBINING,
58 1.1 lukem _URE_NUMDIGIT,
59 1.1 lukem _URE_NUMOTHER,
60 1.1 lukem _URE_SPACESEP,
61 1.1 lukem _URE_LINESEP,
62 1.1 lukem _URE_PARASEP,
63 1.1 lukem _URE_CNTRL,
64 1.1 lukem _URE_PUA,
65 1.1 lukem _URE_UPPER,
66 1.1 lukem _URE_LOWER,
67 1.1 lukem _URE_TITLE,
68 1.1 lukem _URE_MODIFIER,
69 1.1 lukem _URE_OTHERLETTER,
70 1.1 lukem _URE_DASHPUNCT,
71 1.1 lukem _URE_OPENPUNCT,
72 1.1 lukem _URE_CLOSEPUNCT,
73 1.1 lukem _URE_OTHERPUNCT,
74 1.1 lukem _URE_MATHSYM,
75 1.1 lukem _URE_CURRENCYSYM,
76 1.1 lukem _URE_OTHERSYM,
77 1.1 lukem _URE_LTR,
78 1.1 lukem _URE_RTL,
79 1.1 lukem _URE_EURONUM,
80 1.1 lukem _URE_EURONUMSEP,
81 1.1 lukem _URE_EURONUMTERM,
82 1.1 lukem _URE_ARABNUM,
83 1.1 lukem _URE_COMMONSEP,
84 1.1 lukem _URE_BLOCKSEP,
85 1.1 lukem _URE_SEGMENTSEP,
86 1.1 lukem _URE_WHITESPACE,
87 1.1 lukem _URE_OTHERNEUT,
88 1.1 lukem };
89 1.1 lukem
90 1.1 lukem /*
91 1.1 lukem * Symbol types for the DFA.
92 1.1 lukem */
93 1.1 lukem #define _URE_ANY_CHAR 1
94 1.1 lukem #define _URE_CHAR 2
95 1.1 lukem #define _URE_CCLASS 3
96 1.1 lukem #define _URE_NCCLASS 4
97 1.1 lukem #define _URE_BOL_ANCHOR 5
98 1.1 lukem #define _URE_EOL_ANCHOR 6
99 1.1 lukem
100 1.1 lukem /*
101 1.1 lukem * Op codes for converting the NFA to a DFA.
102 1.1 lukem */
103 1.1 lukem #define _URE_SYMBOL 10
104 1.1 lukem #define _URE_PAREN 11
105 1.1 lukem #define _URE_QUEST 12
106 1.1 lukem #define _URE_STAR 13
107 1.1 lukem #define _URE_PLUS 14
108 1.1 lukem #define _URE_ONE 15
109 1.1 lukem #define _URE_AND 16
110 1.1 lukem #define _URE_OR 17
111 1.1 lukem
112 1.1 lukem #define _URE_NOOP 0xffff
113 1.1 lukem
114 1.1 lukem #define _URE_REGSTART 0x8000
115 1.1 lukem #define _URE_REGEND 0x4000
116 1.1 lukem
117 1.1 lukem /*
118 1.1 lukem * Structure used to handle a compacted range of characters.
119 1.1 lukem */
120 1.1 lukem typedef struct {
121 1.1 lukem ucs4_t min_code;
122 1.1 lukem ucs4_t max_code;
123 1.1 lukem } _ure_range_t;
124 1.1 lukem
125 1.1 lukem typedef struct {
126 1.1 lukem _ure_range_t *ranges;
127 1.1 lukem ucs2_t ranges_used;
128 1.1 lukem ucs2_t ranges_size;
129 1.1 lukem } _ure_ccl_t;
130 1.1 lukem
131 1.1 lukem typedef union {
132 1.1 lukem ucs4_t chr;
133 1.1 lukem _ure_ccl_t ccl;
134 1.1 lukem } _ure_sym_t;
135 1.1 lukem
136 1.1 lukem /*
137 1.1 lukem * This is a general element structure used for expressions and stack
138 1.1 lukem * elements.
139 1.1 lukem */
140 1.1 lukem typedef struct {
141 1.1 lukem ucs2_t reg;
142 1.1 lukem ucs2_t onstack;
143 1.1 lukem ucs2_t type;
144 1.1 lukem ucs2_t lhs;
145 1.1 lukem ucs2_t rhs;
146 1.1 lukem } _ure_elt_t;
147 1.1 lukem
148 1.1 lukem /*
149 1.1 lukem * This is a structure used to track a list or a stack of states.
150 1.1 lukem */
151 1.1 lukem typedef struct {
152 1.1 lukem ucs2_t *slist;
153 1.1 lukem ucs2_t slist_size;
154 1.1 lukem ucs2_t slist_used;
155 1.1 lukem } _ure_stlist_t;
156 1.1 lukem
157 1.1 lukem /*
158 1.1 lukem * Structure to track the list of unique states for a symbol
159 1.1 lukem * during reduction.
160 1.1 lukem */
161 1.1 lukem typedef struct {
162 1.1 lukem ucs2_t id;
163 1.1 lukem ucs2_t type;
164 1.1 lukem unsigned long mods;
165 1.1 lukem unsigned long props;
166 1.1 lukem _ure_sym_t sym;
167 1.1 lukem _ure_stlist_t states;
168 1.1 lukem } _ure_symtab_t;
169 1.1 lukem
170 1.1 lukem /*
171 1.1 lukem * Structure to hold a single state.
172 1.1 lukem */
173 1.1 lukem typedef struct {
174 1.1 lukem ucs2_t id;
175 1.1 lukem ucs2_t accepting;
176 1.1 lukem ucs2_t pad;
177 1.1 lukem _ure_stlist_t st;
178 1.1 lukem _ure_elt_t *trans;
179 1.1 lukem ucs2_t trans_size;
180 1.1 lukem ucs2_t trans_used;
181 1.1 lukem } _ure_state_t;
182 1.1 lukem
183 1.1 lukem /*
184 1.1 lukem * Structure used for keeping lists of states.
185 1.1 lukem */
186 1.1 lukem typedef struct {
187 1.1 lukem _ure_state_t *states;
188 1.1 lukem ucs2_t states_size;
189 1.1 lukem ucs2_t states_used;
190 1.1 lukem } _ure_statetable_t;
191 1.1 lukem
192 1.1 lukem /*
193 1.1 lukem * Structure to track pairs of DFA states when equivalent states are
194 1.1 lukem * merged.
195 1.1 lukem */
196 1.1 lukem typedef struct {
197 1.1 lukem ucs2_t l;
198 1.1 lukem ucs2_t r;
199 1.1 lukem } _ure_equiv_t;
200 1.1 lukem
201 1.1 lukem /*
202 1.1 lukem * Structure used for constructing the NFA and reducing to a minimal DFA.
203 1.1 lukem */
204 1.1 lukem typedef struct _ure_buffer_t {
205 1.1 lukem int reducing;
206 1.1 lukem int error;
207 1.1 lukem unsigned long flags;
208 1.1 lukem
209 1.1 lukem _ure_stlist_t stack;
210 1.1 lukem
211 1.1 lukem /*
212 1.1 lukem * Table of unique symbols encountered.
213 1.1 lukem */
214 1.1 lukem _ure_symtab_t *symtab;
215 1.1 lukem ucs2_t symtab_size;
216 1.1 lukem ucs2_t symtab_used;
217 1.1 lukem
218 1.1 lukem /*
219 1.1 lukem * Tracks the unique expressions generated for the NFA and when the NFA is
220 1.1 lukem * reduced.
221 1.1 lukem */
222 1.1 lukem _ure_elt_t *expr;
223 1.1 lukem ucs2_t expr_used;
224 1.1 lukem ucs2_t expr_size;
225 1.1 lukem
226 1.1 lukem /*
227 1.1 lukem * The reduced table of unique groups of NFA states.
228 1.1 lukem */
229 1.1 lukem _ure_statetable_t states;
230 1.1 lukem
231 1.1 lukem /*
232 1.1 lukem * Tracks states when equivalent states are merged.
233 1.1 lukem */
234 1.1 lukem _ure_equiv_t *equiv;
235 1.1 lukem ucs2_t equiv_used;
236 1.1 lukem ucs2_t equiv_size;
237 1.1 lukem } _ure_buffer_t;
238 1.1 lukem
239 1.1 lukem typedef struct {
240 1.1 lukem ucs2_t symbol;
241 1.1 lukem ucs2_t next_state;
242 1.1 lukem } _ure_trans_t;
243 1.1 lukem
244 1.1 lukem typedef struct {
245 1.1 lukem ucs2_t accepting;
246 1.1 lukem ucs2_t ntrans;
247 1.1 lukem _ure_trans_t *trans;
248 1.1 lukem } _ure_dstate_t;
249 1.1 lukem
250 1.1 lukem typedef struct _ure_dfa_t {
251 1.1 lukem unsigned long flags;
252 1.1 lukem
253 1.1 lukem _ure_symtab_t *syms;
254 1.1 lukem ucs2_t nsyms;
255 1.1 lukem
256 1.1 lukem _ure_dstate_t *states;
257 1.1 lukem ucs2_t nstates;
258 1.1 lukem
259 1.1 lukem _ure_trans_t *trans;
260 1.1 lukem ucs2_t ntrans;
261 1.1 lukem } _ure_dfa_t;
262 1.1 lukem
263 1.1 lukem /*************************************************************************
264 1.1 lukem *
265 1.1 lukem * Functions.
266 1.1 lukem *
267 1.1 lukem *************************************************************************/
268 1.1 lukem
269 1.1 lukem static void
270 1.1 lukem _ure_memmove(char *dest, char *src, unsigned long bytes)
271 1.1 lukem {
272 1.1 lukem long i, j;
273 1.1 lukem
274 1.1 lukem i = (long) bytes;
275 1.1 lukem j = i & 7;
276 1.1 lukem i = (i + 7) >> 3;
277 1.1 lukem
278 1.1 lukem /*
279 1.1 lukem * Do a memmove using Ye Olde Duff's Device for efficiency.
280 1.1 lukem */
281 1.1 lukem if (src < dest) {
282 1.1 lukem src += bytes;
283 1.1 lukem dest += bytes;
284 1.1 lukem
285 1.1 lukem switch (j) {
286 1.1 lukem case 0: do {
287 1.1 lukem *--dest = *--src;
288 1.1 lukem case 7: *--dest = *--src;
289 1.1 lukem case 6: *--dest = *--src;
290 1.1 lukem case 5: *--dest = *--src;
291 1.1 lukem case 4: *--dest = *--src;
292 1.1 lukem case 3: *--dest = *--src;
293 1.1 lukem case 2: *--dest = *--src;
294 1.1 lukem case 1: *--dest = *--src;
295 1.1 lukem } while (--i > 0);
296 1.1 lukem }
297 1.1 lukem } else if (src > dest) {
298 1.1 lukem switch (j) {
299 1.1 lukem case 0: do {
300 1.1 lukem *dest++ = *src++;
301 1.1 lukem case 7: *dest++ = *src++;
302 1.1 lukem case 6: *dest++ = *src++;
303 1.1 lukem case 5: *dest++ = *src++;
304 1.1 lukem case 4: *dest++ = *src++;
305 1.1 lukem case 3: *dest++ = *src++;
306 1.1 lukem case 2: *dest++ = *src++;
307 1.1 lukem case 1: *dest++ = *src++;
308 1.1 lukem } while (--i > 0);
309 1.1 lukem }
310 1.1 lukem }
311 1.1 lukem }
312 1.1 lukem
313 1.1 lukem static void
314 1.1 lukem _ure_push(ucs2_t v, _ure_buffer_t *b)
315 1.1 lukem {
316 1.1 lukem _ure_stlist_t *s;
317 1.1 lukem
318 1.1 lukem if (b == 0)
319 1.1 lukem return;
320 1.1 lukem
321 1.1 lukem /*
322 1.1 lukem * If the `reducing' parameter is non-zero, check to see if the value
323 1.1 lukem * passed is already on the stack.
324 1.1 lukem */
325 1.1 lukem if (b->reducing != 0 && b->expr[v].onstack != 0)
326 1.1 lukem return;
327 1.1 lukem
328 1.1 lukem s = &b->stack;
329 1.1 lukem if (s->slist_used == s->slist_size) {
330 1.1 lukem if (s->slist_size == 0)
331 1.1 lukem s->slist = (ucs2_t *) malloc(sizeof(ucs2_t) << 3);
332 1.1 lukem else
333 1.1 lukem s->slist = (ucs2_t *) realloc((char *) s->slist,
334 1.1 lukem sizeof(ucs2_t) * (s->slist_size + 8));
335 1.1 lukem s->slist_size += 8;
336 1.1 lukem }
337 1.1 lukem s->slist[s->slist_used++] = v;
338 1.1 lukem
339 1.1 lukem /*
340 1.1 lukem * If the `reducing' parameter is non-zero, flag the element as being on
341 1.1 lukem * the stack.
342 1.1 lukem */
343 1.1 lukem if (b->reducing != 0)
344 1.1 lukem b->expr[v].onstack = 1;
345 1.1 lukem }
346 1.1 lukem
347 1.1 lukem static ucs2_t
348 1.1 lukem _ure_peek(_ure_buffer_t *b)
349 1.1 lukem {
350 1.1 lukem if (b == 0 || b->stack.slist_used == 0)
351 1.1 lukem return _URE_NOOP;
352 1.1 lukem
353 1.1 lukem return b->stack.slist[b->stack.slist_used - 1];
354 1.1 lukem }
355 1.1 lukem
356 1.1 lukem static ucs2_t
357 1.1 lukem _ure_pop(_ure_buffer_t *b)
358 1.1 lukem {
359 1.1 lukem ucs2_t v;
360 1.1 lukem
361 1.1 lukem if (b == 0 || b->stack.slist_used == 0)
362 1.1 lukem return _URE_NOOP;
363 1.1 lukem
364 1.1 lukem v = b->stack.slist[--b->stack.slist_used];
365 1.1 lukem if (b->reducing)
366 1.1 lukem b->expr[v].onstack = 0;
367 1.1 lukem
368 1.1 lukem return v;
369 1.1 lukem }
370 1.1 lukem
371 1.1 lukem /*************************************************************************
372 1.1 lukem *
373 1.1 lukem * Start symbol parse functions.
374 1.1 lukem *
375 1.1 lukem *************************************************************************/
376 1.1 lukem
377 1.1 lukem /*
378 1.1 lukem * Parse a comma-separated list of integers that represent character
379 1.1 lukem * properties. Combine them into a mask that is returned in the `mask'
380 1.1 lukem * variable, and return the number of characters consumed.
381 1.1 lukem */
382 1.1 lukem static unsigned long
383 1.1 lukem _ure_prop_list(ucs2_t *pp, unsigned long limit, unsigned long *mask,
384 1.1 lukem _ure_buffer_t *b)
385 1.1 lukem {
386 1.1 lukem unsigned long n, m;
387 1.1 lukem ucs2_t *sp, *ep;
388 1.1 lukem
389 1.1 lukem sp = pp;
390 1.1 lukem ep = sp + limit;
391 1.1 lukem
392 1.1 lukem for (m = n = 0; b->error == _URE_OK && sp < ep; sp++) {
393 1.1 lukem if (*sp == ',') {
394 1.1 lukem /*
395 1.1 lukem * Encountered a comma, so select the next character property flag
396 1.1 lukem * and reset the number.
397 1.1 lukem */
398 1.1 lukem m |= cclass_flags[n];
399 1.1 lukem n = 0;
400 1.1 lukem } else if (*sp >= '0' && *sp <= '9')
401 1.1 lukem /*
402 1.1 lukem * Encountered a digit, so start or continue building the cardinal
403 1.1 lukem * that represents the character property flag.
404 1.1 lukem */
405 1.1 lukem n = (n * 10) + (*sp - '0');
406 1.1 lukem else
407 1.1 lukem /*
408 1.1 lukem * Encountered something that is not part of the property list.
409 1.1 lukem * Indicate that we are done.
410 1.1 lukem */
411 1.1 lukem break;
412 1.1 lukem
413 1.1 lukem /*
414 1.1 lukem * If a property number greater than 32 occurs, then there is a
415 1.1 lukem * problem. Most likely a missing comma separator.
416 1.1 lukem */
417 1.1 lukem if (n > 32)
418 1.1 lukem b->error = _URE_INVALID_PROPERTY;
419 1.1 lukem }
420 1.1 lukem
421 1.1 lukem if (n != 0)
422 1.1 lukem m |= cclass_flags[n];
423 1.1 lukem
424 1.1 lukem /*
425 1.1 lukem * Set the mask that represents the group of character properties.
426 1.1 lukem */
427 1.1 lukem *mask = m;
428 1.1 lukem
429 1.1 lukem /*
430 1.1 lukem * Return the number of characters consumed.
431 1.1 lukem */
432 1.1 lukem return sp - pp;
433 1.1 lukem }
434 1.1 lukem
435 1.1 lukem /*
436 1.1 lukem * Collect a hex number with 1 to 4 digits and return the number
437 1.1 lukem * of characters used.
438 1.1 lukem */
439 1.1 lukem static unsigned long
440 1.1 lukem _ure_hex(ucs2_t *np, unsigned long limit, ucs4_t *n)
441 1.1 lukem {
442 1.1 lukem ucs2_t i;
443 1.1 lukem ucs2_t *sp, *ep;
444 1.1 lukem ucs4_t nn;
445 1.1 lukem
446 1.1 lukem sp = np;
447 1.1 lukem ep = sp + limit;
448 1.1 lukem
449 1.1 lukem for (nn = 0, i = 0; i < 4 && sp < ep; i++, sp++) {
450 1.1 lukem if (*sp >= '0' && *sp <= '9')
451 1.1 lukem nn = (nn << 4) + (*sp - '0');
452 1.1 lukem else if (*sp >= 'A' && *sp <= 'F')
453 1.1 lukem nn = (nn << 4) + ((*sp - 'A') + 10);
454 1.1 lukem else if (*sp >= 'a' && *sp <= 'f')
455 1.1 lukem nn = (nn << 4) + ((*sp - 'a') + 10);
456 1.1 lukem else
457 1.1 lukem /*
458 1.1 lukem * Encountered something that is not a hex digit.
459 1.1 lukem */
460 1.1 lukem break;
461 1.1 lukem }
462 1.1 lukem
463 1.1 lukem /*
464 1.1 lukem * Assign the character code collected and return the number of
465 1.1 lukem * characters used.
466 1.1 lukem */
467 1.1 lukem *n = nn;
468 1.1 lukem
469 1.1 lukem return sp - np;
470 1.1 lukem }
471 1.1 lukem
472 1.1 lukem /*
473 1.1 lukem * Insert a range into a character class, removing duplicates and ordering
474 1.1 lukem * them in increasing range-start order.
475 1.1 lukem */
476 1.1 lukem static void
477 1.1 lukem _ure_add_range(_ure_ccl_t *ccl, _ure_range_t *r, _ure_buffer_t *b)
478 1.1 lukem {
479 1.1 lukem ucs2_t i;
480 1.1 lukem ucs4_t tmp;
481 1.1 lukem _ure_range_t *rp;
482 1.1 lukem
483 1.1 lukem /*
484 1.1 lukem * If the `casefold' flag is set, then make sure both endpoints of the
485 1.1 lukem * range are converted to lower case.
486 1.1 lukem */
487 1.1 lukem if (b->flags & _URE_DFA_CASEFOLD) {
488 1.1 lukem r->min_code = _ure_tolower(r->min_code);
489 1.1 lukem r->max_code = _ure_tolower(r->max_code);
490 1.1 lukem }
491 1.1 lukem
492 1.1 lukem /*
493 1.1 lukem * Swap the range endpoints if they are not in increasing order.
494 1.1 lukem */
495 1.1 lukem if (r->min_code > r->max_code) {
496 1.1 lukem tmp = r->min_code;
497 1.1 lukem r->min_code = r->max_code;
498 1.1 lukem r->max_code = tmp;
499 1.1 lukem }
500 1.1 lukem
501 1.1 lukem for (i = 0, rp = ccl->ranges;
502 1.1 lukem i < ccl->ranges_used && r->min_code < rp->min_code; i++, rp++) ;
503 1.1 lukem
504 1.1 lukem /*
505 1.1 lukem * Check for a duplicate.
506 1.1 lukem */
507 1.1 lukem if (i < ccl->ranges_used &&
508 1.1 lukem r->min_code == rp->min_code && r->max_code == rp->max_code)
509 1.1 lukem return;
510 1.1 lukem
511 1.1 lukem if (ccl->ranges_used == ccl->ranges_size) {
512 1.1 lukem if (ccl->ranges_size == 0)
513 1.1 lukem ccl->ranges = (_ure_range_t *) malloc(sizeof(_ure_range_t) << 3);
514 1.1 lukem else
515 1.1 lukem ccl->ranges = (_ure_range_t *)
516 1.1 lukem realloc((char *) ccl->ranges,
517 1.1 lukem sizeof(_ure_range_t) * (ccl->ranges_size + 8));
518 1.1 lukem ccl->ranges_size += 8;
519 1.1 lukem }
520 1.1 lukem
521 1.1 lukem rp = ccl->ranges + ccl->ranges_used;
522 1.1 lukem
523 1.1 lukem if (i < ccl->ranges_used)
524 1.1 lukem _ure_memmove((char *) (rp + 1), (char *) rp,
525 1.1 lukem sizeof(_ure_range_t) * (ccl->ranges_used - i));
526 1.1 lukem
527 1.1 lukem ccl->ranges_used++;
528 1.1 lukem rp->min_code = r->min_code;
529 1.1 lukem rp->max_code = r->max_code;
530 1.1 lukem }
531 1.1 lukem
532 1.1 lukem #define _URE_ALPHA_MASK (_URE_UPPER|_URE_LOWER|_URE_OTHERLETTER|\
533 1.1 lukem _URE_MODIFIER|_URE_TITLE|_URE_NONSPACING|_URE_COMBINING)
534 1.1 lukem #define _URE_ALNUM_MASK (_URE_ALPHA_MASK|_URE_NUMDIGIT)
535 1.1 lukem #define _URE_PUNCT_MASK (_URE_DASHPUNCT|_URE_OPENPUNCT|_URE_CLOSEPUNCT|\
536 1.1 lukem _URE_OTHERPUNCT)
537 1.1 lukem #define _URE_GRAPH_MASK (_URE_NUMDIGIT|_URE_NUMOTHER|_URE_ALPHA_MASK|\
538 1.1 lukem _URE_MATHSYM|_URE_CURRENCYSYM|_URE_OTHERSYM)
539 1.1 lukem #define _URE_PRINT_MASK (_URE_GRAPH_MASK|_URE_SPACESEP)
540 1.1 lukem #define _URE_SPACE_MASK (_URE_SPACESEP|_URE_LINESEP|_URE_PARASEP)
541 1.1 lukem
542 1.1 lukem typedef void (*_ure_cclsetup_t)(
543 1.1 lukem _ure_symtab_t *sym,
544 1.1 lukem unsigned long mask,
545 1.1 lukem _ure_buffer_t *b
546 1.1 lukem );
547 1.1 lukem
548 1.1 lukem typedef struct {
549 1.1 lukem ucs2_t key;
550 1.1 lukem unsigned long len;
551 1.1 lukem unsigned long next;
552 1.1 lukem _ure_cclsetup_t func;
553 1.1 lukem unsigned long mask;
554 1.1 lukem } _ure_trie_t;
555 1.1 lukem
556 1.1 lukem static void
557 1.1 lukem _ure_ccl_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b)
558 1.1 lukem {
559 1.1 lukem sym->props |= mask;
560 1.1 lukem }
561 1.1 lukem
562 1.1 lukem static void
563 1.1 lukem _ure_space_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b)
564 1.1 lukem {
565 1.1 lukem _ure_range_t range;
566 1.1 lukem
567 1.1 lukem sym->props |= mask;
568 1.1 lukem
569 1.1 lukem /*
570 1.1 lukem * Add the additional characters needed for handling isspace().
571 1.1 lukem */
572 1.1 lukem range.min_code = range.max_code = '\t';
573 1.1 lukem _ure_add_range(&sym->sym.ccl, &range, b);
574 1.1 lukem range.min_code = range.max_code = '\r';
575 1.1 lukem _ure_add_range(&sym->sym.ccl, &range, b);
576 1.1 lukem range.min_code = range.max_code = '\n';
577 1.1 lukem _ure_add_range(&sym->sym.ccl, &range, b);
578 1.1 lukem range.min_code = range.max_code = '\f';
579 1.1 lukem _ure_add_range(&sym->sym.ccl, &range, b);
580 1.1 lukem range.min_code = range.max_code = 0xfeff;
581 1.1 lukem _ure_add_range(&sym->sym.ccl, &range, b);
582 1.1 lukem }
583 1.1 lukem
584 1.1 lukem static void
585 1.1 lukem _ure_xdigit_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b)
586 1.1 lukem {
587 1.1 lukem _ure_range_t range;
588 1.1 lukem
589 1.1 lukem /*
590 1.1 lukem * Add the additional characters needed for handling isxdigit().
591 1.1 lukem */
592 1.1 lukem range.min_code = '0';
593 1.1 lukem range.max_code = '9';
594 1.1 lukem _ure_add_range(&sym->sym.ccl, &range, b);
595 1.1 lukem range.min_code = 'A';
596 1.1 lukem range.max_code = 'F';
597 1.1 lukem _ure_add_range(&sym->sym.ccl, &range, b);
598 1.1 lukem range.min_code = 'a';
599 1.1 lukem range.max_code = 'f';
600 1.1 lukem _ure_add_range(&sym->sym.ccl, &range, b);
601 1.1 lukem }
602 1.1 lukem
603 1.1 lukem static _ure_trie_t cclass_trie[] = {
604 1.1 lukem {0x003a, 1, 1, 0, 0},
605 1.1 lukem {0x0061, 9, 10, 0, 0},
606 1.1 lukem {0x0063, 8, 19, 0, 0},
607 1.1 lukem {0x0064, 7, 24, 0, 0},
608 1.1 lukem {0x0067, 6, 29, 0, 0},
609 1.1 lukem {0x006c, 5, 34, 0, 0},
610 1.1 lukem {0x0070, 4, 39, 0, 0},
611 1.1 lukem {0x0073, 3, 49, 0, 0},
612 1.1 lukem {0x0075, 2, 54, 0, 0},
613 1.1 lukem {0x0078, 1, 59, 0, 0},
614 1.1 lukem {0x006c, 1, 11, 0, 0},
615 1.1 lukem {0x006e, 2, 13, 0, 0},
616 1.1 lukem {0x0070, 1, 16, 0, 0},
617 1.1 lukem {0x0075, 1, 14, 0, 0},
618 1.1 lukem {0x006d, 1, 15, 0, 0},
619 1.1 lukem {0x003a, 1, 16, _ure_ccl_setup, _URE_ALNUM_MASK},
620 1.1 lukem {0x0068, 1, 17, 0, 0},
621 1.1 lukem {0x0061, 1, 18, 0, 0},
622 1.1 lukem {0x003a, 1, 19, _ure_ccl_setup, _URE_ALPHA_MASK},
623 1.1 lukem {0x006e, 1, 20, 0, 0},
624 1.1 lukem {0x0074, 1, 21, 0, 0},
625 1.1 lukem {0x0072, 1, 22, 0, 0},
626 1.1 lukem {0x006c, 1, 23, 0, 0},
627 1.1 lukem {0x003a, 1, 24, _ure_ccl_setup, _URE_CNTRL},
628 1.1 lukem {0x0069, 1, 25, 0, 0},
629 1.1 lukem {0x0067, 1, 26, 0, 0},
630 1.1 lukem {0x0069, 1, 27, 0, 0},
631 1.1 lukem {0x0074, 1, 28, 0, 0},
632 1.1 lukem {0x003a, 1, 29, _ure_ccl_setup, _URE_NUMDIGIT},
633 1.1 lukem {0x0072, 1, 30, 0, 0},
634 1.1 lukem {0x0061, 1, 31, 0, 0},
635 1.1 lukem {0x0070, 1, 32, 0, 0},
636 1.1 lukem {0x0068, 1, 33, 0, 0},
637 1.1 lukem {0x003a, 1, 34, _ure_ccl_setup, _URE_GRAPH_MASK},
638 1.1 lukem {0x006f, 1, 35, 0, 0},
639 1.1 lukem {0x0077, 1, 36, 0, 0},
640 1.1 lukem {0x0065, 1, 37, 0, 0},
641 1.1 lukem {0x0072, 1, 38, 0, 0},
642 1.1 lukem {0x003a, 1, 39, _ure_ccl_setup, _URE_LOWER},
643 1.1 lukem {0x0072, 2, 41, 0, 0},
644 1.1 lukem {0x0075, 1, 45, 0, 0},
645 1.1 lukem {0x0069, 1, 42, 0, 0},
646 1.1 lukem {0x006e, 1, 43, 0, 0},
647 1.1 lukem {0x0074, 1, 44, 0, 0},
648 1.1 lukem {0x003a, 1, 45, _ure_ccl_setup, _URE_PRINT_MASK},
649 1.1 lukem {0x006e, 1, 46, 0, 0},
650 1.1 lukem {0x0063, 1, 47, 0, 0},
651 1.1 lukem {0x0074, 1, 48, 0, 0},
652 1.1 lukem {0x003a, 1, 49, _ure_ccl_setup, _URE_PUNCT_MASK},
653 1.1 lukem {0x0070, 1, 50, 0, 0},
654 1.1 lukem {0x0061, 1, 51, 0, 0},
655 1.1 lukem {0x0063, 1, 52, 0, 0},
656 1.1 lukem {0x0065, 1, 53, 0, 0},
657 1.1 lukem {0x003a, 1, 54, _ure_space_setup, _URE_SPACE_MASK},
658 1.1 lukem {0x0070, 1, 55, 0, 0},
659 1.1 lukem {0x0070, 1, 56, 0, 0},
660 1.1 lukem {0x0065, 1, 57, 0, 0},
661 1.1 lukem {0x0072, 1, 58, 0, 0},
662 1.1 lukem {0x003a, 1, 59, _ure_ccl_setup, _URE_UPPER},
663 1.1 lukem {0x0064, 1, 60, 0, 0},
664 1.1 lukem {0x0069, 1, 61, 0, 0},
665 1.1 lukem {0x0067, 1, 62, 0, 0},
666 1.1 lukem {0x0069, 1, 63, 0, 0},
667 1.1 lukem {0x0074, 1, 64, 0, 0},
668 1.1 lukem {0x003a, 1, 65, _ure_xdigit_setup, 0},
669 1.1 lukem };
670 1.1 lukem
671 1.1 lukem /*
672 1.1 lukem * Probe for one of the POSIX colon delimited character classes in the static
673 1.1 lukem * trie.
674 1.1 lukem */
675 1.1 lukem static unsigned long
676 1.1 lukem _ure_posix_ccl(ucs2_t *cp, unsigned long limit, _ure_symtab_t *sym,
677 1.1 lukem _ure_buffer_t *b)
678 1.1 lukem {
679 1.1 lukem int i;
680 1.1 lukem unsigned long n;
681 1.1 lukem _ure_trie_t *tp;
682 1.1 lukem ucs2_t *sp, *ep;
683 1.1 lukem
684 1.1 lukem /*
685 1.1 lukem * If the number of characters left is less than 7, then this cannot be
686 1.1 lukem * interpreted as one of the colon delimited classes.
687 1.1 lukem */
688 1.1 lukem if (limit < 7)
689 1.1 lukem return 0;
690 1.1 lukem
691 1.1 lukem sp = cp;
692 1.1 lukem ep = sp + limit;
693 1.1 lukem tp = cclass_trie;
694 1.1 lukem for (i = 0; sp < ep && i < 8; i++, sp++) {
695 1.1 lukem n = tp->len;
696 1.1 lukem
697 1.1 lukem for (; n > 0 && tp->key != *sp; tp++, n--) ;
698 1.1 lukem
699 1.1 lukem if (n == 0)
700 1.1 lukem return 0;
701 1.1 lukem
702 1.1 lukem if (*sp == ':' && (i == 6 || i == 7)) {
703 1.1 lukem sp++;
704 1.1 lukem break;
705 1.1 lukem }
706 1.1 lukem if (sp + 1 < ep)
707 1.1 lukem tp = cclass_trie + tp->next;
708 1.1 lukem }
709 1.1 lukem if (tp->func == 0)
710 1.1 lukem return 0;
711 1.1 lukem
712 1.1 lukem (*tp->func)(sym, tp->mask, b);
713 1.1 lukem
714 1.1 lukem return sp - cp;
715 1.1 lukem }
716 1.1 lukem
717 1.1 lukem /*
718 1.1 lukem * Construct a list of ranges and return the number of characters consumed.
719 1.1 lukem */
720 1.1 lukem static unsigned long
721 1.1 lukem _ure_cclass(ucs2_t *cp, unsigned long limit, _ure_symtab_t *symp,
722 1.1 lukem _ure_buffer_t *b)
723 1.1 lukem {
724 1.1 lukem int range_end;
725 1.1 lukem unsigned long n;
726 1.1 lukem ucs2_t *sp, *ep;
727 1.1 lukem ucs4_t c, last;
728 1.1 lukem _ure_ccl_t *cclp;
729 1.1 lukem _ure_range_t range;
730 1.1 lukem
731 1.1 lukem sp = cp;
732 1.1 lukem ep = sp + limit;
733 1.1 lukem
734 1.1 lukem if (*sp == '^') {
735 1.1 lukem symp->type = _URE_NCCLASS;
736 1.1 lukem sp++;
737 1.1 lukem } else
738 1.1 lukem symp->type = _URE_CCLASS;
739 1.1 lukem
740 1.1 lukem for (last = 0, range_end = 0;
741 1.1 lukem b->error == _URE_OK && sp < ep && *sp != ']'; ) {
742 1.1 lukem c = *sp++;
743 1.1 lukem if (c == '\\') {
744 1.1 lukem if (sp == ep) {
745 1.1 lukem /*
746 1.1 lukem * The EOS was encountered when expecting the reverse solidus
747 1.1 lukem * to be followed by the character it is escaping. Set an
748 1.1 lukem * error code and return the number of characters consumed up
749 1.1 lukem * to this point.
750 1.1 lukem */
751 1.1 lukem b->error = _URE_UNEXPECTED_EOS;
752 1.1 lukem return sp - cp;
753 1.1 lukem }
754 1.1 lukem
755 1.1 lukem c = *sp++;
756 1.1 lukem switch (c) {
757 1.1 lukem case 'a':
758 1.1 lukem c = 0x07;
759 1.1 lukem break;
760 1.1 lukem case 'b':
761 1.1 lukem c = 0x08;
762 1.1 lukem break;
763 1.1 lukem case 'f':
764 1.1 lukem c = 0x0c;
765 1.1 lukem break;
766 1.1 lukem case 'n':
767 1.1 lukem c = 0x0a;
768 1.1 lukem break;
769 1.1 lukem case 'r':
770 1.1 lukem c = 0x0d;
771 1.1 lukem break;
772 1.1 lukem case 't':
773 1.1 lukem c = 0x09;
774 1.1 lukem break;
775 1.1 lukem case 'v':
776 1.1 lukem c = 0x0b;
777 1.1 lukem break;
778 1.1 lukem case 'p':
779 1.1 lukem case 'P':
780 1.1 lukem sp += _ure_prop_list(sp, ep - sp, &symp->props, b);
781 1.1 lukem /*
782 1.1 lukem * Invert the bit mask of the properties if this is a negated
783 1.1 lukem * character class or if 'P' is used to specify a list of
784 1.1 lukem * character properties that should *not* match in a
785 1.1 lukem * character class.
786 1.1 lukem */
787 1.1 lukem if (c == 'P')
788 1.1 lukem symp->props = ~symp->props;
789 1.1 lukem continue;
790 1.1 lukem break;
791 1.1 lukem case 'x':
792 1.1 lukem case 'X':
793 1.1 lukem case 'u':
794 1.1 lukem case 'U':
795 1.1 lukem if (sp < ep &&
796 1.1 lukem ((*sp >= '0' && *sp <= '9') ||
797 1.1 lukem (*sp >= 'A' && *sp <= 'F') ||
798 1.1 lukem (*sp >= 'a' && *sp <= 'f')))
799 1.1 lukem sp += _ure_hex(sp, ep - sp, &c);
800 1.1 lukem }
801 1.1 lukem } else if (c == ':') {
802 1.1 lukem /*
803 1.1 lukem * Probe for a POSIX colon delimited character class.
804 1.1 lukem */
805 1.1 lukem sp--;
806 1.1 lukem if ((n = _ure_posix_ccl(sp, ep - sp, symp, b)) == 0)
807 1.1 lukem sp++;
808 1.1 lukem else {
809 1.1 lukem sp += n;
810 1.1 lukem continue;
811 1.1 lukem }
812 1.1 lukem }
813 1.1 lukem
814 1.1 lukem cclp = &symp->sym.ccl;
815 1.1 lukem
816 1.1 lukem /*
817 1.1 lukem * Check to see if the current character is a low surrogate that needs
818 1.1 lukem * to be combined with a preceding high surrogate.
819 1.1 lukem */
820 1.1 lukem if (last != 0) {
821 1.1 lukem if (c >= 0xdc00 && c <= 0xdfff)
822 1.1 lukem /*
823 1.1 lukem * Construct the UTF16 character code.
824 1.1 lukem */
825 1.1 lukem c = 0x10000 + (((last & 0x03ff) << 10) | (c & 0x03ff));
826 1.1 lukem else {
827 1.1 lukem /*
828 1.1 lukem * Add the isolated high surrogate to the range.
829 1.1 lukem */
830 1.1 lukem if (range_end == 1)
831 1.1 lukem range.max_code = last & 0xffff;
832 1.1 lukem else
833 1.1 lukem range.min_code = range.max_code = last & 0xffff;
834 1.1 lukem
835 1.1 lukem _ure_add_range(cclp, &range, b);
836 1.1 lukem range_end = 0;
837 1.1 lukem }
838 1.1 lukem }
839 1.1 lukem
840 1.1 lukem /*
841 1.1 lukem * Clear the last character code.
842 1.1 lukem */
843 1.1 lukem last = 0;
844 1.1 lukem
845 1.1 lukem /*
846 1.1 lukem * This slightly awkward code handles the different cases needed to
847 1.1 lukem * construct a range.
848 1.1 lukem */
849 1.1 lukem if (c >= 0xd800 && c <= 0xdbff) {
850 1.1 lukem /*
851 1.1 lukem * If the high surrogate is followed by a range indicator, simply
852 1.1 lukem * add it as the range start. Otherwise, save it in case the next
853 1.1 lukem * character is a low surrogate.
854 1.1 lukem */
855 1.1 lukem if (*sp == '-') {
856 1.1 lukem sp++;
857 1.1 lukem range.min_code = c;
858 1.1 lukem range_end = 1;
859 1.1 lukem } else
860 1.1 lukem last = c;
861 1.1 lukem } else if (range_end == 1) {
862 1.1 lukem range.max_code = c;
863 1.1 lukem _ure_add_range(cclp, &range, b);
864 1.1 lukem range_end = 0;
865 1.1 lukem } else {
866 1.1 lukem range.min_code = range.max_code = c;
867 1.1 lukem if (*sp == '-') {
868 1.1 lukem sp++;
869 1.1 lukem range_end = 1;
870 1.1 lukem } else
871 1.1 lukem _ure_add_range(cclp, &range, b);
872 1.1 lukem }
873 1.1 lukem }
874 1.1 lukem
875 1.1 lukem if (sp < ep && *sp == ']')
876 1.1 lukem sp++;
877 1.1 lukem else
878 1.1 lukem /*
879 1.1 lukem * The parse was not terminated by the character class close symbol
880 1.1 lukem * (']'), so set an error code.
881 1.1 lukem */
882 1.1 lukem b->error = _URE_CCLASS_OPEN;
883 1.1 lukem
884 1.1 lukem return sp - cp;
885 1.1 lukem }
886 1.1 lukem
887 1.1 lukem /*
888 1.1 lukem * Probe for a low surrogate hex code.
889 1.1 lukem */
890 1.1 lukem static unsigned long
891 1.1 lukem _ure_probe_ls(ucs2_t *ls, unsigned long limit, ucs4_t *c)
892 1.1 lukem {
893 1.1 lukem ucs4_t i, code;
894 1.1 lukem ucs2_t *sp, *ep;
895 1.1 lukem
896 1.1 lukem for (i = code = 0, sp = ls, ep = sp + limit; i < 4 && sp < ep; sp++) {
897 1.1 lukem if (*sp >= '0' && *sp <= '9')
898 1.1 lukem code = (code << 4) + (*sp - '0');
899 1.1 lukem else if (*sp >= 'A' && *sp <= 'F')
900 1.1 lukem code = (code << 4) + ((*sp - 'A') + 10);
901 1.1 lukem else if (*sp >= 'a' && *sp <= 'f')
902 1.1 lukem code = (code << 4) + ((*sp - 'a') + 10);
903 1.1 lukem else
904 1.1 lukem break;
905 1.1 lukem }
906 1.1 lukem
907 1.1 lukem *c = code;
908 1.1 lukem return (0xdc00 <= code && code <= 0xdfff) ? sp - ls : 0;
909 1.1 lukem }
910 1.1 lukem
911 1.1 lukem static unsigned long
912 1.1 lukem _ure_compile_symbol(ucs2_t *sym, unsigned long limit, _ure_symtab_t *symp,
913 1.1 lukem _ure_buffer_t *b)
914 1.1 lukem {
915 1.1 lukem ucs4_t c;
916 1.1 lukem ucs2_t *sp, *ep;
917 1.1 lukem
918 1.1 lukem sp = sym;
919 1.1 lukem ep = sym + limit;
920 1.1 lukem
921 1.1 lukem if ((c = *sp++) == '\\') {
922 1.1 lukem
923 1.1 lukem if (sp == ep) {
924 1.1 lukem /*
925 1.1 lukem * The EOS was encountered when expecting the reverse solidus to
926 1.1 lukem * be followed by the character it is escaping. Set an error code
927 1.1 lukem * and return the number of characters consumed up to this point.
928 1.1 lukem */
929 1.1 lukem b->error = _URE_UNEXPECTED_EOS;
930 1.1 lukem return sp - sym;
931 1.1 lukem }
932 1.1 lukem
933 1.1 lukem c = *sp++;
934 1.1 lukem switch (c) {
935 1.1 lukem case 'p':
936 1.1 lukem case 'P':
937 1.1 lukem symp->type = (c == 'p') ? _URE_CCLASS : _URE_NCCLASS;
938 1.1 lukem sp += _ure_prop_list(sp, ep - sp, &symp->props, b);
939 1.1 lukem break;
940 1.1 lukem case 'a':
941 1.1 lukem symp->type = _URE_CHAR;
942 1.1 lukem symp->sym.chr = 0x07;
943 1.1 lukem break;
944 1.1 lukem case 'b':
945 1.1 lukem symp->type = _URE_CHAR;
946 1.1 lukem symp->sym.chr = 0x08;
947 1.1 lukem break;
948 1.1 lukem case 'f':
949 1.1 lukem symp->type = _URE_CHAR;
950 1.1 lukem symp->sym.chr = 0x0c;
951 1.1 lukem break;
952 1.1 lukem case 'n':
953 1.1 lukem symp->type = _URE_CHAR;
954 1.1 lukem symp->sym.chr = 0x0a;
955 1.1 lukem break;
956 1.1 lukem case 'r':
957 1.1 lukem symp->type = _URE_CHAR;
958 1.1 lukem symp->sym.chr = 0x0d;
959 1.1 lukem break;
960 1.1 lukem case 't':
961 1.1 lukem symp->type = _URE_CHAR;
962 1.1 lukem symp->sym.chr = 0x09;
963 1.1 lukem break;
964 1.1 lukem case 'v':
965 1.1 lukem symp->type = _URE_CHAR;
966 1.1 lukem symp->sym.chr = 0x0b;
967 1.1 lukem break;
968 1.1 lukem case 'x':
969 1.1 lukem case 'X':
970 1.1 lukem case 'u':
971 1.1 lukem case 'U':
972 1.1 lukem /*
973 1.1 lukem * Collect between 1 and 4 digits representing a UCS2 code. Fall
974 1.1 lukem * through to the next case.
975 1.1 lukem */
976 1.1 lukem if (sp < ep &&
977 1.1 lukem ((*sp >= '0' && *sp <= '9') ||
978 1.1 lukem (*sp >= 'A' && *sp <= 'F') ||
979 1.1 lukem (*sp >= 'a' && *sp <= 'f')))
980 1.1 lukem sp += _ure_hex(sp, ep - sp, &c);
981 1.1 lukem /* FALLTHROUGH */
982 1.1 lukem default:
983 1.1 lukem /*
984 1.1 lukem * Simply add an escaped character here.
985 1.1 lukem */
986 1.1 lukem symp->type = _URE_CHAR;
987 1.1 lukem symp->sym.chr = c;
988 1.1 lukem }
989 1.1 lukem } else if (c == '^' || c == '$')
990 1.1 lukem /*
991 1.1 lukem * Handle the BOL and EOL anchors. This actually consists simply of
992 1.1 lukem * setting a flag that indicates that the user supplied anchor match
993 1.1 lukem * function should be called. This needs to be done instead of simply
994 1.1 lukem * matching line/paragraph separators because beginning-of-text and
995 1.1 lukem * end-of-text tests are needed as well.
996 1.1 lukem */
997 1.1 lukem symp->type = (c == '^') ? _URE_BOL_ANCHOR : _URE_EOL_ANCHOR;
998 1.1 lukem else if (c == '[')
999 1.1 lukem /*
1000 1.1 lukem * Construct a character class.
1001 1.1 lukem */
1002 1.1 lukem sp += _ure_cclass(sp, ep - sp, symp, b);
1003 1.1 lukem else if (c == '.')
1004 1.1 lukem symp->type = _URE_ANY_CHAR;
1005 1.1 lukem else {
1006 1.1 lukem symp->type = _URE_CHAR;
1007 1.1 lukem symp->sym.chr = c;
1008 1.1 lukem }
1009 1.1 lukem
1010 1.1 lukem /*
1011 1.1 lukem * If the symbol type happens to be a character and is a high surrogate,
1012 1.1 lukem * then probe forward to see if it is followed by a low surrogate that
1013 1.1 lukem * needs to be added.
1014 1.1 lukem */
1015 1.1 lukem if (sp < ep && symp->type == _URE_CHAR &&
1016 1.1 lukem 0xd800 <= symp->sym.chr && symp->sym.chr <= 0xdbff) {
1017 1.1 lukem
1018 1.1 lukem if (0xdc00 <= *sp && *sp <= 0xdfff) {
1019 1.1 lukem symp->sym.chr = 0x10000 + (((symp->sym.chr & 0x03ff) << 10) |
1020 1.1 lukem (*sp & 0x03ff));
1021 1.1 lukem sp++;
1022 1.1 lukem } else if (*sp == '\\' && (*(sp + 1) == 'x' || *(sp + 1) == 'X' ||
1023 1.1 lukem *(sp + 1) == 'u' || *(sp + 1) == 'U')) {
1024 1.1 lukem sp += _ure_probe_ls(sp + 2, ep - (sp + 2), &c);
1025 1.1 lukem if (0xdc00 <= c && c <= 0xdfff) {
1026 1.1 lukem /*
1027 1.1 lukem * Take into account the \[xu] in front of the hex code.
1028 1.1 lukem */
1029 1.1 lukem sp += 2;
1030 1.1 lukem symp->sym.chr = 0x10000 + (((symp->sym.chr & 0x03ff) << 10) |
1031 1.1 lukem (c & 0x03ff));
1032 1.1 lukem }
1033 1.1 lukem }
1034 1.1 lukem }
1035 1.1 lukem
1036 1.1 lukem /*
1037 1.1 lukem * Last, make sure any _URE_CHAR type symbols are changed to lower case if
1038 1.1 lukem * the `casefold' flag is set.
1039 1.1 lukem */
1040 1.1 lukem if ((b->flags & _URE_DFA_CASEFOLD) && symp->type == _URE_CHAR)
1041 1.1 lukem symp->sym.chr = _ure_tolower(symp->sym.chr);
1042 1.1 lukem
1043 1.1 lukem /*
1044 1.1 lukem * If the symbol constructed is anything other than one of the anchors,
1045 1.1 lukem * make sure the _URE_DFA_BLANKLINE flag is removed.
1046 1.1 lukem */
1047 1.1 lukem if (symp->type != _URE_BOL_ANCHOR && symp->type != _URE_EOL_ANCHOR)
1048 1.1 lukem b->flags &= ~_URE_DFA_BLANKLINE;
1049 1.1 lukem
1050 1.1 lukem /*
1051 1.1 lukem * Return the number of characters consumed.
1052 1.1 lukem */
1053 1.1 lukem return sp - sym;
1054 1.1 lukem }
1055 1.1 lukem
1056 1.1 lukem static int
1057 1.1 lukem _ure_sym_neq(_ure_symtab_t *a, _ure_symtab_t *b)
1058 1.1 lukem {
1059 1.1 lukem if (a->type != b->type || a->mods != b->mods || a->props != b->props)
1060 1.1 lukem return 1;
1061 1.1 lukem
1062 1.1 lukem if (a->type == _URE_CCLASS || a->type == _URE_NCCLASS) {
1063 1.1 lukem if (a->sym.ccl.ranges_used != b->sym.ccl.ranges_used)
1064 1.1 lukem return 1;
1065 1.1 lukem if (a->sym.ccl.ranges_used > 0 &&
1066 1.1 lukem memcmp((char *) a->sym.ccl.ranges, (char *) b->sym.ccl.ranges,
1067 1.1 lukem sizeof(_ure_range_t) * a->sym.ccl.ranges_used) != 0)
1068 1.1 lukem return 1;
1069 1.1 lukem } else if (a->type == _URE_CHAR && a->sym.chr != b->sym.chr)
1070 1.1 lukem return 1;
1071 1.1 lukem return 0;
1072 1.1 lukem }
1073 1.1 lukem
1074 1.1 lukem /*
1075 1.1 lukem * Construct a symbol, but only keep unique symbols.
1076 1.1 lukem */
1077 1.1 lukem static ucs2_t
1078 1.1 lukem _ure_make_symbol(ucs2_t *sym, unsigned long limit, unsigned long *consumed,
1079 1.1 lukem _ure_buffer_t *b)
1080 1.1 lukem {
1081 1.1 lukem ucs2_t i;
1082 1.1 lukem _ure_symtab_t *sp, symbol;
1083 1.1 lukem
1084 1.1 lukem /*
1085 1.1 lukem * Build the next symbol so we can test to see if it is already in the
1086 1.1 lukem * symbol table.
1087 1.1 lukem */
1088 1.1 lukem (void) memset((char *) &symbol, '\0', sizeof(_ure_symtab_t));
1089 1.1 lukem *consumed = _ure_compile_symbol(sym, limit, &symbol, b);
1090 1.1 lukem
1091 1.1 lukem /*
1092 1.1 lukem * Check to see if the symbol exists.
1093 1.1 lukem */
1094 1.1 lukem for (i = 0, sp = b->symtab;
1095 1.1 lukem i < b->symtab_used && _ure_sym_neq(&symbol, sp); i++, sp++) ;
1096 1.1 lukem
1097 1.1 lukem if (i < b->symtab_used) {
1098 1.1 lukem /*
1099 1.1 lukem * Free up any ranges used for the symbol.
1100 1.1 lukem */
1101 1.1 lukem if ((symbol.type == _URE_CCLASS || symbol.type == _URE_NCCLASS) &&
1102 1.1 lukem symbol.sym.ccl.ranges_size > 0)
1103 1.1 lukem free((char *) symbol.sym.ccl.ranges);
1104 1.1 lukem
1105 1.1 lukem return b->symtab[i].id;
1106 1.1 lukem }
1107 1.1 lukem
1108 1.1 lukem /*
1109 1.1 lukem * Need to add the new symbol.
1110 1.1 lukem */
1111 1.1 lukem if (b->symtab_used == b->symtab_size) {
1112 1.1 lukem if (b->symtab_size == 0)
1113 1.1 lukem b->symtab = (_ure_symtab_t *) malloc(sizeof(_ure_symtab_t) << 3);
1114 1.1 lukem else
1115 1.1 lukem b->symtab = (_ure_symtab_t *)
1116 1.1 lukem realloc((char *) b->symtab,
1117 1.1 lukem sizeof(_ure_symtab_t) * (b->symtab_size + 8));
1118 1.1 lukem sp = b->symtab + b->symtab_size;
1119 1.1 lukem (void) memset((char *) sp, '\0', sizeof(_ure_symtab_t) << 3);
1120 1.1 lukem b->symtab_size += 8;
1121 1.1 lukem }
1122 1.1 lukem
1123 1.1 lukem symbol.id = b->symtab_used++;
1124 1.1 lukem (void) AC_MEMCPY((char *) &b->symtab[symbol.id], (char *) &symbol,
1125 1.1 lukem sizeof(_ure_symtab_t));
1126 1.1 lukem
1127 1.1 lukem return symbol.id;
1128 1.1 lukem }
1129 1.1 lukem
1130 1.1 lukem /*************************************************************************
1131 1.1 lukem *
1132 1.1 lukem * End symbol parse functions.
1133 1.1 lukem *
1134 1.1 lukem *************************************************************************/
1135 1.1 lukem
1136 1.1 lukem static ucs2_t
1137 1.1 lukem _ure_make_expr(ucs2_t type, ucs2_t lhs, ucs2_t rhs, _ure_buffer_t *b)
1138 1.1 lukem {
1139 1.1 lukem ucs2_t i;
1140 1.1 lukem
1141 1.1 lukem if (b == 0)
1142 1.1 lukem return _URE_NOOP;
1143 1.1 lukem
1144 1.1 lukem /*
1145 1.1 lukem * Determine if the expression already exists or not.
1146 1.1 lukem */
1147 1.1 lukem for (i = 0; i < b->expr_used; i++) {
1148 1.1 lukem if (b->expr[i].type == type && b->expr[i].lhs == lhs &&
1149 1.1 lukem b->expr[i].rhs == rhs)
1150 1.1 lukem break;
1151 1.1 lukem }
1152 1.1 lukem if (i < b->expr_used)
1153 1.1 lukem return i;
1154 1.1 lukem
1155 1.1 lukem /*
1156 1.1 lukem * Need to add a new expression.
1157 1.1 lukem */
1158 1.1 lukem if (b->expr_used == b->expr_size) {
1159 1.1 lukem if (b->expr_size == 0)
1160 1.1 lukem b->expr = (_ure_elt_t *) malloc(sizeof(_ure_elt_t) << 3);
1161 1.1 lukem else
1162 1.1 lukem b->expr = (_ure_elt_t *)
1163 1.1 lukem realloc((char *) b->expr,
1164 1.1 lukem sizeof(_ure_elt_t) * (b->expr_size + 8));
1165 1.1 lukem b->expr_size += 8;
1166 1.1 lukem }
1167 1.1 lukem
1168 1.1 lukem b->expr[b->expr_used].onstack = 0;
1169 1.1 lukem b->expr[b->expr_used].type = type;
1170 1.1 lukem b->expr[b->expr_used].lhs = lhs;
1171 1.1 lukem b->expr[b->expr_used].rhs = rhs;
1172 1.1 lukem
1173 1.1 lukem return b->expr_used++;
1174 1.1 lukem }
1175 1.1 lukem
1176 1.1 lukem static unsigned char spmap[] = {
1177 1.1 lukem 0x00, 0x00, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00,
1178 1.1 lukem 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1179 1.1 lukem 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1180 1.1 lukem };
1181 1.1 lukem
1182 1.1 lukem #define _ure_isspecial(cc) ((cc) > 0x20 && (cc) < 0x7f && \
1183 1.1 lukem (spmap[(cc) >> 3] & (1 << ((cc) & 7))))
1184 1.1 lukem
1185 1.1 lukem /*
1186 1.1 lukem * Convert the regular expression into an NFA in a form that will be easy to
1187 1.1 lukem * reduce to a DFA. The starting state for the reduction will be returned.
1188 1.1 lukem */
1189 1.1 lukem static ucs2_t
1190 1.1 lukem _ure_re2nfa(ucs2_t *re, unsigned long relen, _ure_buffer_t *b)
1191 1.1 lukem {
1192 1.1 lukem ucs2_t c, state, top, sym, *sp, *ep;
1193 1.1 lukem unsigned long used;
1194 1.1 lukem
1195 1.1 lukem state = _URE_NOOP;
1196 1.1 lukem
1197 1.1 lukem sp = re;
1198 1.1 lukem ep = sp + relen;
1199 1.1 lukem while (b->error == _URE_OK && sp < ep) {
1200 1.1 lukem c = *sp++;
1201 1.1 lukem switch (c) {
1202 1.1 lukem case '(':
1203 1.1 lukem _ure_push(_URE_PAREN, b);
1204 1.1 lukem break;
1205 1.1 lukem case ')':
1206 1.1 lukem /*
1207 1.1 lukem * Check for the case of too many close parentheses.
1208 1.1 lukem */
1209 1.1 lukem if (_ure_peek(b) == _URE_NOOP) {
1210 1.1 lukem b->error = _URE_UNBALANCED_GROUP;
1211 1.1 lukem break;
1212 1.1 lukem }
1213 1.1 lukem
1214 1.1 lukem while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR)
1215 1.1 lukem /*
1216 1.1 lukem * Make an expression with the AND or OR operator and its right
1217 1.1 lukem * hand side.
1218 1.1 lukem */
1219 1.1 lukem state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b);
1220 1.1 lukem
1221 1.1 lukem /*
1222 1.1 lukem * Remove the _URE_PAREN off the stack.
1223 1.1 lukem */
1224 1.1 lukem (void) _ure_pop(b);
1225 1.1 lukem break;
1226 1.1 lukem case '*':
1227 1.1 lukem state = _ure_make_expr(_URE_STAR, state, _URE_NOOP, b);
1228 1.1 lukem break;
1229 1.1 lukem case '+':
1230 1.1 lukem state = _ure_make_expr(_URE_PLUS, state, _URE_NOOP, b);
1231 1.1 lukem break;
1232 1.1 lukem case '?':
1233 1.1 lukem state = _ure_make_expr(_URE_QUEST, state, _URE_NOOP, b);
1234 1.1 lukem break;
1235 1.1 lukem case '|':
1236 1.1 lukem while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR)
1237 1.1 lukem /*
1238 1.1 lukem * Make an expression with the AND or OR operator and its right
1239 1.1 lukem * hand side.
1240 1.1 lukem */
1241 1.1 lukem state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b);
1242 1.1 lukem
1243 1.1 lukem _ure_push(state, b);
1244 1.1 lukem _ure_push(_URE_OR, b);
1245 1.1 lukem break;
1246 1.1 lukem default:
1247 1.1 lukem sp--;
1248 1.1 lukem sym = _ure_make_symbol(sp, ep - sp, &used, b);
1249 1.1 lukem sp += used;
1250 1.1 lukem state = _ure_make_expr(_URE_SYMBOL, sym, _URE_NOOP, b);
1251 1.1 lukem break;
1252 1.1 lukem }
1253 1.1 lukem
1254 1.1 lukem if (c != '(' && c != '|' && sp < ep &&
1255 1.1 lukem (!_ure_isspecial(*sp) || *sp == '(')) {
1256 1.1 lukem _ure_push(state, b);
1257 1.1 lukem _ure_push(_URE_AND, b);
1258 1.1 lukem }
1259 1.1 lukem }
1260 1.1 lukem while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR)
1261 1.1 lukem /*
1262 1.1 lukem * Make an expression with the AND or OR operator and its right
1263 1.1 lukem * hand side.
1264 1.1 lukem */
1265 1.1 lukem state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b);
1266 1.1 lukem
1267 1.1 lukem if (b->stack.slist_used > 0)
1268 1.1 lukem b->error = _URE_UNBALANCED_GROUP;
1269 1.1 lukem
1270 1.1 lukem return (b->error == _URE_OK) ? state : _URE_NOOP;
1271 1.1 lukem }
1272 1.1 lukem
1273 1.1 lukem static void
1274 1.1 lukem _ure_add_symstate(ucs2_t sym, ucs2_t state, _ure_buffer_t *b)
1275 1.1 lukem {
1276 1.1 lukem ucs2_t i, *stp;
1277 1.1 lukem _ure_symtab_t *sp;
1278 1.1 lukem
1279 1.1 lukem /*
1280 1.1 lukem * Locate the symbol in the symbol table so the state can be added.
1281 1.1 lukem * If the symbol doesn't exist, then a real problem exists.
1282 1.1 lukem */
1283 1.1 lukem for (i = 0, sp = b->symtab; i < b->symtab_used && sym != sp->id;
1284 1.1 lukem i++, sp++) ;
1285 1.1 lukem
1286 1.1 lukem /*
1287 1.1 lukem * Now find out if the state exists in the symbol's state list.
1288 1.1 lukem */
1289 1.1 lukem for (i = 0, stp = sp->states.slist;
1290 1.1 lukem i < sp->states.slist_used && state > *stp; i++, stp++) ;
1291 1.1 lukem
1292 1.1 lukem if (i == sp->states.slist_used || state < *stp) {
1293 1.1 lukem /*
1294 1.1 lukem * Need to add the state in order.
1295 1.1 lukem */
1296 1.1 lukem if (sp->states.slist_used == sp->states.slist_size) {
1297 1.1 lukem if (sp->states.slist_size == 0)
1298 1.1 lukem sp->states.slist = (ucs2_t *) malloc(sizeof(ucs2_t) << 3);
1299 1.1 lukem else
1300 1.1 lukem sp->states.slist = (ucs2_t *)
1301 1.1 lukem realloc((char *) sp->states.slist,
1302 1.1 lukem sizeof(ucs2_t) * (sp->states.slist_size + 8));
1303 1.1 lukem sp->states.slist_size += 8;
1304 1.1 lukem }
1305 1.1 lukem if (i < sp->states.slist_used)
1306 1.1 lukem (void) _ure_memmove((char *) (sp->states.slist + i + 1),
1307 1.1 lukem (char *) (sp->states.slist + i),
1308 1.1 lukem sizeof(ucs2_t) * (sp->states.slist_used - i));
1309 1.1 lukem sp->states.slist[i] = state;
1310 1.1 lukem sp->states.slist_used++;
1311 1.1 lukem }
1312 1.1 lukem }
1313 1.1 lukem
1314 1.1 lukem static ucs2_t
1315 1.1 lukem _ure_add_state(ucs2_t nstates, ucs2_t *states, _ure_buffer_t *b)
1316 1.1 lukem {
1317 1.1 lukem ucs2_t i;
1318 1.1 lukem _ure_state_t *sp;
1319 1.1 lukem
1320 1.1 lukem for (i = 0, sp = b->states.states; i < b->states.states_used; i++, sp++) {
1321 1.1 lukem if (sp->st.slist_used == nstates &&
1322 1.1 lukem memcmp((char *) states, (char *) sp->st.slist,
1323 1.1 lukem sizeof(ucs2_t) * nstates) == 0)
1324 1.1 lukem break;
1325 1.1 lukem }
1326 1.1 lukem
1327 1.1 lukem if (i == b->states.states_used) {
1328 1.1 lukem /*
1329 1.1 lukem * Need to add a new DFA state (set of NFA states).
1330 1.1 lukem */
1331 1.1 lukem if (b->states.states_used == b->states.states_size) {
1332 1.1 lukem if (b->states.states_size == 0)
1333 1.1 lukem b->states.states = (_ure_state_t *)
1334 1.1 lukem malloc(sizeof(_ure_state_t) << 3);
1335 1.1 lukem else
1336 1.1 lukem b->states.states = (_ure_state_t *)
1337 1.1 lukem realloc((char *) b->states.states,
1338 1.1 lukem sizeof(_ure_state_t) * (b->states.states_size + 8));
1339 1.1 lukem sp = b->states.states + b->states.states_size;
1340 1.1 lukem (void) memset((char *) sp, '\0', sizeof(_ure_state_t) << 3);
1341 1.1 lukem b->states.states_size += 8;
1342 1.1 lukem }
1343 1.1 lukem
1344 1.1 lukem sp = b->states.states + b->states.states_used++;
1345 1.1 lukem sp->id = i;
1346 1.1 lukem
1347 1.1 lukem if (sp->st.slist_used + nstates > sp->st.slist_size) {
1348 1.1 lukem if (sp->st.slist_size == 0)
1349 1.1 lukem sp->st.slist = (ucs2_t *)
1350 1.1 lukem malloc(sizeof(ucs2_t) * (sp->st.slist_used + nstates));
1351 1.1 lukem else
1352 1.1 lukem sp->st.slist = (ucs2_t *)
1353 1.1 lukem realloc((char *) sp->st.slist,
1354 1.1 lukem sizeof(ucs2_t) * (sp->st.slist_used + nstates));
1355 1.1 lukem sp->st.slist_size = sp->st.slist_used + nstates;
1356 1.1 lukem }
1357 1.1 lukem sp->st.slist_used = nstates;
1358 1.1 lukem (void) AC_MEMCPY((char *) sp->st.slist, (char *) states,
1359 1.1 lukem sizeof(ucs2_t) * nstates);
1360 1.1 lukem }
1361 1.1 lukem
1362 1.1 lukem /*
1363 1.1 lukem * Return the ID of the DFA state representing a group of NFA states.
1364 1.1 lukem */
1365 1.1 lukem return i;
1366 1.1 lukem }
1367 1.1 lukem
1368 1.1 lukem static void
1369 1.1 lukem _ure_reduce(ucs2_t start, _ure_buffer_t *b)
1370 1.1 lukem {
1371 1.1 lukem ucs2_t i, j, state, eval, syms, rhs;
1372 1.1 lukem ucs2_t s1, s2, ns1, ns2;
1373 1.1 lukem _ure_state_t *sp;
1374 1.1 lukem _ure_symtab_t *smp;
1375 1.1 lukem
1376 1.1 lukem b->reducing = 1;
1377 1.1 lukem
1378 1.1 lukem /*
1379 1.1 lukem * Add the starting state for the reduction.
1380 1.1 lukem */
1381 1.1 lukem _ure_add_state(1, &start, b);
1382 1.1 lukem
1383 1.1 lukem /*
1384 1.1 lukem * Process each set of NFA states that get created.
1385 1.1 lukem */
1386 1.1 lukem for (i = 0; i < b->states.states_used; i++) {
1387 1.1 lukem sp = b->states.states + i;
1388 1.1 lukem
1389 1.1 lukem /*
1390 1.1 lukem * Push the current states on the stack.
1391 1.1 lukem */
1392 1.1 lukem for (j = 0; j < sp->st.slist_used; j++)
1393 1.1 lukem _ure_push(sp->st.slist[j], b);
1394 1.1 lukem
1395 1.1 lukem /*
1396 1.1 lukem * Reduce the NFA states.
1397 1.1 lukem */
1398 1.1 lukem for (j = sp->accepting = syms = 0; j < b->stack.slist_used; j++) {
1399 1.1 lukem state = b->stack.slist[j];
1400 1.1 lukem eval = 1;
1401 1.1 lukem
1402 1.1 lukem /*
1403 1.1 lukem * This inner loop is the iterative equivalent of recursively
1404 1.1 lukem * reducing subexpressions generated as a result of a reduction.
1405 1.1 lukem */
1406 1.1 lukem while (eval) {
1407 1.1 lukem switch (b->expr[state].type) {
1408 1.1 lukem case _URE_SYMBOL:
1409 1.1 lukem ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b);
1410 1.1 lukem _ure_add_symstate(b->expr[state].lhs, ns1, b);
1411 1.1 lukem syms++;
1412 1.1 lukem eval = 0;
1413 1.1 lukem break;
1414 1.1 lukem case _URE_ONE:
1415 1.1 lukem sp->accepting = 1;
1416 1.1 lukem eval = 0;
1417 1.1 lukem break;
1418 1.1 lukem case _URE_QUEST:
1419 1.1 lukem s1 = b->expr[state].lhs;
1420 1.1 lukem ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b);
1421 1.1 lukem state = _ure_make_expr(_URE_OR, ns1, s1, b);
1422 1.1 lukem break;
1423 1.1 lukem case _URE_PLUS:
1424 1.1 lukem s1 = b->expr[state].lhs;
1425 1.1 lukem ns1 = _ure_make_expr(_URE_STAR, s1, _URE_NOOP, b);
1426 1.1 lukem state = _ure_make_expr(_URE_AND, s1, ns1, b);
1427 1.1 lukem break;
1428 1.1 lukem case _URE_STAR:
1429 1.1 lukem s1 = b->expr[state].lhs;
1430 1.1 lukem ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b);
1431 1.1 lukem ns2 = _ure_make_expr(_URE_PLUS, s1, _URE_NOOP, b);
1432 1.1 lukem state = _ure_make_expr(_URE_OR, ns1, ns2, b);
1433 1.1 lukem break;
1434 1.1 lukem case _URE_OR:
1435 1.1 lukem s1 = b->expr[state].lhs;
1436 1.1 lukem s2 = b->expr[state].rhs;
1437 1.1 lukem _ure_push(s1, b);
1438 1.1 lukem _ure_push(s2, b);
1439 1.1 lukem eval = 0;
1440 1.1 lukem break;
1441 1.1 lukem case _URE_AND:
1442 1.1 lukem s1 = b->expr[state].lhs;
1443 1.1 lukem s2 = b->expr[state].rhs;
1444 1.1 lukem switch (b->expr[s1].type) {
1445 1.1 lukem case _URE_SYMBOL:
1446 1.1 lukem _ure_add_symstate(b->expr[s1].lhs, s2, b);
1447 1.1 lukem syms++;
1448 1.1 lukem eval = 0;
1449 1.1 lukem break;
1450 1.1 lukem case _URE_ONE:
1451 1.1 lukem state = s2;
1452 1.1 lukem break;
1453 1.1 lukem case _URE_QUEST:
1454 1.1 lukem ns1 = b->expr[s1].lhs;
1455 1.1 lukem ns2 = _ure_make_expr(_URE_AND, ns1, s2, b);
1456 1.1 lukem state = _ure_make_expr(_URE_OR, s2, ns2, b);
1457 1.1 lukem break;
1458 1.1 lukem case _URE_PLUS:
1459 1.1 lukem ns1 = b->expr[s1].lhs;
1460 1.1 lukem ns2 = _ure_make_expr(_URE_OR, s2, state, b);
1461 1.1 lukem state = _ure_make_expr(_URE_AND, ns1, ns2, b);
1462 1.1 lukem break;
1463 1.1 lukem case _URE_STAR:
1464 1.1 lukem ns1 = b->expr[s1].lhs;
1465 1.1 lukem ns2 = _ure_make_expr(_URE_AND, ns1, state, b);
1466 1.1 lukem state = _ure_make_expr(_URE_OR, s2, ns2, b);
1467 1.1 lukem break;
1468 1.1 lukem case _URE_OR:
1469 1.1 lukem ns1 = b->expr[s1].lhs;
1470 1.1 lukem ns2 = b->expr[s1].rhs;
1471 1.1 lukem ns1 = _ure_make_expr(_URE_AND, ns1, s2, b);
1472 1.1 lukem ns2 = _ure_make_expr(_URE_AND, ns2, s2, b);
1473 1.1 lukem state = _ure_make_expr(_URE_OR, ns1, ns2, b);
1474 1.1 lukem break;
1475 1.1 lukem case _URE_AND:
1476 1.1 lukem ns1 = b->expr[s1].lhs;
1477 1.1 lukem ns2 = b->expr[s1].rhs;
1478 1.1 lukem ns2 = _ure_make_expr(_URE_AND, ns2, s2, b);
1479 1.1 lukem state = _ure_make_expr(_URE_AND, ns1, ns2, b);
1480 1.1 lukem break;
1481 1.1 lukem }
1482 1.1 lukem }
1483 1.1 lukem }
1484 1.1 lukem }
1485 1.1 lukem
1486 1.1 lukem /*
1487 1.1 lukem * Clear the state stack.
1488 1.1 lukem */
1489 1.1 lukem while (_ure_pop(b) != _URE_NOOP) ;
1490 1.1 lukem
1491 1.1 lukem /*
1492 1.1 lukem * Reset the state pointer because the reduction may have moved it
1493 1.1 lukem * during a reallocation.
1494 1.1 lukem */
1495 1.1 lukem sp = b->states.states + i;
1496 1.1 lukem
1497 1.1 lukem /*
1498 1.1 lukem * Generate the DFA states for the symbols collected during the
1499 1.1 lukem * current reduction.
1500 1.1 lukem */
1501 1.1 lukem if (sp->trans_used + syms > sp->trans_size) {
1502 1.1 lukem if (sp->trans_size == 0)
1503 1.1 lukem sp->trans = (_ure_elt_t *)
1504 1.1 lukem malloc(sizeof(_ure_elt_t) * (sp->trans_used + syms));
1505 1.1 lukem else
1506 1.1 lukem sp->trans = (_ure_elt_t *)
1507 1.1 lukem realloc((char *) sp->trans,
1508 1.1 lukem sizeof(_ure_elt_t) * (sp->trans_used + syms));
1509 1.1 lukem sp->trans_size = sp->trans_used + syms;
1510 1.1 lukem }
1511 1.1 lukem
1512 1.1 lukem /*
1513 1.1 lukem * Go through the symbol table and generate the DFA state transitions
1514 1.1 lukem * for each symbol that has collected NFA states.
1515 1.1 lukem */
1516 1.1 lukem for (j = syms = 0, smp = b->symtab; j < b->symtab_used; j++, smp++) {
1517 1.1 lukem sp = b->states.states + i;
1518 1.1 lukem
1519 1.1 lukem if (smp->states.slist_used > 0) {
1520 1.1 lukem sp->trans[syms].lhs = smp->id;
1521 1.1 lukem rhs = _ure_add_state(smp->states.slist_used,
1522 1.1 lukem smp->states.slist, b);
1523 1.1 lukem /*
1524 1.1 lukem * Reset the state pointer in case the reallocation moves it
1525 1.1 lukem * in memory.
1526 1.1 lukem */
1527 1.1 lukem sp = b->states.states + i;
1528 1.1 lukem sp->trans[syms].rhs = rhs;
1529 1.1 lukem
1530 1.1 lukem smp->states.slist_used = 0;
1531 1.1 lukem syms++;
1532 1.1 lukem }
1533 1.1 lukem }
1534 1.1 lukem
1535 1.1 lukem /*
1536 1.1 lukem * Set the number of transitions actually used.
1537 1.1 lukem */
1538 1.1 lukem sp->trans_used = syms;
1539 1.1 lukem }
1540 1.1 lukem b->reducing = 0;
1541 1.1 lukem }
1542 1.1 lukem
1543 1.1 lukem static void
1544 1.1 lukem _ure_add_equiv(ucs2_t l, ucs2_t r, _ure_buffer_t *b)
1545 1.1 lukem {
1546 1.1 lukem ucs2_t tmp;
1547 1.1 lukem
1548 1.1 lukem l = b->states.states[l].id;
1549 1.1 lukem r = b->states.states[r].id;
1550 1.1 lukem
1551 1.1 lukem if (l == r)
1552 1.1 lukem return;
1553 1.1 lukem
1554 1.1 lukem if (l > r) {
1555 1.1 lukem tmp = l;
1556 1.1 lukem l = r;
1557 1.1 lukem r = tmp;
1558 1.1 lukem }
1559 1.1 lukem
1560 1.1 lukem /*
1561 1.1 lukem * Check to see if the equivalence pair already exists.
1562 1.1 lukem */
1563 1.1 lukem for (tmp = 0; tmp < b->equiv_used &&
1564 1.1 lukem (b->equiv[tmp].l != l || b->equiv[tmp].r != r);
1565 1.1 lukem tmp++) ;
1566 1.1 lukem
1567 1.1 lukem if (tmp < b->equiv_used)
1568 1.1 lukem return;
1569 1.1 lukem
1570 1.1 lukem if (b->equiv_used == b->equiv_size) {
1571 1.1 lukem if (b->equiv_size == 0)
1572 1.1 lukem b->equiv = (_ure_equiv_t *) malloc(sizeof(_ure_equiv_t) << 3);
1573 1.1 lukem else
1574 1.1 lukem b->equiv = (_ure_equiv_t *) realloc((char *) b->equiv,
1575 1.1 lukem sizeof(_ure_equiv_t) *
1576 1.1 lukem (b->equiv_size + 8));
1577 1.1 lukem b->equiv_size += 8;
1578 1.1 lukem }
1579 1.1 lukem b->equiv[b->equiv_used].l = l;
1580 1.1 lukem b->equiv[b->equiv_used].r = r;
1581 1.1 lukem b->equiv_used++;
1582 1.1 lukem }
1583 1.1 lukem
1584 1.1 lukem /*
1585 1.1 lukem * Merge the DFA states that are equivalent.
1586 1.1 lukem */
1587 1.1 lukem static void
1588 1.1 lukem _ure_merge_equiv(_ure_buffer_t *b)
1589 1.1 lukem {
1590 1.1 lukem ucs2_t i, j, k, eq, done;
1591 1.1 lukem _ure_state_t *sp1, *sp2, *ls, *rs;
1592 1.1 lukem
1593 1.1 lukem for (i = 0; i < b->states.states_used; i++) {
1594 1.1 lukem sp1 = b->states.states + i;
1595 1.1 lukem if (sp1->id != i)
1596 1.1 lukem continue;
1597 1.1 lukem for (j = 0; j < i; j++) {
1598 1.1 lukem sp2 = b->states.states + j;
1599 1.1 lukem if (sp2->id != j)
1600 1.1 lukem continue;
1601 1.1 lukem b->equiv_used = 0;
1602 1.1 lukem _ure_add_equiv(i, j, b);
1603 1.1 lukem for (eq = 0, done = 0; eq < b->equiv_used; eq++) {
1604 1.1 lukem ls = b->states.states + b->equiv[eq].l;
1605 1.1 lukem rs = b->states.states + b->equiv[eq].r;
1606 1.1 lukem if (ls->accepting != rs->accepting ||
1607 1.1 lukem ls->trans_used != rs->trans_used) {
1608 1.1 lukem done = 1;
1609 1.1 lukem break;
1610 1.1 lukem }
1611 1.1 lukem for (k = 0; k < ls->trans_used &&
1612 1.1 lukem ls->trans[k].lhs == rs->trans[k].lhs; k++) ;
1613 1.1 lukem if (k < ls->trans_used) {
1614 1.1 lukem done = 1;
1615 1.1 lukem break;
1616 1.1 lukem }
1617 1.1 lukem
1618 1.1 lukem for (k = 0; k < ls->trans_used; k++)
1619 1.1 lukem _ure_add_equiv(ls->trans[k].rhs, rs->trans[k].rhs, b);
1620 1.1 lukem }
1621 1.1 lukem if (done == 0)
1622 1.1 lukem break;
1623 1.1 lukem }
1624 1.1 lukem for (eq = 0; j < i && eq < b->equiv_used; eq++)
1625 1.1 lukem b->states.states[b->equiv[eq].r].id =
1626 1.1 lukem b->states.states[b->equiv[eq].l].id;
1627 1.1 lukem }
1628 1.1 lukem
1629 1.1 lukem /*
1630 1.1 lukem * Renumber the states appropriately.
1631 1.1 lukem */
1632 1.1 lukem for (i = eq = 0, sp1 = b->states.states; i < b->states.states_used;
1633 1.1 lukem sp1++, i++)
1634 1.1 lukem sp1->id = (sp1->id == i) ? eq++ : b->states.states[sp1->id].id;
1635 1.1 lukem }
1636 1.1 lukem
1637 1.1 lukem /*************************************************************************
1638 1.1 lukem *
1639 1.1 lukem * API.
1640 1.1 lukem *
1641 1.1 lukem *************************************************************************/
1642 1.1 lukem
1643 1.1 lukem ure_buffer_t
1644 1.1 lukem ure_buffer_create(void)
1645 1.1 lukem {
1646 1.1 lukem ure_buffer_t b;
1647 1.1 lukem
1648 1.1 lukem b = (ure_buffer_t) calloc(1, sizeof(_ure_buffer_t));
1649 1.1 lukem
1650 1.1 lukem return b;
1651 1.1 lukem }
1652 1.1 lukem
1653 1.1 lukem void
1654 1.1 lukem ure_buffer_free(ure_buffer_t buf)
1655 1.1 lukem {
1656 1.1 lukem unsigned long i;
1657 1.1 lukem
1658 1.1 lukem if (buf == 0)
1659 1.1 lukem return;
1660 1.1 lukem
1661 1.1 lukem if (buf->stack.slist_size > 0)
1662 1.1 lukem free((char *) buf->stack.slist);
1663 1.1 lukem
1664 1.1 lukem if (buf->expr_size > 0)
1665 1.1 lukem free((char *) buf->expr);
1666 1.1 lukem
1667 1.1 lukem for (i = 0; i < buf->symtab_size; i++) {
1668 1.1 lukem if (buf->symtab[i].states.slist_size > 0)
1669 1.1 lukem free((char *) buf->symtab[i].states.slist);
1670 1.1 lukem }
1671 1.1 lukem
1672 1.1 lukem if (buf->symtab_size > 0)
1673 1.1 lukem free((char *) buf->symtab);
1674 1.1 lukem
1675 1.1 lukem for (i = 0; i < buf->states.states_size; i++) {
1676 1.1 lukem if (buf->states.states[i].trans_size > 0)
1677 1.1 lukem free((char *) buf->states.states[i].trans);
1678 1.1 lukem if (buf->states.states[i].st.slist_size > 0)
1679 1.1 lukem free((char *) buf->states.states[i].st.slist);
1680 1.1 lukem }
1681 1.1 lukem
1682 1.1 lukem if (buf->states.states_size > 0)
1683 1.1 lukem free((char *) buf->states.states);
1684 1.1 lukem
1685 1.1 lukem if (buf->equiv_size > 0)
1686 1.1 lukem free((char *) buf->equiv);
1687 1.1 lukem
1688 1.1 lukem free((char *) buf);
1689 1.1 lukem }
1690 1.1 lukem
1691 1.1 lukem ure_dfa_t
1692 1.1 lukem ure_compile(ucs2_t *re, unsigned long relen, int casefold, ure_buffer_t buf)
1693 1.1 lukem {
1694 1.1 lukem ucs2_t i, j, state;
1695 1.1 lukem _ure_state_t *sp;
1696 1.1 lukem _ure_dstate_t *dsp;
1697 1.1 lukem _ure_trans_t *tp;
1698 1.1 lukem ure_dfa_t dfa;
1699 1.1 lukem
1700 1.1 lukem if (re == 0 || *re == 0 || relen == 0 || buf == 0)
1701 1.1 lukem return 0;
1702 1.1 lukem
1703 1.1 lukem /*
1704 1.1 lukem * Reset the various fields of the compilation buffer. Default the flags
1705 1.1 lukem * to indicate the presense of the "^$" pattern. If any other pattern
1706 1.1 lukem * occurs, then this flag will be removed. This is done to catch this
1707 1.1 lukem * special pattern and handle it specially when matching.
1708 1.1 lukem */
1709 1.1 lukem buf->flags = _URE_DFA_BLANKLINE | ((casefold) ? _URE_DFA_CASEFOLD : 0);
1710 1.1 lukem buf->reducing = 0;
1711 1.1 lukem buf->stack.slist_used = 0;
1712 1.1 lukem buf->expr_used = 0;
1713 1.1 lukem
1714 1.1 lukem for (i = 0; i < buf->symtab_used; i++)
1715 1.1 lukem buf->symtab[i].states.slist_used = 0;
1716 1.1 lukem buf->symtab_used = 0;
1717 1.1 lukem
1718 1.1 lukem for (i = 0; i < buf->states.states_used; i++) {
1719 1.1 lukem buf->states.states[i].st.slist_used = 0;
1720 1.1 lukem buf->states.states[i].trans_used = 0;
1721 1.1 lukem }
1722 1.1 lukem buf->states.states_used = 0;
1723 1.1 lukem
1724 1.1 lukem /*
1725 1.1 lukem * Construct the NFA. If this stage returns a 0, then an error occured or
1726 1.1 lukem * an empty expression was passed.
1727 1.1 lukem */
1728 1.1 lukem if ((state = _ure_re2nfa(re, relen, buf)) == _URE_NOOP)
1729 1.1 lukem return 0;
1730 1.1 lukem
1731 1.1 lukem /*
1732 1.1 lukem * Do the expression reduction to get the initial DFA.
1733 1.1 lukem */
1734 1.1 lukem _ure_reduce(state, buf);
1735 1.1 lukem
1736 1.1 lukem /*
1737 1.1 lukem * Merge all the equivalent DFA states.
1738 1.1 lukem */
1739 1.1 lukem _ure_merge_equiv(buf);
1740 1.1 lukem
1741 1.1 lukem /*
1742 1.1 lukem * Construct the minimal DFA.
1743 1.1 lukem */
1744 1.1 lukem dfa = (ure_dfa_t) malloc(sizeof(_ure_dfa_t));
1745 1.1 lukem (void) memset((char *) dfa, '\0', sizeof(_ure_dfa_t));
1746 1.1 lukem
1747 1.1 lukem dfa->flags = buf->flags & (_URE_DFA_CASEFOLD|_URE_DFA_BLANKLINE);
1748 1.1 lukem
1749 1.1 lukem /*
1750 1.1 lukem * Free up the NFA state groups and transfer the symbols from the buffer
1751 1.1 lukem * to the DFA.
1752 1.1 lukem */
1753 1.1 lukem for (i = 0; i < buf->symtab_size; i++) {
1754 1.1 lukem if (buf->symtab[i].states.slist_size > 0)
1755 1.1 lukem free((char *) buf->symtab[i].states.slist);
1756 1.1 lukem }
1757 1.1 lukem dfa->syms = buf->symtab;
1758 1.1 lukem dfa->nsyms = buf->symtab_used;
1759 1.1 lukem
1760 1.1 lukem buf->symtab_used = buf->symtab_size = 0;
1761 1.1 lukem
1762 1.1 lukem /*
1763 1.1 lukem * Collect the total number of states and transitions needed for the DFA.
1764 1.1 lukem */
1765 1.1 lukem for (i = state = 0, sp = buf->states.states; i < buf->states.states_used;
1766 1.1 lukem i++, sp++) {
1767 1.1 lukem if (sp->id == state) {
1768 1.1 lukem dfa->nstates++;
1769 1.1 lukem dfa->ntrans += sp->trans_used;
1770 1.1 lukem state++;
1771 1.1 lukem }
1772 1.1 lukem }
1773 1.1 lukem
1774 1.1 lukem /*
1775 1.1 lukem * Allocate enough space for the states and transitions.
1776 1.1 lukem */
1777 1.1 lukem dfa->states = (_ure_dstate_t *) malloc(sizeof(_ure_dstate_t) *
1778 1.1 lukem dfa->nstates);
1779 1.1 lukem dfa->trans = (_ure_trans_t *) malloc(sizeof(_ure_trans_t) * dfa->ntrans);
1780 1.1 lukem
1781 1.1 lukem /*
1782 1.1 lukem * Actually transfer the DFA states from the buffer.
1783 1.1 lukem */
1784 1.1 lukem dsp = dfa->states;
1785 1.1 lukem tp = dfa->trans;
1786 1.1 lukem for (i = state = 0, sp = buf->states.states; i < buf->states.states_used;
1787 1.1 lukem i++, sp++) {
1788 1.1 lukem if (sp->id == state) {
1789 1.1 lukem dsp->trans = tp;
1790 1.1 lukem dsp->ntrans = sp->trans_used;
1791 1.1 lukem dsp->accepting = sp->accepting;
1792 1.1 lukem
1793 1.1 lukem /*
1794 1.1 lukem * Add the transitions for the state.
1795 1.1 lukem */
1796 1.1 lukem for (j = 0; j < dsp->ntrans; j++, tp++) {
1797 1.1 lukem tp->symbol = sp->trans[j].lhs;
1798 1.1 lukem tp->next_state = buf->states.states[sp->trans[j].rhs].id;
1799 1.1 lukem }
1800 1.1 lukem
1801 1.1 lukem dsp++;
1802 1.1 lukem state++;
1803 1.1 lukem }
1804 1.1 lukem }
1805 1.1 lukem
1806 1.1 lukem return dfa;
1807 1.1 lukem }
1808 1.1 lukem
1809 1.1 lukem void
1810 1.1 lukem ure_dfa_free(ure_dfa_t dfa)
1811 1.1 lukem {
1812 1.1 lukem ucs2_t i;
1813 1.1 lukem
1814 1.1 lukem if (dfa == 0)
1815 1.1 lukem return;
1816 1.1 lukem
1817 1.1 lukem for (i = 0; i < dfa->nsyms; i++) {
1818 1.1 lukem if ((dfa->syms[i].type == _URE_CCLASS ||
1819 1.1 lukem dfa->syms[i].type == _URE_NCCLASS) &&
1820 1.1 lukem dfa->syms[i].sym.ccl.ranges_size > 0)
1821 1.1 lukem free((char *) dfa->syms[i].sym.ccl.ranges);
1822 1.1 lukem }
1823 1.1 lukem if (dfa->nsyms > 0)
1824 1.1 lukem free((char *) dfa->syms);
1825 1.1 lukem
1826 1.1 lukem if (dfa->nstates > 0)
1827 1.1 lukem free((char *) dfa->states);
1828 1.1 lukem if (dfa->ntrans > 0)
1829 1.1 lukem free((char *) dfa->trans);
1830 1.1 lukem free((char *) dfa);
1831 1.1 lukem }
1832 1.1 lukem
1833 1.1 lukem void
1834 1.1 lukem ure_write_dfa(ure_dfa_t dfa, FILE *out)
1835 1.1 lukem {
1836 1.1 lukem ucs2_t i, j, k, h, l;
1837 1.1 lukem _ure_dstate_t *sp;
1838 1.1 lukem _ure_symtab_t *sym;
1839 1.1 lukem _ure_range_t *rp;
1840 1.1 lukem
1841 1.1 lukem if (dfa == 0 || out == 0)
1842 1.1 lukem return;
1843 1.1 lukem
1844 1.1 lukem /*
1845 1.1 lukem * Write all the different character classes.
1846 1.1 lukem */
1847 1.1 lukem for (i = 0, sym = dfa->syms; i < dfa->nsyms; i++, sym++) {
1848 1.1 lukem if (sym->type == _URE_CCLASS || sym->type == _URE_NCCLASS) {
1849 1.1 lukem fprintf(out, "C%hd = ", sym->id);
1850 1.1 lukem if (sym->sym.ccl.ranges_used > 0) {
1851 1.1 lukem putc('[', out);
1852 1.1 lukem if (sym->type == _URE_NCCLASS)
1853 1.1 lukem putc('^', out);
1854 1.1 lukem }
1855 1.1 lukem if (sym->props != 0) {
1856 1.1 lukem if (sym->type == _URE_NCCLASS)
1857 1.1 lukem fprintf(out, "\\P");
1858 1.1 lukem else
1859 1.1 lukem fprintf(out, "\\p");
1860 1.1 lukem for (k = h = 0; k < 32; k++) {
1861 1.1 lukem if (sym->props & (1 << k)) {
1862 1.1 lukem if (h != 0)
1863 1.1 lukem putc(',', out);
1864 1.1 lukem fprintf(out, "%hd", k + 1);
1865 1.1 lukem h = 1;
1866 1.1 lukem }
1867 1.1 lukem }
1868 1.1 lukem }
1869 1.1 lukem /*
1870 1.1 lukem * Dump the ranges.
1871 1.1 lukem */
1872 1.1 lukem for (k = 0, rp = sym->sym.ccl.ranges;
1873 1.1 lukem k < sym->sym.ccl.ranges_used; k++, rp++) {
1874 1.1 lukem /*
1875 1.1 lukem * Check for UTF16 characters.
1876 1.1 lukem */
1877 1.1 lukem if (0x10000 <= rp->min_code &&
1878 1.1 lukem rp->min_code <= 0x10ffff) {
1879 1.1 lukem h = (ucs2_t) (((rp->min_code - 0x10000) >> 10) + 0xd800);
1880 1.1 lukem l = (ucs2_t) (((rp->min_code - 0x10000) & 1023) + 0xdc00);
1881 1.1 lukem fprintf(out, "\\x%04hX\\x%04hX", h, l);
1882 1.1 lukem } else
1883 1.1 lukem fprintf(out, "\\x%04lX", rp->min_code & 0xffff);
1884 1.1 lukem if (rp->max_code != rp->min_code) {
1885 1.1 lukem putc('-', out);
1886 1.1 lukem if (rp->max_code >= 0x10000 &&
1887 1.1 lukem rp->max_code <= 0x10ffff) {
1888 1.1 lukem h = (ucs2_t) (((rp->max_code - 0x10000) >> 10) + 0xd800);
1889 1.1 lukem l = (ucs2_t) (((rp->max_code - 0x10000) & 1023) + 0xdc00);
1890 1.1 lukem fprintf(out, "\\x%04hX\\x%04hX", h, l);
1891 1.1 lukem } else
1892 1.1 lukem fprintf(out, "\\x%04lX", rp->max_code & 0xffff);
1893 1.1 lukem }
1894 1.1 lukem }
1895 1.1 lukem if (sym->sym.ccl.ranges_used > 0)
1896 1.1 lukem putc(']', out);
1897 1.1 lukem putc('\n', out);
1898 1.1 lukem }
1899 1.1 lukem }
1900 1.1 lukem
1901 1.1 lukem for (i = 0, sp = dfa->states; i < dfa->nstates; i++, sp++) {
1902 1.1 lukem fprintf(out, "S%hd = ", i);
1903 1.1 lukem if (sp->accepting) {
1904 1.1 lukem fprintf(out, "1 ");
1905 1.1 lukem if (sp->ntrans)
1906 1.1 lukem fprintf(out, "| ");
1907 1.1 lukem }
1908 1.1 lukem for (j = 0; j < sp->ntrans; j++) {
1909 1.1 lukem if (j > 0)
1910 1.1 lukem fprintf(out, "| ");
1911 1.1 lukem
1912 1.1 lukem sym = dfa->syms + sp->trans[j].symbol;
1913 1.1 lukem switch (sym->type) {
1914 1.1 lukem case _URE_CHAR:
1915 1.1 lukem if (0x10000 <= sym->sym.chr && sym->sym.chr <= 0x10ffff) {
1916 1.1 lukem /*
1917 1.1 lukem * Take care of UTF16 characters.
1918 1.1 lukem */
1919 1.1 lukem h = (ucs2_t) (((sym->sym.chr - 0x10000) >> 10) + 0xd800);
1920 1.1 lukem l = (ucs2_t) (((sym->sym.chr - 0x10000) & 1023) + 0xdc00);
1921 1.1 lukem fprintf(out, "\\x%04hX\\x%04hX ", h, l);
1922 1.1 lukem } else
1923 1.1 lukem fprintf(out, "\\x%04lX ", sym->sym.chr & 0xffff);
1924 1.1 lukem break;
1925 1.1 lukem case _URE_ANY_CHAR:
1926 1.1 lukem fprintf(out, "<any> ");
1927 1.1 lukem break;
1928 1.1 lukem case _URE_BOL_ANCHOR:
1929 1.1 lukem fprintf(out, "<bol-anchor> ");
1930 1.1 lukem break;
1931 1.1 lukem case _URE_EOL_ANCHOR:
1932 1.1 lukem fprintf(out, "<eol-anchor> ");
1933 1.1 lukem break;
1934 1.1 lukem case _URE_CCLASS:
1935 1.1 lukem case _URE_NCCLASS:
1936 1.1 lukem fprintf(out, "[C%hd] ", sym->id);
1937 1.1 lukem break;
1938 1.1 lukem }
1939 1.1 lukem fprintf(out, "S%hd", sp->trans[j].next_state);
1940 1.1 lukem if (j + 1 < sp->ntrans)
1941 1.1 lukem putc(' ', out);
1942 1.1 lukem }
1943 1.1 lukem putc('\n', out);
1944 1.1 lukem }
1945 1.1 lukem }
1946 1.1 lukem
1947 1.1 lukem #define _ure_issep(cc) ((cc) == '\n' || (cc) == '\r' || (cc) == 0x2028 ||\
1948 1.1 lukem (cc) == 0x2029)
1949 1.1 lukem
1950 1.1 lukem int
1951 1.1 lukem ure_exec(ure_dfa_t dfa, int flags, ucs2_t *text, unsigned long textlen,
1952 1.1 lukem unsigned long *match_start, unsigned long *match_end)
1953 1.1 lukem {
1954 1.1 lukem int i, j, matched, found, skip;
1955 1.1 lukem unsigned long ms, me;
1956 1.1 lukem ucs4_t c;
1957 1.1 lukem ucs2_t *sp, *ep, *lp;
1958 1.1 lukem _ure_dstate_t *stp;
1959 1.1 lukem _ure_symtab_t *sym;
1960 1.1 lukem _ure_range_t *rp;
1961 1.1 lukem
1962 1.1 lukem if (dfa == 0 || text == 0)
1963 1.1 lukem return 0;
1964 1.1 lukem
1965 1.1 lukem /*
1966 1.1 lukem * Handle the special case of an empty string matching the "^$" pattern.
1967 1.1 lukem */
1968 1.1 lukem if (textlen == 0 && (dfa->flags & _URE_DFA_BLANKLINE)) {
1969 1.1 lukem *match_start = *match_end = 0;
1970 1.1 lukem return 1;
1971 1.1 lukem }
1972 1.1 lukem
1973 1.1 lukem sp = text;
1974 1.1 lukem ep = sp + textlen;
1975 1.1 lukem
1976 1.1 lukem ms = me = ~0;
1977 1.1 lukem
1978 1.1 lukem stp = dfa->states;
1979 1.1 lukem
1980 1.1 lukem for (found = skip = 0; found == 0 && sp < ep; ) {
1981 1.1 lukem lp = sp;
1982 1.1 lukem c = *sp++;
1983 1.1 lukem
1984 1.1 lukem /*
1985 1.1 lukem * Check to see if this is a high surrogate that should be
1986 1.1 lukem * combined with a following low surrogate.
1987 1.1 lukem */
1988 1.1 lukem if (sp < ep && 0xd800 <= c && c <= 0xdbff &&
1989 1.1 lukem 0xdc00 <= *sp && *sp <= 0xdfff)
1990 1.1 lukem c = 0x10000 + (((c & 0x03ff) << 10) | (*sp++ & 0x03ff));
1991 1.1 lukem
1992 1.1 lukem /*
1993 1.1 lukem * Determine if the character is non-spacing and should be skipped.
1994 1.1 lukem */
1995 1.1 lukem if (_ure_matches_properties(_URE_NONSPACING, c) &&
1996 1.1 lukem (flags & URE_IGNORE_NONSPACING)) {
1997 1.1 lukem sp++;
1998 1.1 lukem continue;
1999 1.1 lukem }
2000 1.1 lukem
2001 1.1 lukem if (dfa->flags & _URE_DFA_CASEFOLD)
2002 1.1 lukem c = _ure_tolower(c);
2003 1.1 lukem
2004 1.1 lukem /*
2005 1.1 lukem * See if one of the transitions matches.
2006 1.1 lukem */
2007 1.1 lukem for (i = 0, matched = 0; matched == 0 && i < stp->ntrans; i++) {
2008 1.1 lukem sym = dfa->syms + stp->trans[i].symbol;
2009 1.1 lukem switch (sym->type) {
2010 1.1 lukem case _URE_ANY_CHAR:
2011 1.1 lukem if ((flags & URE_DOT_MATCHES_SEPARATORS) ||
2012 1.1 lukem !_ure_issep(c))
2013 1.1 lukem matched = 1;
2014 1.1 lukem break;
2015 1.1 lukem case _URE_CHAR:
2016 1.1 lukem if (c == sym->sym.chr)
2017 1.1 lukem matched = 1;
2018 1.1 lukem break;
2019 1.1 lukem case _URE_BOL_ANCHOR:
2020 1.1 lukem if (lp == text) {
2021 1.1 lukem sp = lp;
2022 1.1 lukem matched = 1;
2023 1.1 lukem } else if (_ure_issep(c)) {
2024 1.1 lukem if (c == '\r' && sp < ep && *sp == '\n')
2025 1.1 lukem sp++;
2026 1.1 lukem lp = sp;
2027 1.1 lukem matched = 1;
2028 1.1 lukem }
2029 1.1 lukem break;
2030 1.1 lukem case _URE_EOL_ANCHOR:
2031 1.1 lukem if (_ure_issep(c)) {
2032 1.1 lukem /*
2033 1.1 lukem * Put the pointer back before the separator so the match
2034 1.1 lukem * end position will be correct. This case will also
2035 1.1 lukem * cause the `sp' pointer to be advanced over the current
2036 1.1 lukem * separator once the match end point has been recorded.
2037 1.1 lukem */
2038 1.1 lukem sp = lp;
2039 1.1 lukem matched = 1;
2040 1.1 lukem }
2041 1.1 lukem break;
2042 1.1 lukem case _URE_CCLASS:
2043 1.1 lukem case _URE_NCCLASS:
2044 1.1 lukem if (sym->props != 0)
2045 1.1 lukem matched = _ure_matches_properties(sym->props, c);
2046 1.1 lukem for (j = 0, rp = sym->sym.ccl.ranges;
2047 1.1 lukem j < sym->sym.ccl.ranges_used; j++, rp++) {
2048 1.1 lukem if (rp->min_code <= c && c <= rp->max_code)
2049 1.1 lukem matched = 1;
2050 1.1 lukem }
2051 1.1 lukem if (sym->type == _URE_NCCLASS)
2052 1.1 lukem matched = !matched;
2053 1.1 lukem break;
2054 1.1 lukem }
2055 1.1 lukem
2056 1.1 lukem if (matched) {
2057 1.1 lukem if (ms == ~0UL)
2058 1.1 lukem ms = lp - text;
2059 1.1 lukem else
2060 1.1 lukem me = sp - text;
2061 1.1 lukem stp = dfa->states + stp->trans[i].next_state;
2062 1.1 lukem
2063 1.1 lukem /*
2064 1.1 lukem * If the match was an EOL anchor, adjust the pointer past the
2065 1.1 lukem * separator that caused the match. The correct match
2066 1.1 lukem * position has been recorded already.
2067 1.1 lukem */
2068 1.1 lukem if (sym->type == _URE_EOL_ANCHOR) {
2069 1.1 lukem /*
2070 1.1 lukem * Skip the character that caused the match.
2071 1.1 lukem */
2072 1.1 lukem sp++;
2073 1.1 lukem
2074 1.1 lukem /*
2075 1.1 lukem * Handle the infamous CRLF situation.
2076 1.1 lukem */
2077 1.1 lukem if (sp < ep && c == '\r' && *sp == '\n')
2078 1.1 lukem sp++;
2079 1.1 lukem }
2080 1.1 lukem }
2081 1.1 lukem }
2082 1.1 lukem
2083 1.1 lukem if (matched == 0) {
2084 1.1 lukem if (stp->accepting == 0) {
2085 1.1 lukem /*
2086 1.1 lukem * If the last state was not accepting, then reset
2087 1.1 lukem * and start over.
2088 1.1 lukem */
2089 1.1 lukem stp = dfa->states;
2090 1.1 lukem ms = me = ~0;
2091 1.1 lukem } else
2092 1.1 lukem /*
2093 1.1 lukem * The last state was accepting, so terminate the matching
2094 1.1 lukem * loop to avoid more work.
2095 1.1 lukem */
2096 1.1 lukem found = 1;
2097 1.1 lukem } else if (sp == ep) {
2098 1.1 lukem if (!stp->accepting) {
2099 1.1 lukem /*
2100 1.1 lukem * This ugly hack is to make sure the end-of-line anchors
2101 1.1 lukem * match when the source text hits the end. This is only done
2102 1.1 lukem * if the last subexpression matches.
2103 1.1 lukem */
2104 1.1 lukem for (i = 0; found == 0 && i < stp->ntrans; i++) {
2105 1.1 lukem sym = dfa->syms + stp->trans[i].symbol;
2106 1.1 lukem if (sym->type ==_URE_EOL_ANCHOR) {
2107 1.1 lukem stp = dfa->states + stp->trans[i].next_state;
2108 1.1 lukem if (stp->accepting) {
2109 1.1 lukem me = sp - text;
2110 1.1 lukem found = 1;
2111 1.1 lukem } else
2112 1.1 lukem break;
2113 1.1 lukem }
2114 1.1 lukem }
2115 1.1 lukem } else {
2116 1.1 lukem /*
2117 1.1 lukem * Make sure any conditions that match all the way to the end
2118 1.1 lukem * of the string match.
2119 1.1 lukem */
2120 1.1 lukem found = 1;
2121 1.1 lukem me = sp - text;
2122 1.1 lukem }
2123 1.1 lukem }
2124 1.1 lukem }
2125 1.1 lukem
2126 1.1 lukem if (found == 0)
2127 1.1 lukem ms = me = ~0;
2128 1.1 lukem
2129 1.1 lukem *match_start = ms;
2130 1.1 lukem *match_end = me;
2131 1.1 lukem
2132 1.1 lukem return (ms != ~0UL) ? 1 : 0;
2133 1.1 lukem }
2134