ure.c revision 1.1 1 1.1 lukem /* $OpenLDAP: pkg/ldap/libraries/liblunicode/ure/ure.c,v 1.17.2.3 2008/02/11 23:26:42 kurt Exp $ */
2 1.1 lukem /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
3 1.1 lukem *
4 1.1 lukem * Copyright 1998-2008 The OpenLDAP Foundation.
5 1.1 lukem * All rights reserved.
6 1.1 lukem *
7 1.1 lukem * Redistribution and use in source and binary forms, with or without
8 1.1 lukem * modification, are permitted only as authorized by the OpenLDAP
9 1.1 lukem * Public License.
10 1.1 lukem *
11 1.1 lukem * A copy of this license is available in file LICENSE in the
12 1.1 lukem * top-level directory of the distribution or, alternatively, at
13 1.1 lukem * <http://www.OpenLDAP.org/license.html>.
14 1.1 lukem */
15 1.1 lukem /* Copyright 1997, 1998, 1999 Computing Research Labs,
16 1.1 lukem * New Mexico State University
17 1.1 lukem *
18 1.1 lukem * Permission is hereby granted, free of charge, to any person obtaining a
19 1.1 lukem * copy of this software and associated documentation files (the "Software"),
20 1.1 lukem * to deal in the Software without restriction, including without limitation
21 1.1 lukem * the rights to use, copy, modify, merge, publish, distribute, sublicense,
22 1.1 lukem * and/or sell copies of the Software, and to permit persons to whom the
23 1.1 lukem * Software is furnished to do so, subject to the following conditions:
24 1.1 lukem *
25 1.1 lukem * The above copyright notice and this permission notice shall be included in
26 1.1 lukem * all copies or substantial portions of the Software.
27 1.1 lukem *
28 1.1 lukem * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
29 1.1 lukem * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
30 1.1 lukem * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
31 1.1 lukem * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
32 1.1 lukem * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
33 1.1 lukem * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
34 1.1 lukem * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
35 1.1 lukem */
36 1.1 lukem /* $Id: ure.c,v 1.1 2008/05/22 14:20:36 lukem Exp $" */
37 1.1 lukem
38 1.1 lukem #include "portable.h"
39 1.1 lukem
40 1.1 lukem #include <ac/stdlib.h>
41 1.1 lukem #include <ac/string.h>
42 1.1 lukem #include <ac/unistd.h>
43 1.1 lukem
44 1.1 lukem #include "ure.h"
45 1.1 lukem
46 1.1 lukem /*
47 1.1 lukem * Flags used internally in the DFA.
48 1.1 lukem */
49 1.1 lukem #define _URE_DFA_CASEFOLD 0x01
50 1.1 lukem #define _URE_DFA_BLANKLINE 0x02
51 1.1 lukem
52 1.1 lukem static unsigned long cclass_flags[] = {
53 1.1 lukem 0,
54 1.1 lukem _URE_NONSPACING,
55 1.1 lukem _URE_COMBINING,
56 1.1 lukem _URE_NUMDIGIT,
57 1.1 lukem _URE_NUMOTHER,
58 1.1 lukem _URE_SPACESEP,
59 1.1 lukem _URE_LINESEP,
60 1.1 lukem _URE_PARASEP,
61 1.1 lukem _URE_CNTRL,
62 1.1 lukem _URE_PUA,
63 1.1 lukem _URE_UPPER,
64 1.1 lukem _URE_LOWER,
65 1.1 lukem _URE_TITLE,
66 1.1 lukem _URE_MODIFIER,
67 1.1 lukem _URE_OTHERLETTER,
68 1.1 lukem _URE_DASHPUNCT,
69 1.1 lukem _URE_OPENPUNCT,
70 1.1 lukem _URE_CLOSEPUNCT,
71 1.1 lukem _URE_OTHERPUNCT,
72 1.1 lukem _URE_MATHSYM,
73 1.1 lukem _URE_CURRENCYSYM,
74 1.1 lukem _URE_OTHERSYM,
75 1.1 lukem _URE_LTR,
76 1.1 lukem _URE_RTL,
77 1.1 lukem _URE_EURONUM,
78 1.1 lukem _URE_EURONUMSEP,
79 1.1 lukem _URE_EURONUMTERM,
80 1.1 lukem _URE_ARABNUM,
81 1.1 lukem _URE_COMMONSEP,
82 1.1 lukem _URE_BLOCKSEP,
83 1.1 lukem _URE_SEGMENTSEP,
84 1.1 lukem _URE_WHITESPACE,
85 1.1 lukem _URE_OTHERNEUT,
86 1.1 lukem };
87 1.1 lukem
88 1.1 lukem /*
89 1.1 lukem * Symbol types for the DFA.
90 1.1 lukem */
91 1.1 lukem #define _URE_ANY_CHAR 1
92 1.1 lukem #define _URE_CHAR 2
93 1.1 lukem #define _URE_CCLASS 3
94 1.1 lukem #define _URE_NCCLASS 4
95 1.1 lukem #define _URE_BOL_ANCHOR 5
96 1.1 lukem #define _URE_EOL_ANCHOR 6
97 1.1 lukem
98 1.1 lukem /*
99 1.1 lukem * Op codes for converting the NFA to a DFA.
100 1.1 lukem */
101 1.1 lukem #define _URE_SYMBOL 10
102 1.1 lukem #define _URE_PAREN 11
103 1.1 lukem #define _URE_QUEST 12
104 1.1 lukem #define _URE_STAR 13
105 1.1 lukem #define _URE_PLUS 14
106 1.1 lukem #define _URE_ONE 15
107 1.1 lukem #define _URE_AND 16
108 1.1 lukem #define _URE_OR 17
109 1.1 lukem
110 1.1 lukem #define _URE_NOOP 0xffff
111 1.1 lukem
112 1.1 lukem #define _URE_REGSTART 0x8000
113 1.1 lukem #define _URE_REGEND 0x4000
114 1.1 lukem
115 1.1 lukem /*
116 1.1 lukem * Structure used to handle a compacted range of characters.
117 1.1 lukem */
118 1.1 lukem typedef struct {
119 1.1 lukem ucs4_t min_code;
120 1.1 lukem ucs4_t max_code;
121 1.1 lukem } _ure_range_t;
122 1.1 lukem
123 1.1 lukem typedef struct {
124 1.1 lukem _ure_range_t *ranges;
125 1.1 lukem ucs2_t ranges_used;
126 1.1 lukem ucs2_t ranges_size;
127 1.1 lukem } _ure_ccl_t;
128 1.1 lukem
129 1.1 lukem typedef union {
130 1.1 lukem ucs4_t chr;
131 1.1 lukem _ure_ccl_t ccl;
132 1.1 lukem } _ure_sym_t;
133 1.1 lukem
134 1.1 lukem /*
135 1.1 lukem * This is a general element structure used for expressions and stack
136 1.1 lukem * elements.
137 1.1 lukem */
138 1.1 lukem typedef struct {
139 1.1 lukem ucs2_t reg;
140 1.1 lukem ucs2_t onstack;
141 1.1 lukem ucs2_t type;
142 1.1 lukem ucs2_t lhs;
143 1.1 lukem ucs2_t rhs;
144 1.1 lukem } _ure_elt_t;
145 1.1 lukem
146 1.1 lukem /*
147 1.1 lukem * This is a structure used to track a list or a stack of states.
148 1.1 lukem */
149 1.1 lukem typedef struct {
150 1.1 lukem ucs2_t *slist;
151 1.1 lukem ucs2_t slist_size;
152 1.1 lukem ucs2_t slist_used;
153 1.1 lukem } _ure_stlist_t;
154 1.1 lukem
155 1.1 lukem /*
156 1.1 lukem * Structure to track the list of unique states for a symbol
157 1.1 lukem * during reduction.
158 1.1 lukem */
159 1.1 lukem typedef struct {
160 1.1 lukem ucs2_t id;
161 1.1 lukem ucs2_t type;
162 1.1 lukem unsigned long mods;
163 1.1 lukem unsigned long props;
164 1.1 lukem _ure_sym_t sym;
165 1.1 lukem _ure_stlist_t states;
166 1.1 lukem } _ure_symtab_t;
167 1.1 lukem
168 1.1 lukem /*
169 1.1 lukem * Structure to hold a single state.
170 1.1 lukem */
171 1.1 lukem typedef struct {
172 1.1 lukem ucs2_t id;
173 1.1 lukem ucs2_t accepting;
174 1.1 lukem ucs2_t pad;
175 1.1 lukem _ure_stlist_t st;
176 1.1 lukem _ure_elt_t *trans;
177 1.1 lukem ucs2_t trans_size;
178 1.1 lukem ucs2_t trans_used;
179 1.1 lukem } _ure_state_t;
180 1.1 lukem
181 1.1 lukem /*
182 1.1 lukem * Structure used for keeping lists of states.
183 1.1 lukem */
184 1.1 lukem typedef struct {
185 1.1 lukem _ure_state_t *states;
186 1.1 lukem ucs2_t states_size;
187 1.1 lukem ucs2_t states_used;
188 1.1 lukem } _ure_statetable_t;
189 1.1 lukem
190 1.1 lukem /*
191 1.1 lukem * Structure to track pairs of DFA states when equivalent states are
192 1.1 lukem * merged.
193 1.1 lukem */
194 1.1 lukem typedef struct {
195 1.1 lukem ucs2_t l;
196 1.1 lukem ucs2_t r;
197 1.1 lukem } _ure_equiv_t;
198 1.1 lukem
199 1.1 lukem /*
200 1.1 lukem * Structure used for constructing the NFA and reducing to a minimal DFA.
201 1.1 lukem */
202 1.1 lukem typedef struct _ure_buffer_t {
203 1.1 lukem int reducing;
204 1.1 lukem int error;
205 1.1 lukem unsigned long flags;
206 1.1 lukem
207 1.1 lukem _ure_stlist_t stack;
208 1.1 lukem
209 1.1 lukem /*
210 1.1 lukem * Table of unique symbols encountered.
211 1.1 lukem */
212 1.1 lukem _ure_symtab_t *symtab;
213 1.1 lukem ucs2_t symtab_size;
214 1.1 lukem ucs2_t symtab_used;
215 1.1 lukem
216 1.1 lukem /*
217 1.1 lukem * Tracks the unique expressions generated for the NFA and when the NFA is
218 1.1 lukem * reduced.
219 1.1 lukem */
220 1.1 lukem _ure_elt_t *expr;
221 1.1 lukem ucs2_t expr_used;
222 1.1 lukem ucs2_t expr_size;
223 1.1 lukem
224 1.1 lukem /*
225 1.1 lukem * The reduced table of unique groups of NFA states.
226 1.1 lukem */
227 1.1 lukem _ure_statetable_t states;
228 1.1 lukem
229 1.1 lukem /*
230 1.1 lukem * Tracks states when equivalent states are merged.
231 1.1 lukem */
232 1.1 lukem _ure_equiv_t *equiv;
233 1.1 lukem ucs2_t equiv_used;
234 1.1 lukem ucs2_t equiv_size;
235 1.1 lukem } _ure_buffer_t;
236 1.1 lukem
237 1.1 lukem typedef struct {
238 1.1 lukem ucs2_t symbol;
239 1.1 lukem ucs2_t next_state;
240 1.1 lukem } _ure_trans_t;
241 1.1 lukem
242 1.1 lukem typedef struct {
243 1.1 lukem ucs2_t accepting;
244 1.1 lukem ucs2_t ntrans;
245 1.1 lukem _ure_trans_t *trans;
246 1.1 lukem } _ure_dstate_t;
247 1.1 lukem
248 1.1 lukem typedef struct _ure_dfa_t {
249 1.1 lukem unsigned long flags;
250 1.1 lukem
251 1.1 lukem _ure_symtab_t *syms;
252 1.1 lukem ucs2_t nsyms;
253 1.1 lukem
254 1.1 lukem _ure_dstate_t *states;
255 1.1 lukem ucs2_t nstates;
256 1.1 lukem
257 1.1 lukem _ure_trans_t *trans;
258 1.1 lukem ucs2_t ntrans;
259 1.1 lukem } _ure_dfa_t;
260 1.1 lukem
261 1.1 lukem /*************************************************************************
262 1.1 lukem *
263 1.1 lukem * Functions.
264 1.1 lukem *
265 1.1 lukem *************************************************************************/
266 1.1 lukem
267 1.1 lukem static void
268 1.1 lukem _ure_memmove(char *dest, char *src, unsigned long bytes)
269 1.1 lukem {
270 1.1 lukem long i, j;
271 1.1 lukem
272 1.1 lukem i = (long) bytes;
273 1.1 lukem j = i & 7;
274 1.1 lukem i = (i + 7) >> 3;
275 1.1 lukem
276 1.1 lukem /*
277 1.1 lukem * Do a memmove using Ye Olde Duff's Device for efficiency.
278 1.1 lukem */
279 1.1 lukem if (src < dest) {
280 1.1 lukem src += bytes;
281 1.1 lukem dest += bytes;
282 1.1 lukem
283 1.1 lukem switch (j) {
284 1.1 lukem case 0: do {
285 1.1 lukem *--dest = *--src;
286 1.1 lukem case 7: *--dest = *--src;
287 1.1 lukem case 6: *--dest = *--src;
288 1.1 lukem case 5: *--dest = *--src;
289 1.1 lukem case 4: *--dest = *--src;
290 1.1 lukem case 3: *--dest = *--src;
291 1.1 lukem case 2: *--dest = *--src;
292 1.1 lukem case 1: *--dest = *--src;
293 1.1 lukem } while (--i > 0);
294 1.1 lukem }
295 1.1 lukem } else if (src > dest) {
296 1.1 lukem switch (j) {
297 1.1 lukem case 0: do {
298 1.1 lukem *dest++ = *src++;
299 1.1 lukem case 7: *dest++ = *src++;
300 1.1 lukem case 6: *dest++ = *src++;
301 1.1 lukem case 5: *dest++ = *src++;
302 1.1 lukem case 4: *dest++ = *src++;
303 1.1 lukem case 3: *dest++ = *src++;
304 1.1 lukem case 2: *dest++ = *src++;
305 1.1 lukem case 1: *dest++ = *src++;
306 1.1 lukem } while (--i > 0);
307 1.1 lukem }
308 1.1 lukem }
309 1.1 lukem }
310 1.1 lukem
311 1.1 lukem static void
312 1.1 lukem _ure_push(ucs2_t v, _ure_buffer_t *b)
313 1.1 lukem {
314 1.1 lukem _ure_stlist_t *s;
315 1.1 lukem
316 1.1 lukem if (b == 0)
317 1.1 lukem return;
318 1.1 lukem
319 1.1 lukem /*
320 1.1 lukem * If the `reducing' parameter is non-zero, check to see if the value
321 1.1 lukem * passed is already on the stack.
322 1.1 lukem */
323 1.1 lukem if (b->reducing != 0 && b->expr[v].onstack != 0)
324 1.1 lukem return;
325 1.1 lukem
326 1.1 lukem s = &b->stack;
327 1.1 lukem if (s->slist_used == s->slist_size) {
328 1.1 lukem if (s->slist_size == 0)
329 1.1 lukem s->slist = (ucs2_t *) malloc(sizeof(ucs2_t) << 3);
330 1.1 lukem else
331 1.1 lukem s->slist = (ucs2_t *) realloc((char *) s->slist,
332 1.1 lukem sizeof(ucs2_t) * (s->slist_size + 8));
333 1.1 lukem s->slist_size += 8;
334 1.1 lukem }
335 1.1 lukem s->slist[s->slist_used++] = v;
336 1.1 lukem
337 1.1 lukem /*
338 1.1 lukem * If the `reducing' parameter is non-zero, flag the element as being on
339 1.1 lukem * the stack.
340 1.1 lukem */
341 1.1 lukem if (b->reducing != 0)
342 1.1 lukem b->expr[v].onstack = 1;
343 1.1 lukem }
344 1.1 lukem
345 1.1 lukem static ucs2_t
346 1.1 lukem _ure_peek(_ure_buffer_t *b)
347 1.1 lukem {
348 1.1 lukem if (b == 0 || b->stack.slist_used == 0)
349 1.1 lukem return _URE_NOOP;
350 1.1 lukem
351 1.1 lukem return b->stack.slist[b->stack.slist_used - 1];
352 1.1 lukem }
353 1.1 lukem
354 1.1 lukem static ucs2_t
355 1.1 lukem _ure_pop(_ure_buffer_t *b)
356 1.1 lukem {
357 1.1 lukem ucs2_t v;
358 1.1 lukem
359 1.1 lukem if (b == 0 || b->stack.slist_used == 0)
360 1.1 lukem return _URE_NOOP;
361 1.1 lukem
362 1.1 lukem v = b->stack.slist[--b->stack.slist_used];
363 1.1 lukem if (b->reducing)
364 1.1 lukem b->expr[v].onstack = 0;
365 1.1 lukem
366 1.1 lukem return v;
367 1.1 lukem }
368 1.1 lukem
369 1.1 lukem /*************************************************************************
370 1.1 lukem *
371 1.1 lukem * Start symbol parse functions.
372 1.1 lukem *
373 1.1 lukem *************************************************************************/
374 1.1 lukem
375 1.1 lukem /*
376 1.1 lukem * Parse a comma-separated list of integers that represent character
377 1.1 lukem * properties. Combine them into a mask that is returned in the `mask'
378 1.1 lukem * variable, and return the number of characters consumed.
379 1.1 lukem */
380 1.1 lukem static unsigned long
381 1.1 lukem _ure_prop_list(ucs2_t *pp, unsigned long limit, unsigned long *mask,
382 1.1 lukem _ure_buffer_t *b)
383 1.1 lukem {
384 1.1 lukem unsigned long n, m;
385 1.1 lukem ucs2_t *sp, *ep;
386 1.1 lukem
387 1.1 lukem sp = pp;
388 1.1 lukem ep = sp + limit;
389 1.1 lukem
390 1.1 lukem for (m = n = 0; b->error == _URE_OK && sp < ep; sp++) {
391 1.1 lukem if (*sp == ',') {
392 1.1 lukem /*
393 1.1 lukem * Encountered a comma, so select the next character property flag
394 1.1 lukem * and reset the number.
395 1.1 lukem */
396 1.1 lukem m |= cclass_flags[n];
397 1.1 lukem n = 0;
398 1.1 lukem } else if (*sp >= '0' && *sp <= '9')
399 1.1 lukem /*
400 1.1 lukem * Encountered a digit, so start or continue building the cardinal
401 1.1 lukem * that represents the character property flag.
402 1.1 lukem */
403 1.1 lukem n = (n * 10) + (*sp - '0');
404 1.1 lukem else
405 1.1 lukem /*
406 1.1 lukem * Encountered something that is not part of the property list.
407 1.1 lukem * Indicate that we are done.
408 1.1 lukem */
409 1.1 lukem break;
410 1.1 lukem
411 1.1 lukem /*
412 1.1 lukem * If a property number greater than 32 occurs, then there is a
413 1.1 lukem * problem. Most likely a missing comma separator.
414 1.1 lukem */
415 1.1 lukem if (n > 32)
416 1.1 lukem b->error = _URE_INVALID_PROPERTY;
417 1.1 lukem }
418 1.1 lukem
419 1.1 lukem if (n != 0)
420 1.1 lukem m |= cclass_flags[n];
421 1.1 lukem
422 1.1 lukem /*
423 1.1 lukem * Set the mask that represents the group of character properties.
424 1.1 lukem */
425 1.1 lukem *mask = m;
426 1.1 lukem
427 1.1 lukem /*
428 1.1 lukem * Return the number of characters consumed.
429 1.1 lukem */
430 1.1 lukem return sp - pp;
431 1.1 lukem }
432 1.1 lukem
433 1.1 lukem /*
434 1.1 lukem * Collect a hex number with 1 to 4 digits and return the number
435 1.1 lukem * of characters used.
436 1.1 lukem */
437 1.1 lukem static unsigned long
438 1.1 lukem _ure_hex(ucs2_t *np, unsigned long limit, ucs4_t *n)
439 1.1 lukem {
440 1.1 lukem ucs2_t i;
441 1.1 lukem ucs2_t *sp, *ep;
442 1.1 lukem ucs4_t nn;
443 1.1 lukem
444 1.1 lukem sp = np;
445 1.1 lukem ep = sp + limit;
446 1.1 lukem
447 1.1 lukem for (nn = 0, i = 0; i < 4 && sp < ep; i++, sp++) {
448 1.1 lukem if (*sp >= '0' && *sp <= '9')
449 1.1 lukem nn = (nn << 4) + (*sp - '0');
450 1.1 lukem else if (*sp >= 'A' && *sp <= 'F')
451 1.1 lukem nn = (nn << 4) + ((*sp - 'A') + 10);
452 1.1 lukem else if (*sp >= 'a' && *sp <= 'f')
453 1.1 lukem nn = (nn << 4) + ((*sp - 'a') + 10);
454 1.1 lukem else
455 1.1 lukem /*
456 1.1 lukem * Encountered something that is not a hex digit.
457 1.1 lukem */
458 1.1 lukem break;
459 1.1 lukem }
460 1.1 lukem
461 1.1 lukem /*
462 1.1 lukem * Assign the character code collected and return the number of
463 1.1 lukem * characters used.
464 1.1 lukem */
465 1.1 lukem *n = nn;
466 1.1 lukem
467 1.1 lukem return sp - np;
468 1.1 lukem }
469 1.1 lukem
470 1.1 lukem /*
471 1.1 lukem * Insert a range into a character class, removing duplicates and ordering
472 1.1 lukem * them in increasing range-start order.
473 1.1 lukem */
474 1.1 lukem static void
475 1.1 lukem _ure_add_range(_ure_ccl_t *ccl, _ure_range_t *r, _ure_buffer_t *b)
476 1.1 lukem {
477 1.1 lukem ucs2_t i;
478 1.1 lukem ucs4_t tmp;
479 1.1 lukem _ure_range_t *rp;
480 1.1 lukem
481 1.1 lukem /*
482 1.1 lukem * If the `casefold' flag is set, then make sure both endpoints of the
483 1.1 lukem * range are converted to lower case.
484 1.1 lukem */
485 1.1 lukem if (b->flags & _URE_DFA_CASEFOLD) {
486 1.1 lukem r->min_code = _ure_tolower(r->min_code);
487 1.1 lukem r->max_code = _ure_tolower(r->max_code);
488 1.1 lukem }
489 1.1 lukem
490 1.1 lukem /*
491 1.1 lukem * Swap the range endpoints if they are not in increasing order.
492 1.1 lukem */
493 1.1 lukem if (r->min_code > r->max_code) {
494 1.1 lukem tmp = r->min_code;
495 1.1 lukem r->min_code = r->max_code;
496 1.1 lukem r->max_code = tmp;
497 1.1 lukem }
498 1.1 lukem
499 1.1 lukem for (i = 0, rp = ccl->ranges;
500 1.1 lukem i < ccl->ranges_used && r->min_code < rp->min_code; i++, rp++) ;
501 1.1 lukem
502 1.1 lukem /*
503 1.1 lukem * Check for a duplicate.
504 1.1 lukem */
505 1.1 lukem if (i < ccl->ranges_used &&
506 1.1 lukem r->min_code == rp->min_code && r->max_code == rp->max_code)
507 1.1 lukem return;
508 1.1 lukem
509 1.1 lukem if (ccl->ranges_used == ccl->ranges_size) {
510 1.1 lukem if (ccl->ranges_size == 0)
511 1.1 lukem ccl->ranges = (_ure_range_t *) malloc(sizeof(_ure_range_t) << 3);
512 1.1 lukem else
513 1.1 lukem ccl->ranges = (_ure_range_t *)
514 1.1 lukem realloc((char *) ccl->ranges,
515 1.1 lukem sizeof(_ure_range_t) * (ccl->ranges_size + 8));
516 1.1 lukem ccl->ranges_size += 8;
517 1.1 lukem }
518 1.1 lukem
519 1.1 lukem rp = ccl->ranges + ccl->ranges_used;
520 1.1 lukem
521 1.1 lukem if (i < ccl->ranges_used)
522 1.1 lukem _ure_memmove((char *) (rp + 1), (char *) rp,
523 1.1 lukem sizeof(_ure_range_t) * (ccl->ranges_used - i));
524 1.1 lukem
525 1.1 lukem ccl->ranges_used++;
526 1.1 lukem rp->min_code = r->min_code;
527 1.1 lukem rp->max_code = r->max_code;
528 1.1 lukem }
529 1.1 lukem
530 1.1 lukem #define _URE_ALPHA_MASK (_URE_UPPER|_URE_LOWER|_URE_OTHERLETTER|\
531 1.1 lukem _URE_MODIFIER|_URE_TITLE|_URE_NONSPACING|_URE_COMBINING)
532 1.1 lukem #define _URE_ALNUM_MASK (_URE_ALPHA_MASK|_URE_NUMDIGIT)
533 1.1 lukem #define _URE_PUNCT_MASK (_URE_DASHPUNCT|_URE_OPENPUNCT|_URE_CLOSEPUNCT|\
534 1.1 lukem _URE_OTHERPUNCT)
535 1.1 lukem #define _URE_GRAPH_MASK (_URE_NUMDIGIT|_URE_NUMOTHER|_URE_ALPHA_MASK|\
536 1.1 lukem _URE_MATHSYM|_URE_CURRENCYSYM|_URE_OTHERSYM)
537 1.1 lukem #define _URE_PRINT_MASK (_URE_GRAPH_MASK|_URE_SPACESEP)
538 1.1 lukem #define _URE_SPACE_MASK (_URE_SPACESEP|_URE_LINESEP|_URE_PARASEP)
539 1.1 lukem
540 1.1 lukem typedef void (*_ure_cclsetup_t)(
541 1.1 lukem _ure_symtab_t *sym,
542 1.1 lukem unsigned long mask,
543 1.1 lukem _ure_buffer_t *b
544 1.1 lukem );
545 1.1 lukem
546 1.1 lukem typedef struct {
547 1.1 lukem ucs2_t key;
548 1.1 lukem unsigned long len;
549 1.1 lukem unsigned long next;
550 1.1 lukem _ure_cclsetup_t func;
551 1.1 lukem unsigned long mask;
552 1.1 lukem } _ure_trie_t;
553 1.1 lukem
554 1.1 lukem static void
555 1.1 lukem _ure_ccl_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b)
556 1.1 lukem {
557 1.1 lukem sym->props |= mask;
558 1.1 lukem }
559 1.1 lukem
560 1.1 lukem static void
561 1.1 lukem _ure_space_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b)
562 1.1 lukem {
563 1.1 lukem _ure_range_t range;
564 1.1 lukem
565 1.1 lukem sym->props |= mask;
566 1.1 lukem
567 1.1 lukem /*
568 1.1 lukem * Add the additional characters needed for handling isspace().
569 1.1 lukem */
570 1.1 lukem range.min_code = range.max_code = '\t';
571 1.1 lukem _ure_add_range(&sym->sym.ccl, &range, b);
572 1.1 lukem range.min_code = range.max_code = '\r';
573 1.1 lukem _ure_add_range(&sym->sym.ccl, &range, b);
574 1.1 lukem range.min_code = range.max_code = '\n';
575 1.1 lukem _ure_add_range(&sym->sym.ccl, &range, b);
576 1.1 lukem range.min_code = range.max_code = '\f';
577 1.1 lukem _ure_add_range(&sym->sym.ccl, &range, b);
578 1.1 lukem range.min_code = range.max_code = 0xfeff;
579 1.1 lukem _ure_add_range(&sym->sym.ccl, &range, b);
580 1.1 lukem }
581 1.1 lukem
582 1.1 lukem static void
583 1.1 lukem _ure_xdigit_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b)
584 1.1 lukem {
585 1.1 lukem _ure_range_t range;
586 1.1 lukem
587 1.1 lukem /*
588 1.1 lukem * Add the additional characters needed for handling isxdigit().
589 1.1 lukem */
590 1.1 lukem range.min_code = '0';
591 1.1 lukem range.max_code = '9';
592 1.1 lukem _ure_add_range(&sym->sym.ccl, &range, b);
593 1.1 lukem range.min_code = 'A';
594 1.1 lukem range.max_code = 'F';
595 1.1 lukem _ure_add_range(&sym->sym.ccl, &range, b);
596 1.1 lukem range.min_code = 'a';
597 1.1 lukem range.max_code = 'f';
598 1.1 lukem _ure_add_range(&sym->sym.ccl, &range, b);
599 1.1 lukem }
600 1.1 lukem
601 1.1 lukem static _ure_trie_t cclass_trie[] = {
602 1.1 lukem {0x003a, 1, 1, 0, 0},
603 1.1 lukem {0x0061, 9, 10, 0, 0},
604 1.1 lukem {0x0063, 8, 19, 0, 0},
605 1.1 lukem {0x0064, 7, 24, 0, 0},
606 1.1 lukem {0x0067, 6, 29, 0, 0},
607 1.1 lukem {0x006c, 5, 34, 0, 0},
608 1.1 lukem {0x0070, 4, 39, 0, 0},
609 1.1 lukem {0x0073, 3, 49, 0, 0},
610 1.1 lukem {0x0075, 2, 54, 0, 0},
611 1.1 lukem {0x0078, 1, 59, 0, 0},
612 1.1 lukem {0x006c, 1, 11, 0, 0},
613 1.1 lukem {0x006e, 2, 13, 0, 0},
614 1.1 lukem {0x0070, 1, 16, 0, 0},
615 1.1 lukem {0x0075, 1, 14, 0, 0},
616 1.1 lukem {0x006d, 1, 15, 0, 0},
617 1.1 lukem {0x003a, 1, 16, _ure_ccl_setup, _URE_ALNUM_MASK},
618 1.1 lukem {0x0068, 1, 17, 0, 0},
619 1.1 lukem {0x0061, 1, 18, 0, 0},
620 1.1 lukem {0x003a, 1, 19, _ure_ccl_setup, _URE_ALPHA_MASK},
621 1.1 lukem {0x006e, 1, 20, 0, 0},
622 1.1 lukem {0x0074, 1, 21, 0, 0},
623 1.1 lukem {0x0072, 1, 22, 0, 0},
624 1.1 lukem {0x006c, 1, 23, 0, 0},
625 1.1 lukem {0x003a, 1, 24, _ure_ccl_setup, _URE_CNTRL},
626 1.1 lukem {0x0069, 1, 25, 0, 0},
627 1.1 lukem {0x0067, 1, 26, 0, 0},
628 1.1 lukem {0x0069, 1, 27, 0, 0},
629 1.1 lukem {0x0074, 1, 28, 0, 0},
630 1.1 lukem {0x003a, 1, 29, _ure_ccl_setup, _URE_NUMDIGIT},
631 1.1 lukem {0x0072, 1, 30, 0, 0},
632 1.1 lukem {0x0061, 1, 31, 0, 0},
633 1.1 lukem {0x0070, 1, 32, 0, 0},
634 1.1 lukem {0x0068, 1, 33, 0, 0},
635 1.1 lukem {0x003a, 1, 34, _ure_ccl_setup, _URE_GRAPH_MASK},
636 1.1 lukem {0x006f, 1, 35, 0, 0},
637 1.1 lukem {0x0077, 1, 36, 0, 0},
638 1.1 lukem {0x0065, 1, 37, 0, 0},
639 1.1 lukem {0x0072, 1, 38, 0, 0},
640 1.1 lukem {0x003a, 1, 39, _ure_ccl_setup, _URE_LOWER},
641 1.1 lukem {0x0072, 2, 41, 0, 0},
642 1.1 lukem {0x0075, 1, 45, 0, 0},
643 1.1 lukem {0x0069, 1, 42, 0, 0},
644 1.1 lukem {0x006e, 1, 43, 0, 0},
645 1.1 lukem {0x0074, 1, 44, 0, 0},
646 1.1 lukem {0x003a, 1, 45, _ure_ccl_setup, _URE_PRINT_MASK},
647 1.1 lukem {0x006e, 1, 46, 0, 0},
648 1.1 lukem {0x0063, 1, 47, 0, 0},
649 1.1 lukem {0x0074, 1, 48, 0, 0},
650 1.1 lukem {0x003a, 1, 49, _ure_ccl_setup, _URE_PUNCT_MASK},
651 1.1 lukem {0x0070, 1, 50, 0, 0},
652 1.1 lukem {0x0061, 1, 51, 0, 0},
653 1.1 lukem {0x0063, 1, 52, 0, 0},
654 1.1 lukem {0x0065, 1, 53, 0, 0},
655 1.1 lukem {0x003a, 1, 54, _ure_space_setup, _URE_SPACE_MASK},
656 1.1 lukem {0x0070, 1, 55, 0, 0},
657 1.1 lukem {0x0070, 1, 56, 0, 0},
658 1.1 lukem {0x0065, 1, 57, 0, 0},
659 1.1 lukem {0x0072, 1, 58, 0, 0},
660 1.1 lukem {0x003a, 1, 59, _ure_ccl_setup, _URE_UPPER},
661 1.1 lukem {0x0064, 1, 60, 0, 0},
662 1.1 lukem {0x0069, 1, 61, 0, 0},
663 1.1 lukem {0x0067, 1, 62, 0, 0},
664 1.1 lukem {0x0069, 1, 63, 0, 0},
665 1.1 lukem {0x0074, 1, 64, 0, 0},
666 1.1 lukem {0x003a, 1, 65, _ure_xdigit_setup, 0},
667 1.1 lukem };
668 1.1 lukem
669 1.1 lukem /*
670 1.1 lukem * Probe for one of the POSIX colon delimited character classes in the static
671 1.1 lukem * trie.
672 1.1 lukem */
673 1.1 lukem static unsigned long
674 1.1 lukem _ure_posix_ccl(ucs2_t *cp, unsigned long limit, _ure_symtab_t *sym,
675 1.1 lukem _ure_buffer_t *b)
676 1.1 lukem {
677 1.1 lukem int i;
678 1.1 lukem unsigned long n;
679 1.1 lukem _ure_trie_t *tp;
680 1.1 lukem ucs2_t *sp, *ep;
681 1.1 lukem
682 1.1 lukem /*
683 1.1 lukem * If the number of characters left is less than 7, then this cannot be
684 1.1 lukem * interpreted as one of the colon delimited classes.
685 1.1 lukem */
686 1.1 lukem if (limit < 7)
687 1.1 lukem return 0;
688 1.1 lukem
689 1.1 lukem sp = cp;
690 1.1 lukem ep = sp + limit;
691 1.1 lukem tp = cclass_trie;
692 1.1 lukem for (i = 0; sp < ep && i < 8; i++, sp++) {
693 1.1 lukem n = tp->len;
694 1.1 lukem
695 1.1 lukem for (; n > 0 && tp->key != *sp; tp++, n--) ;
696 1.1 lukem
697 1.1 lukem if (n == 0)
698 1.1 lukem return 0;
699 1.1 lukem
700 1.1 lukem if (*sp == ':' && (i == 6 || i == 7)) {
701 1.1 lukem sp++;
702 1.1 lukem break;
703 1.1 lukem }
704 1.1 lukem if (sp + 1 < ep)
705 1.1 lukem tp = cclass_trie + tp->next;
706 1.1 lukem }
707 1.1 lukem if (tp->func == 0)
708 1.1 lukem return 0;
709 1.1 lukem
710 1.1 lukem (*tp->func)(sym, tp->mask, b);
711 1.1 lukem
712 1.1 lukem return sp - cp;
713 1.1 lukem }
714 1.1 lukem
715 1.1 lukem /*
716 1.1 lukem * Construct a list of ranges and return the number of characters consumed.
717 1.1 lukem */
718 1.1 lukem static unsigned long
719 1.1 lukem _ure_cclass(ucs2_t *cp, unsigned long limit, _ure_symtab_t *symp,
720 1.1 lukem _ure_buffer_t *b)
721 1.1 lukem {
722 1.1 lukem int range_end;
723 1.1 lukem unsigned long n;
724 1.1 lukem ucs2_t *sp, *ep;
725 1.1 lukem ucs4_t c, last;
726 1.1 lukem _ure_ccl_t *cclp;
727 1.1 lukem _ure_range_t range;
728 1.1 lukem
729 1.1 lukem sp = cp;
730 1.1 lukem ep = sp + limit;
731 1.1 lukem
732 1.1 lukem if (*sp == '^') {
733 1.1 lukem symp->type = _URE_NCCLASS;
734 1.1 lukem sp++;
735 1.1 lukem } else
736 1.1 lukem symp->type = _URE_CCLASS;
737 1.1 lukem
738 1.1 lukem for (last = 0, range_end = 0;
739 1.1 lukem b->error == _URE_OK && sp < ep && *sp != ']'; ) {
740 1.1 lukem c = *sp++;
741 1.1 lukem if (c == '\\') {
742 1.1 lukem if (sp == ep) {
743 1.1 lukem /*
744 1.1 lukem * The EOS was encountered when expecting the reverse solidus
745 1.1 lukem * to be followed by the character it is escaping. Set an
746 1.1 lukem * error code and return the number of characters consumed up
747 1.1 lukem * to this point.
748 1.1 lukem */
749 1.1 lukem b->error = _URE_UNEXPECTED_EOS;
750 1.1 lukem return sp - cp;
751 1.1 lukem }
752 1.1 lukem
753 1.1 lukem c = *sp++;
754 1.1 lukem switch (c) {
755 1.1 lukem case 'a':
756 1.1 lukem c = 0x07;
757 1.1 lukem break;
758 1.1 lukem case 'b':
759 1.1 lukem c = 0x08;
760 1.1 lukem break;
761 1.1 lukem case 'f':
762 1.1 lukem c = 0x0c;
763 1.1 lukem break;
764 1.1 lukem case 'n':
765 1.1 lukem c = 0x0a;
766 1.1 lukem break;
767 1.1 lukem case 'r':
768 1.1 lukem c = 0x0d;
769 1.1 lukem break;
770 1.1 lukem case 't':
771 1.1 lukem c = 0x09;
772 1.1 lukem break;
773 1.1 lukem case 'v':
774 1.1 lukem c = 0x0b;
775 1.1 lukem break;
776 1.1 lukem case 'p':
777 1.1 lukem case 'P':
778 1.1 lukem sp += _ure_prop_list(sp, ep - sp, &symp->props, b);
779 1.1 lukem /*
780 1.1 lukem * Invert the bit mask of the properties if this is a negated
781 1.1 lukem * character class or if 'P' is used to specify a list of
782 1.1 lukem * character properties that should *not* match in a
783 1.1 lukem * character class.
784 1.1 lukem */
785 1.1 lukem if (c == 'P')
786 1.1 lukem symp->props = ~symp->props;
787 1.1 lukem continue;
788 1.1 lukem break;
789 1.1 lukem case 'x':
790 1.1 lukem case 'X':
791 1.1 lukem case 'u':
792 1.1 lukem case 'U':
793 1.1 lukem if (sp < ep &&
794 1.1 lukem ((*sp >= '0' && *sp <= '9') ||
795 1.1 lukem (*sp >= 'A' && *sp <= 'F') ||
796 1.1 lukem (*sp >= 'a' && *sp <= 'f')))
797 1.1 lukem sp += _ure_hex(sp, ep - sp, &c);
798 1.1 lukem }
799 1.1 lukem } else if (c == ':') {
800 1.1 lukem /*
801 1.1 lukem * Probe for a POSIX colon delimited character class.
802 1.1 lukem */
803 1.1 lukem sp--;
804 1.1 lukem if ((n = _ure_posix_ccl(sp, ep - sp, symp, b)) == 0)
805 1.1 lukem sp++;
806 1.1 lukem else {
807 1.1 lukem sp += n;
808 1.1 lukem continue;
809 1.1 lukem }
810 1.1 lukem }
811 1.1 lukem
812 1.1 lukem cclp = &symp->sym.ccl;
813 1.1 lukem
814 1.1 lukem /*
815 1.1 lukem * Check to see if the current character is a low surrogate that needs
816 1.1 lukem * to be combined with a preceding high surrogate.
817 1.1 lukem */
818 1.1 lukem if (last != 0) {
819 1.1 lukem if (c >= 0xdc00 && c <= 0xdfff)
820 1.1 lukem /*
821 1.1 lukem * Construct the UTF16 character code.
822 1.1 lukem */
823 1.1 lukem c = 0x10000 + (((last & 0x03ff) << 10) | (c & 0x03ff));
824 1.1 lukem else {
825 1.1 lukem /*
826 1.1 lukem * Add the isolated high surrogate to the range.
827 1.1 lukem */
828 1.1 lukem if (range_end == 1)
829 1.1 lukem range.max_code = last & 0xffff;
830 1.1 lukem else
831 1.1 lukem range.min_code = range.max_code = last & 0xffff;
832 1.1 lukem
833 1.1 lukem _ure_add_range(cclp, &range, b);
834 1.1 lukem range_end = 0;
835 1.1 lukem }
836 1.1 lukem }
837 1.1 lukem
838 1.1 lukem /*
839 1.1 lukem * Clear the last character code.
840 1.1 lukem */
841 1.1 lukem last = 0;
842 1.1 lukem
843 1.1 lukem /*
844 1.1 lukem * This slightly awkward code handles the different cases needed to
845 1.1 lukem * construct a range.
846 1.1 lukem */
847 1.1 lukem if (c >= 0xd800 && c <= 0xdbff) {
848 1.1 lukem /*
849 1.1 lukem * If the high surrogate is followed by a range indicator, simply
850 1.1 lukem * add it as the range start. Otherwise, save it in case the next
851 1.1 lukem * character is a low surrogate.
852 1.1 lukem */
853 1.1 lukem if (*sp == '-') {
854 1.1 lukem sp++;
855 1.1 lukem range.min_code = c;
856 1.1 lukem range_end = 1;
857 1.1 lukem } else
858 1.1 lukem last = c;
859 1.1 lukem } else if (range_end == 1) {
860 1.1 lukem range.max_code = c;
861 1.1 lukem _ure_add_range(cclp, &range, b);
862 1.1 lukem range_end = 0;
863 1.1 lukem } else {
864 1.1 lukem range.min_code = range.max_code = c;
865 1.1 lukem if (*sp == '-') {
866 1.1 lukem sp++;
867 1.1 lukem range_end = 1;
868 1.1 lukem } else
869 1.1 lukem _ure_add_range(cclp, &range, b);
870 1.1 lukem }
871 1.1 lukem }
872 1.1 lukem
873 1.1 lukem if (sp < ep && *sp == ']')
874 1.1 lukem sp++;
875 1.1 lukem else
876 1.1 lukem /*
877 1.1 lukem * The parse was not terminated by the character class close symbol
878 1.1 lukem * (']'), so set an error code.
879 1.1 lukem */
880 1.1 lukem b->error = _URE_CCLASS_OPEN;
881 1.1 lukem
882 1.1 lukem return sp - cp;
883 1.1 lukem }
884 1.1 lukem
885 1.1 lukem /*
886 1.1 lukem * Probe for a low surrogate hex code.
887 1.1 lukem */
888 1.1 lukem static unsigned long
889 1.1 lukem _ure_probe_ls(ucs2_t *ls, unsigned long limit, ucs4_t *c)
890 1.1 lukem {
891 1.1 lukem ucs4_t i, code;
892 1.1 lukem ucs2_t *sp, *ep;
893 1.1 lukem
894 1.1 lukem for (i = code = 0, sp = ls, ep = sp + limit; i < 4 && sp < ep; sp++) {
895 1.1 lukem if (*sp >= '0' && *sp <= '9')
896 1.1 lukem code = (code << 4) + (*sp - '0');
897 1.1 lukem else if (*sp >= 'A' && *sp <= 'F')
898 1.1 lukem code = (code << 4) + ((*sp - 'A') + 10);
899 1.1 lukem else if (*sp >= 'a' && *sp <= 'f')
900 1.1 lukem code = (code << 4) + ((*sp - 'a') + 10);
901 1.1 lukem else
902 1.1 lukem break;
903 1.1 lukem }
904 1.1 lukem
905 1.1 lukem *c = code;
906 1.1 lukem return (0xdc00 <= code && code <= 0xdfff) ? sp - ls : 0;
907 1.1 lukem }
908 1.1 lukem
909 1.1 lukem static unsigned long
910 1.1 lukem _ure_compile_symbol(ucs2_t *sym, unsigned long limit, _ure_symtab_t *symp,
911 1.1 lukem _ure_buffer_t *b)
912 1.1 lukem {
913 1.1 lukem ucs4_t c;
914 1.1 lukem ucs2_t *sp, *ep;
915 1.1 lukem
916 1.1 lukem sp = sym;
917 1.1 lukem ep = sym + limit;
918 1.1 lukem
919 1.1 lukem if ((c = *sp++) == '\\') {
920 1.1 lukem
921 1.1 lukem if (sp == ep) {
922 1.1 lukem /*
923 1.1 lukem * The EOS was encountered when expecting the reverse solidus to
924 1.1 lukem * be followed by the character it is escaping. Set an error code
925 1.1 lukem * and return the number of characters consumed up to this point.
926 1.1 lukem */
927 1.1 lukem b->error = _URE_UNEXPECTED_EOS;
928 1.1 lukem return sp - sym;
929 1.1 lukem }
930 1.1 lukem
931 1.1 lukem c = *sp++;
932 1.1 lukem switch (c) {
933 1.1 lukem case 'p':
934 1.1 lukem case 'P':
935 1.1 lukem symp->type = (c == 'p') ? _URE_CCLASS : _URE_NCCLASS;
936 1.1 lukem sp += _ure_prop_list(sp, ep - sp, &symp->props, b);
937 1.1 lukem break;
938 1.1 lukem case 'a':
939 1.1 lukem symp->type = _URE_CHAR;
940 1.1 lukem symp->sym.chr = 0x07;
941 1.1 lukem break;
942 1.1 lukem case 'b':
943 1.1 lukem symp->type = _URE_CHAR;
944 1.1 lukem symp->sym.chr = 0x08;
945 1.1 lukem break;
946 1.1 lukem case 'f':
947 1.1 lukem symp->type = _URE_CHAR;
948 1.1 lukem symp->sym.chr = 0x0c;
949 1.1 lukem break;
950 1.1 lukem case 'n':
951 1.1 lukem symp->type = _URE_CHAR;
952 1.1 lukem symp->sym.chr = 0x0a;
953 1.1 lukem break;
954 1.1 lukem case 'r':
955 1.1 lukem symp->type = _URE_CHAR;
956 1.1 lukem symp->sym.chr = 0x0d;
957 1.1 lukem break;
958 1.1 lukem case 't':
959 1.1 lukem symp->type = _URE_CHAR;
960 1.1 lukem symp->sym.chr = 0x09;
961 1.1 lukem break;
962 1.1 lukem case 'v':
963 1.1 lukem symp->type = _URE_CHAR;
964 1.1 lukem symp->sym.chr = 0x0b;
965 1.1 lukem break;
966 1.1 lukem case 'x':
967 1.1 lukem case 'X':
968 1.1 lukem case 'u':
969 1.1 lukem case 'U':
970 1.1 lukem /*
971 1.1 lukem * Collect between 1 and 4 digits representing a UCS2 code. Fall
972 1.1 lukem * through to the next case.
973 1.1 lukem */
974 1.1 lukem if (sp < ep &&
975 1.1 lukem ((*sp >= '0' && *sp <= '9') ||
976 1.1 lukem (*sp >= 'A' && *sp <= 'F') ||
977 1.1 lukem (*sp >= 'a' && *sp <= 'f')))
978 1.1 lukem sp += _ure_hex(sp, ep - sp, &c);
979 1.1 lukem /* FALLTHROUGH */
980 1.1 lukem default:
981 1.1 lukem /*
982 1.1 lukem * Simply add an escaped character here.
983 1.1 lukem */
984 1.1 lukem symp->type = _URE_CHAR;
985 1.1 lukem symp->sym.chr = c;
986 1.1 lukem }
987 1.1 lukem } else if (c == '^' || c == '$')
988 1.1 lukem /*
989 1.1 lukem * Handle the BOL and EOL anchors. This actually consists simply of
990 1.1 lukem * setting a flag that indicates that the user supplied anchor match
991 1.1 lukem * function should be called. This needs to be done instead of simply
992 1.1 lukem * matching line/paragraph separators because beginning-of-text and
993 1.1 lukem * end-of-text tests are needed as well.
994 1.1 lukem */
995 1.1 lukem symp->type = (c == '^') ? _URE_BOL_ANCHOR : _URE_EOL_ANCHOR;
996 1.1 lukem else if (c == '[')
997 1.1 lukem /*
998 1.1 lukem * Construct a character class.
999 1.1 lukem */
1000 1.1 lukem sp += _ure_cclass(sp, ep - sp, symp, b);
1001 1.1 lukem else if (c == '.')
1002 1.1 lukem symp->type = _URE_ANY_CHAR;
1003 1.1 lukem else {
1004 1.1 lukem symp->type = _URE_CHAR;
1005 1.1 lukem symp->sym.chr = c;
1006 1.1 lukem }
1007 1.1 lukem
1008 1.1 lukem /*
1009 1.1 lukem * If the symbol type happens to be a character and is a high surrogate,
1010 1.1 lukem * then probe forward to see if it is followed by a low surrogate that
1011 1.1 lukem * needs to be added.
1012 1.1 lukem */
1013 1.1 lukem if (sp < ep && symp->type == _URE_CHAR &&
1014 1.1 lukem 0xd800 <= symp->sym.chr && symp->sym.chr <= 0xdbff) {
1015 1.1 lukem
1016 1.1 lukem if (0xdc00 <= *sp && *sp <= 0xdfff) {
1017 1.1 lukem symp->sym.chr = 0x10000 + (((symp->sym.chr & 0x03ff) << 10) |
1018 1.1 lukem (*sp & 0x03ff));
1019 1.1 lukem sp++;
1020 1.1 lukem } else if (*sp == '\\' && (*(sp + 1) == 'x' || *(sp + 1) == 'X' ||
1021 1.1 lukem *(sp + 1) == 'u' || *(sp + 1) == 'U')) {
1022 1.1 lukem sp += _ure_probe_ls(sp + 2, ep - (sp + 2), &c);
1023 1.1 lukem if (0xdc00 <= c && c <= 0xdfff) {
1024 1.1 lukem /*
1025 1.1 lukem * Take into account the \[xu] in front of the hex code.
1026 1.1 lukem */
1027 1.1 lukem sp += 2;
1028 1.1 lukem symp->sym.chr = 0x10000 + (((symp->sym.chr & 0x03ff) << 10) |
1029 1.1 lukem (c & 0x03ff));
1030 1.1 lukem }
1031 1.1 lukem }
1032 1.1 lukem }
1033 1.1 lukem
1034 1.1 lukem /*
1035 1.1 lukem * Last, make sure any _URE_CHAR type symbols are changed to lower case if
1036 1.1 lukem * the `casefold' flag is set.
1037 1.1 lukem */
1038 1.1 lukem if ((b->flags & _URE_DFA_CASEFOLD) && symp->type == _URE_CHAR)
1039 1.1 lukem symp->sym.chr = _ure_tolower(symp->sym.chr);
1040 1.1 lukem
1041 1.1 lukem /*
1042 1.1 lukem * If the symbol constructed is anything other than one of the anchors,
1043 1.1 lukem * make sure the _URE_DFA_BLANKLINE flag is removed.
1044 1.1 lukem */
1045 1.1 lukem if (symp->type != _URE_BOL_ANCHOR && symp->type != _URE_EOL_ANCHOR)
1046 1.1 lukem b->flags &= ~_URE_DFA_BLANKLINE;
1047 1.1 lukem
1048 1.1 lukem /*
1049 1.1 lukem * Return the number of characters consumed.
1050 1.1 lukem */
1051 1.1 lukem return sp - sym;
1052 1.1 lukem }
1053 1.1 lukem
1054 1.1 lukem static int
1055 1.1 lukem _ure_sym_neq(_ure_symtab_t *a, _ure_symtab_t *b)
1056 1.1 lukem {
1057 1.1 lukem if (a->type != b->type || a->mods != b->mods || a->props != b->props)
1058 1.1 lukem return 1;
1059 1.1 lukem
1060 1.1 lukem if (a->type == _URE_CCLASS || a->type == _URE_NCCLASS) {
1061 1.1 lukem if (a->sym.ccl.ranges_used != b->sym.ccl.ranges_used)
1062 1.1 lukem return 1;
1063 1.1 lukem if (a->sym.ccl.ranges_used > 0 &&
1064 1.1 lukem memcmp((char *) a->sym.ccl.ranges, (char *) b->sym.ccl.ranges,
1065 1.1 lukem sizeof(_ure_range_t) * a->sym.ccl.ranges_used) != 0)
1066 1.1 lukem return 1;
1067 1.1 lukem } else if (a->type == _URE_CHAR && a->sym.chr != b->sym.chr)
1068 1.1 lukem return 1;
1069 1.1 lukem return 0;
1070 1.1 lukem }
1071 1.1 lukem
1072 1.1 lukem /*
1073 1.1 lukem * Construct a symbol, but only keep unique symbols.
1074 1.1 lukem */
1075 1.1 lukem static ucs2_t
1076 1.1 lukem _ure_make_symbol(ucs2_t *sym, unsigned long limit, unsigned long *consumed,
1077 1.1 lukem _ure_buffer_t *b)
1078 1.1 lukem {
1079 1.1 lukem ucs2_t i;
1080 1.1 lukem _ure_symtab_t *sp, symbol;
1081 1.1 lukem
1082 1.1 lukem /*
1083 1.1 lukem * Build the next symbol so we can test to see if it is already in the
1084 1.1 lukem * symbol table.
1085 1.1 lukem */
1086 1.1 lukem (void) memset((char *) &symbol, '\0', sizeof(_ure_symtab_t));
1087 1.1 lukem *consumed = _ure_compile_symbol(sym, limit, &symbol, b);
1088 1.1 lukem
1089 1.1 lukem /*
1090 1.1 lukem * Check to see if the symbol exists.
1091 1.1 lukem */
1092 1.1 lukem for (i = 0, sp = b->symtab;
1093 1.1 lukem i < b->symtab_used && _ure_sym_neq(&symbol, sp); i++, sp++) ;
1094 1.1 lukem
1095 1.1 lukem if (i < b->symtab_used) {
1096 1.1 lukem /*
1097 1.1 lukem * Free up any ranges used for the symbol.
1098 1.1 lukem */
1099 1.1 lukem if ((symbol.type == _URE_CCLASS || symbol.type == _URE_NCCLASS) &&
1100 1.1 lukem symbol.sym.ccl.ranges_size > 0)
1101 1.1 lukem free((char *) symbol.sym.ccl.ranges);
1102 1.1 lukem
1103 1.1 lukem return b->symtab[i].id;
1104 1.1 lukem }
1105 1.1 lukem
1106 1.1 lukem /*
1107 1.1 lukem * Need to add the new symbol.
1108 1.1 lukem */
1109 1.1 lukem if (b->symtab_used == b->symtab_size) {
1110 1.1 lukem if (b->symtab_size == 0)
1111 1.1 lukem b->symtab = (_ure_symtab_t *) malloc(sizeof(_ure_symtab_t) << 3);
1112 1.1 lukem else
1113 1.1 lukem b->symtab = (_ure_symtab_t *)
1114 1.1 lukem realloc((char *) b->symtab,
1115 1.1 lukem sizeof(_ure_symtab_t) * (b->symtab_size + 8));
1116 1.1 lukem sp = b->symtab + b->symtab_size;
1117 1.1 lukem (void) memset((char *) sp, '\0', sizeof(_ure_symtab_t) << 3);
1118 1.1 lukem b->symtab_size += 8;
1119 1.1 lukem }
1120 1.1 lukem
1121 1.1 lukem symbol.id = b->symtab_used++;
1122 1.1 lukem (void) AC_MEMCPY((char *) &b->symtab[symbol.id], (char *) &symbol,
1123 1.1 lukem sizeof(_ure_symtab_t));
1124 1.1 lukem
1125 1.1 lukem return symbol.id;
1126 1.1 lukem }
1127 1.1 lukem
1128 1.1 lukem /*************************************************************************
1129 1.1 lukem *
1130 1.1 lukem * End symbol parse functions.
1131 1.1 lukem *
1132 1.1 lukem *************************************************************************/
1133 1.1 lukem
1134 1.1 lukem static ucs2_t
1135 1.1 lukem _ure_make_expr(ucs2_t type, ucs2_t lhs, ucs2_t rhs, _ure_buffer_t *b)
1136 1.1 lukem {
1137 1.1 lukem ucs2_t i;
1138 1.1 lukem
1139 1.1 lukem if (b == 0)
1140 1.1 lukem return _URE_NOOP;
1141 1.1 lukem
1142 1.1 lukem /*
1143 1.1 lukem * Determine if the expression already exists or not.
1144 1.1 lukem */
1145 1.1 lukem for (i = 0; i < b->expr_used; i++) {
1146 1.1 lukem if (b->expr[i].type == type && b->expr[i].lhs == lhs &&
1147 1.1 lukem b->expr[i].rhs == rhs)
1148 1.1 lukem break;
1149 1.1 lukem }
1150 1.1 lukem if (i < b->expr_used)
1151 1.1 lukem return i;
1152 1.1 lukem
1153 1.1 lukem /*
1154 1.1 lukem * Need to add a new expression.
1155 1.1 lukem */
1156 1.1 lukem if (b->expr_used == b->expr_size) {
1157 1.1 lukem if (b->expr_size == 0)
1158 1.1 lukem b->expr = (_ure_elt_t *) malloc(sizeof(_ure_elt_t) << 3);
1159 1.1 lukem else
1160 1.1 lukem b->expr = (_ure_elt_t *)
1161 1.1 lukem realloc((char *) b->expr,
1162 1.1 lukem sizeof(_ure_elt_t) * (b->expr_size + 8));
1163 1.1 lukem b->expr_size += 8;
1164 1.1 lukem }
1165 1.1 lukem
1166 1.1 lukem b->expr[b->expr_used].onstack = 0;
1167 1.1 lukem b->expr[b->expr_used].type = type;
1168 1.1 lukem b->expr[b->expr_used].lhs = lhs;
1169 1.1 lukem b->expr[b->expr_used].rhs = rhs;
1170 1.1 lukem
1171 1.1 lukem return b->expr_used++;
1172 1.1 lukem }
1173 1.1 lukem
1174 1.1 lukem static unsigned char spmap[] = {
1175 1.1 lukem 0x00, 0x00, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00,
1176 1.1 lukem 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1177 1.1 lukem 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1178 1.1 lukem };
1179 1.1 lukem
1180 1.1 lukem #define _ure_isspecial(cc) ((cc) > 0x20 && (cc) < 0x7f && \
1181 1.1 lukem (spmap[(cc) >> 3] & (1 << ((cc) & 7))))
1182 1.1 lukem
1183 1.1 lukem /*
1184 1.1 lukem * Convert the regular expression into an NFA in a form that will be easy to
1185 1.1 lukem * reduce to a DFA. The starting state for the reduction will be returned.
1186 1.1 lukem */
1187 1.1 lukem static ucs2_t
1188 1.1 lukem _ure_re2nfa(ucs2_t *re, unsigned long relen, _ure_buffer_t *b)
1189 1.1 lukem {
1190 1.1 lukem ucs2_t c, state, top, sym, *sp, *ep;
1191 1.1 lukem unsigned long used;
1192 1.1 lukem
1193 1.1 lukem state = _URE_NOOP;
1194 1.1 lukem
1195 1.1 lukem sp = re;
1196 1.1 lukem ep = sp + relen;
1197 1.1 lukem while (b->error == _URE_OK && sp < ep) {
1198 1.1 lukem c = *sp++;
1199 1.1 lukem switch (c) {
1200 1.1 lukem case '(':
1201 1.1 lukem _ure_push(_URE_PAREN, b);
1202 1.1 lukem break;
1203 1.1 lukem case ')':
1204 1.1 lukem /*
1205 1.1 lukem * Check for the case of too many close parentheses.
1206 1.1 lukem */
1207 1.1 lukem if (_ure_peek(b) == _URE_NOOP) {
1208 1.1 lukem b->error = _URE_UNBALANCED_GROUP;
1209 1.1 lukem break;
1210 1.1 lukem }
1211 1.1 lukem
1212 1.1 lukem while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR)
1213 1.1 lukem /*
1214 1.1 lukem * Make an expression with the AND or OR operator and its right
1215 1.1 lukem * hand side.
1216 1.1 lukem */
1217 1.1 lukem state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b);
1218 1.1 lukem
1219 1.1 lukem /*
1220 1.1 lukem * Remove the _URE_PAREN off the stack.
1221 1.1 lukem */
1222 1.1 lukem (void) _ure_pop(b);
1223 1.1 lukem break;
1224 1.1 lukem case '*':
1225 1.1 lukem state = _ure_make_expr(_URE_STAR, state, _URE_NOOP, b);
1226 1.1 lukem break;
1227 1.1 lukem case '+':
1228 1.1 lukem state = _ure_make_expr(_URE_PLUS, state, _URE_NOOP, b);
1229 1.1 lukem break;
1230 1.1 lukem case '?':
1231 1.1 lukem state = _ure_make_expr(_URE_QUEST, state, _URE_NOOP, b);
1232 1.1 lukem break;
1233 1.1 lukem case '|':
1234 1.1 lukem while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR)
1235 1.1 lukem /*
1236 1.1 lukem * Make an expression with the AND or OR operator and its right
1237 1.1 lukem * hand side.
1238 1.1 lukem */
1239 1.1 lukem state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b);
1240 1.1 lukem
1241 1.1 lukem _ure_push(state, b);
1242 1.1 lukem _ure_push(_URE_OR, b);
1243 1.1 lukem break;
1244 1.1 lukem default:
1245 1.1 lukem sp--;
1246 1.1 lukem sym = _ure_make_symbol(sp, ep - sp, &used, b);
1247 1.1 lukem sp += used;
1248 1.1 lukem state = _ure_make_expr(_URE_SYMBOL, sym, _URE_NOOP, b);
1249 1.1 lukem break;
1250 1.1 lukem }
1251 1.1 lukem
1252 1.1 lukem if (c != '(' && c != '|' && sp < ep &&
1253 1.1 lukem (!_ure_isspecial(*sp) || *sp == '(')) {
1254 1.1 lukem _ure_push(state, b);
1255 1.1 lukem _ure_push(_URE_AND, b);
1256 1.1 lukem }
1257 1.1 lukem }
1258 1.1 lukem while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR)
1259 1.1 lukem /*
1260 1.1 lukem * Make an expression with the AND or OR operator and its right
1261 1.1 lukem * hand side.
1262 1.1 lukem */
1263 1.1 lukem state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b);
1264 1.1 lukem
1265 1.1 lukem if (b->stack.slist_used > 0)
1266 1.1 lukem b->error = _URE_UNBALANCED_GROUP;
1267 1.1 lukem
1268 1.1 lukem return (b->error == _URE_OK) ? state : _URE_NOOP;
1269 1.1 lukem }
1270 1.1 lukem
1271 1.1 lukem static void
1272 1.1 lukem _ure_add_symstate(ucs2_t sym, ucs2_t state, _ure_buffer_t *b)
1273 1.1 lukem {
1274 1.1 lukem ucs2_t i, *stp;
1275 1.1 lukem _ure_symtab_t *sp;
1276 1.1 lukem
1277 1.1 lukem /*
1278 1.1 lukem * Locate the symbol in the symbol table so the state can be added.
1279 1.1 lukem * If the symbol doesn't exist, then a real problem exists.
1280 1.1 lukem */
1281 1.1 lukem for (i = 0, sp = b->symtab; i < b->symtab_used && sym != sp->id;
1282 1.1 lukem i++, sp++) ;
1283 1.1 lukem
1284 1.1 lukem /*
1285 1.1 lukem * Now find out if the state exists in the symbol's state list.
1286 1.1 lukem */
1287 1.1 lukem for (i = 0, stp = sp->states.slist;
1288 1.1 lukem i < sp->states.slist_used && state > *stp; i++, stp++) ;
1289 1.1 lukem
1290 1.1 lukem if (i == sp->states.slist_used || state < *stp) {
1291 1.1 lukem /*
1292 1.1 lukem * Need to add the state in order.
1293 1.1 lukem */
1294 1.1 lukem if (sp->states.slist_used == sp->states.slist_size) {
1295 1.1 lukem if (sp->states.slist_size == 0)
1296 1.1 lukem sp->states.slist = (ucs2_t *) malloc(sizeof(ucs2_t) << 3);
1297 1.1 lukem else
1298 1.1 lukem sp->states.slist = (ucs2_t *)
1299 1.1 lukem realloc((char *) sp->states.slist,
1300 1.1 lukem sizeof(ucs2_t) * (sp->states.slist_size + 8));
1301 1.1 lukem sp->states.slist_size += 8;
1302 1.1 lukem }
1303 1.1 lukem if (i < sp->states.slist_used)
1304 1.1 lukem (void) _ure_memmove((char *) (sp->states.slist + i + 1),
1305 1.1 lukem (char *) (sp->states.slist + i),
1306 1.1 lukem sizeof(ucs2_t) * (sp->states.slist_used - i));
1307 1.1 lukem sp->states.slist[i] = state;
1308 1.1 lukem sp->states.slist_used++;
1309 1.1 lukem }
1310 1.1 lukem }
1311 1.1 lukem
1312 1.1 lukem static ucs2_t
1313 1.1 lukem _ure_add_state(ucs2_t nstates, ucs2_t *states, _ure_buffer_t *b)
1314 1.1 lukem {
1315 1.1 lukem ucs2_t i;
1316 1.1 lukem _ure_state_t *sp;
1317 1.1 lukem
1318 1.1 lukem for (i = 0, sp = b->states.states; i < b->states.states_used; i++, sp++) {
1319 1.1 lukem if (sp->st.slist_used == nstates &&
1320 1.1 lukem memcmp((char *) states, (char *) sp->st.slist,
1321 1.1 lukem sizeof(ucs2_t) * nstates) == 0)
1322 1.1 lukem break;
1323 1.1 lukem }
1324 1.1 lukem
1325 1.1 lukem if (i == b->states.states_used) {
1326 1.1 lukem /*
1327 1.1 lukem * Need to add a new DFA state (set of NFA states).
1328 1.1 lukem */
1329 1.1 lukem if (b->states.states_used == b->states.states_size) {
1330 1.1 lukem if (b->states.states_size == 0)
1331 1.1 lukem b->states.states = (_ure_state_t *)
1332 1.1 lukem malloc(sizeof(_ure_state_t) << 3);
1333 1.1 lukem else
1334 1.1 lukem b->states.states = (_ure_state_t *)
1335 1.1 lukem realloc((char *) b->states.states,
1336 1.1 lukem sizeof(_ure_state_t) * (b->states.states_size + 8));
1337 1.1 lukem sp = b->states.states + b->states.states_size;
1338 1.1 lukem (void) memset((char *) sp, '\0', sizeof(_ure_state_t) << 3);
1339 1.1 lukem b->states.states_size += 8;
1340 1.1 lukem }
1341 1.1 lukem
1342 1.1 lukem sp = b->states.states + b->states.states_used++;
1343 1.1 lukem sp->id = i;
1344 1.1 lukem
1345 1.1 lukem if (sp->st.slist_used + nstates > sp->st.slist_size) {
1346 1.1 lukem if (sp->st.slist_size == 0)
1347 1.1 lukem sp->st.slist = (ucs2_t *)
1348 1.1 lukem malloc(sizeof(ucs2_t) * (sp->st.slist_used + nstates));
1349 1.1 lukem else
1350 1.1 lukem sp->st.slist = (ucs2_t *)
1351 1.1 lukem realloc((char *) sp->st.slist,
1352 1.1 lukem sizeof(ucs2_t) * (sp->st.slist_used + nstates));
1353 1.1 lukem sp->st.slist_size = sp->st.slist_used + nstates;
1354 1.1 lukem }
1355 1.1 lukem sp->st.slist_used = nstates;
1356 1.1 lukem (void) AC_MEMCPY((char *) sp->st.slist, (char *) states,
1357 1.1 lukem sizeof(ucs2_t) * nstates);
1358 1.1 lukem }
1359 1.1 lukem
1360 1.1 lukem /*
1361 1.1 lukem * Return the ID of the DFA state representing a group of NFA states.
1362 1.1 lukem */
1363 1.1 lukem return i;
1364 1.1 lukem }
1365 1.1 lukem
1366 1.1 lukem static void
1367 1.1 lukem _ure_reduce(ucs2_t start, _ure_buffer_t *b)
1368 1.1 lukem {
1369 1.1 lukem ucs2_t i, j, state, eval, syms, rhs;
1370 1.1 lukem ucs2_t s1, s2, ns1, ns2;
1371 1.1 lukem _ure_state_t *sp;
1372 1.1 lukem _ure_symtab_t *smp;
1373 1.1 lukem
1374 1.1 lukem b->reducing = 1;
1375 1.1 lukem
1376 1.1 lukem /*
1377 1.1 lukem * Add the starting state for the reduction.
1378 1.1 lukem */
1379 1.1 lukem _ure_add_state(1, &start, b);
1380 1.1 lukem
1381 1.1 lukem /*
1382 1.1 lukem * Process each set of NFA states that get created.
1383 1.1 lukem */
1384 1.1 lukem for (i = 0; i < b->states.states_used; i++) {
1385 1.1 lukem sp = b->states.states + i;
1386 1.1 lukem
1387 1.1 lukem /*
1388 1.1 lukem * Push the current states on the stack.
1389 1.1 lukem */
1390 1.1 lukem for (j = 0; j < sp->st.slist_used; j++)
1391 1.1 lukem _ure_push(sp->st.slist[j], b);
1392 1.1 lukem
1393 1.1 lukem /*
1394 1.1 lukem * Reduce the NFA states.
1395 1.1 lukem */
1396 1.1 lukem for (j = sp->accepting = syms = 0; j < b->stack.slist_used; j++) {
1397 1.1 lukem state = b->stack.slist[j];
1398 1.1 lukem eval = 1;
1399 1.1 lukem
1400 1.1 lukem /*
1401 1.1 lukem * This inner loop is the iterative equivalent of recursively
1402 1.1 lukem * reducing subexpressions generated as a result of a reduction.
1403 1.1 lukem */
1404 1.1 lukem while (eval) {
1405 1.1 lukem switch (b->expr[state].type) {
1406 1.1 lukem case _URE_SYMBOL:
1407 1.1 lukem ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b);
1408 1.1 lukem _ure_add_symstate(b->expr[state].lhs, ns1, b);
1409 1.1 lukem syms++;
1410 1.1 lukem eval = 0;
1411 1.1 lukem break;
1412 1.1 lukem case _URE_ONE:
1413 1.1 lukem sp->accepting = 1;
1414 1.1 lukem eval = 0;
1415 1.1 lukem break;
1416 1.1 lukem case _URE_QUEST:
1417 1.1 lukem s1 = b->expr[state].lhs;
1418 1.1 lukem ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b);
1419 1.1 lukem state = _ure_make_expr(_URE_OR, ns1, s1, b);
1420 1.1 lukem break;
1421 1.1 lukem case _URE_PLUS:
1422 1.1 lukem s1 = b->expr[state].lhs;
1423 1.1 lukem ns1 = _ure_make_expr(_URE_STAR, s1, _URE_NOOP, b);
1424 1.1 lukem state = _ure_make_expr(_URE_AND, s1, ns1, b);
1425 1.1 lukem break;
1426 1.1 lukem case _URE_STAR:
1427 1.1 lukem s1 = b->expr[state].lhs;
1428 1.1 lukem ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b);
1429 1.1 lukem ns2 = _ure_make_expr(_URE_PLUS, s1, _URE_NOOP, b);
1430 1.1 lukem state = _ure_make_expr(_URE_OR, ns1, ns2, b);
1431 1.1 lukem break;
1432 1.1 lukem case _URE_OR:
1433 1.1 lukem s1 = b->expr[state].lhs;
1434 1.1 lukem s2 = b->expr[state].rhs;
1435 1.1 lukem _ure_push(s1, b);
1436 1.1 lukem _ure_push(s2, b);
1437 1.1 lukem eval = 0;
1438 1.1 lukem break;
1439 1.1 lukem case _URE_AND:
1440 1.1 lukem s1 = b->expr[state].lhs;
1441 1.1 lukem s2 = b->expr[state].rhs;
1442 1.1 lukem switch (b->expr[s1].type) {
1443 1.1 lukem case _URE_SYMBOL:
1444 1.1 lukem _ure_add_symstate(b->expr[s1].lhs, s2, b);
1445 1.1 lukem syms++;
1446 1.1 lukem eval = 0;
1447 1.1 lukem break;
1448 1.1 lukem case _URE_ONE:
1449 1.1 lukem state = s2;
1450 1.1 lukem break;
1451 1.1 lukem case _URE_QUEST:
1452 1.1 lukem ns1 = b->expr[s1].lhs;
1453 1.1 lukem ns2 = _ure_make_expr(_URE_AND, ns1, s2, b);
1454 1.1 lukem state = _ure_make_expr(_URE_OR, s2, ns2, b);
1455 1.1 lukem break;
1456 1.1 lukem case _URE_PLUS:
1457 1.1 lukem ns1 = b->expr[s1].lhs;
1458 1.1 lukem ns2 = _ure_make_expr(_URE_OR, s2, state, b);
1459 1.1 lukem state = _ure_make_expr(_URE_AND, ns1, ns2, b);
1460 1.1 lukem break;
1461 1.1 lukem case _URE_STAR:
1462 1.1 lukem ns1 = b->expr[s1].lhs;
1463 1.1 lukem ns2 = _ure_make_expr(_URE_AND, ns1, state, b);
1464 1.1 lukem state = _ure_make_expr(_URE_OR, s2, ns2, b);
1465 1.1 lukem break;
1466 1.1 lukem case _URE_OR:
1467 1.1 lukem ns1 = b->expr[s1].lhs;
1468 1.1 lukem ns2 = b->expr[s1].rhs;
1469 1.1 lukem ns1 = _ure_make_expr(_URE_AND, ns1, s2, b);
1470 1.1 lukem ns2 = _ure_make_expr(_URE_AND, ns2, s2, b);
1471 1.1 lukem state = _ure_make_expr(_URE_OR, ns1, ns2, b);
1472 1.1 lukem break;
1473 1.1 lukem case _URE_AND:
1474 1.1 lukem ns1 = b->expr[s1].lhs;
1475 1.1 lukem ns2 = b->expr[s1].rhs;
1476 1.1 lukem ns2 = _ure_make_expr(_URE_AND, ns2, s2, b);
1477 1.1 lukem state = _ure_make_expr(_URE_AND, ns1, ns2, b);
1478 1.1 lukem break;
1479 1.1 lukem }
1480 1.1 lukem }
1481 1.1 lukem }
1482 1.1 lukem }
1483 1.1 lukem
1484 1.1 lukem /*
1485 1.1 lukem * Clear the state stack.
1486 1.1 lukem */
1487 1.1 lukem while (_ure_pop(b) != _URE_NOOP) ;
1488 1.1 lukem
1489 1.1 lukem /*
1490 1.1 lukem * Reset the state pointer because the reduction may have moved it
1491 1.1 lukem * during a reallocation.
1492 1.1 lukem */
1493 1.1 lukem sp = b->states.states + i;
1494 1.1 lukem
1495 1.1 lukem /*
1496 1.1 lukem * Generate the DFA states for the symbols collected during the
1497 1.1 lukem * current reduction.
1498 1.1 lukem */
1499 1.1 lukem if (sp->trans_used + syms > sp->trans_size) {
1500 1.1 lukem if (sp->trans_size == 0)
1501 1.1 lukem sp->trans = (_ure_elt_t *)
1502 1.1 lukem malloc(sizeof(_ure_elt_t) * (sp->trans_used + syms));
1503 1.1 lukem else
1504 1.1 lukem sp->trans = (_ure_elt_t *)
1505 1.1 lukem realloc((char *) sp->trans,
1506 1.1 lukem sizeof(_ure_elt_t) * (sp->trans_used + syms));
1507 1.1 lukem sp->trans_size = sp->trans_used + syms;
1508 1.1 lukem }
1509 1.1 lukem
1510 1.1 lukem /*
1511 1.1 lukem * Go through the symbol table and generate the DFA state transitions
1512 1.1 lukem * for each symbol that has collected NFA states.
1513 1.1 lukem */
1514 1.1 lukem for (j = syms = 0, smp = b->symtab; j < b->symtab_used; j++, smp++) {
1515 1.1 lukem sp = b->states.states + i;
1516 1.1 lukem
1517 1.1 lukem if (smp->states.slist_used > 0) {
1518 1.1 lukem sp->trans[syms].lhs = smp->id;
1519 1.1 lukem rhs = _ure_add_state(smp->states.slist_used,
1520 1.1 lukem smp->states.slist, b);
1521 1.1 lukem /*
1522 1.1 lukem * Reset the state pointer in case the reallocation moves it
1523 1.1 lukem * in memory.
1524 1.1 lukem */
1525 1.1 lukem sp = b->states.states + i;
1526 1.1 lukem sp->trans[syms].rhs = rhs;
1527 1.1 lukem
1528 1.1 lukem smp->states.slist_used = 0;
1529 1.1 lukem syms++;
1530 1.1 lukem }
1531 1.1 lukem }
1532 1.1 lukem
1533 1.1 lukem /*
1534 1.1 lukem * Set the number of transitions actually used.
1535 1.1 lukem */
1536 1.1 lukem sp->trans_used = syms;
1537 1.1 lukem }
1538 1.1 lukem b->reducing = 0;
1539 1.1 lukem }
1540 1.1 lukem
1541 1.1 lukem static void
1542 1.1 lukem _ure_add_equiv(ucs2_t l, ucs2_t r, _ure_buffer_t *b)
1543 1.1 lukem {
1544 1.1 lukem ucs2_t tmp;
1545 1.1 lukem
1546 1.1 lukem l = b->states.states[l].id;
1547 1.1 lukem r = b->states.states[r].id;
1548 1.1 lukem
1549 1.1 lukem if (l == r)
1550 1.1 lukem return;
1551 1.1 lukem
1552 1.1 lukem if (l > r) {
1553 1.1 lukem tmp = l;
1554 1.1 lukem l = r;
1555 1.1 lukem r = tmp;
1556 1.1 lukem }
1557 1.1 lukem
1558 1.1 lukem /*
1559 1.1 lukem * Check to see if the equivalence pair already exists.
1560 1.1 lukem */
1561 1.1 lukem for (tmp = 0; tmp < b->equiv_used &&
1562 1.1 lukem (b->equiv[tmp].l != l || b->equiv[tmp].r != r);
1563 1.1 lukem tmp++) ;
1564 1.1 lukem
1565 1.1 lukem if (tmp < b->equiv_used)
1566 1.1 lukem return;
1567 1.1 lukem
1568 1.1 lukem if (b->equiv_used == b->equiv_size) {
1569 1.1 lukem if (b->equiv_size == 0)
1570 1.1 lukem b->equiv = (_ure_equiv_t *) malloc(sizeof(_ure_equiv_t) << 3);
1571 1.1 lukem else
1572 1.1 lukem b->equiv = (_ure_equiv_t *) realloc((char *) b->equiv,
1573 1.1 lukem sizeof(_ure_equiv_t) *
1574 1.1 lukem (b->equiv_size + 8));
1575 1.1 lukem b->equiv_size += 8;
1576 1.1 lukem }
1577 1.1 lukem b->equiv[b->equiv_used].l = l;
1578 1.1 lukem b->equiv[b->equiv_used].r = r;
1579 1.1 lukem b->equiv_used++;
1580 1.1 lukem }
1581 1.1 lukem
1582 1.1 lukem /*
1583 1.1 lukem * Merge the DFA states that are equivalent.
1584 1.1 lukem */
1585 1.1 lukem static void
1586 1.1 lukem _ure_merge_equiv(_ure_buffer_t *b)
1587 1.1 lukem {
1588 1.1 lukem ucs2_t i, j, k, eq, done;
1589 1.1 lukem _ure_state_t *sp1, *sp2, *ls, *rs;
1590 1.1 lukem
1591 1.1 lukem for (i = 0; i < b->states.states_used; i++) {
1592 1.1 lukem sp1 = b->states.states + i;
1593 1.1 lukem if (sp1->id != i)
1594 1.1 lukem continue;
1595 1.1 lukem for (j = 0; j < i; j++) {
1596 1.1 lukem sp2 = b->states.states + j;
1597 1.1 lukem if (sp2->id != j)
1598 1.1 lukem continue;
1599 1.1 lukem b->equiv_used = 0;
1600 1.1 lukem _ure_add_equiv(i, j, b);
1601 1.1 lukem for (eq = 0, done = 0; eq < b->equiv_used; eq++) {
1602 1.1 lukem ls = b->states.states + b->equiv[eq].l;
1603 1.1 lukem rs = b->states.states + b->equiv[eq].r;
1604 1.1 lukem if (ls->accepting != rs->accepting ||
1605 1.1 lukem ls->trans_used != rs->trans_used) {
1606 1.1 lukem done = 1;
1607 1.1 lukem break;
1608 1.1 lukem }
1609 1.1 lukem for (k = 0; k < ls->trans_used &&
1610 1.1 lukem ls->trans[k].lhs == rs->trans[k].lhs; k++) ;
1611 1.1 lukem if (k < ls->trans_used) {
1612 1.1 lukem done = 1;
1613 1.1 lukem break;
1614 1.1 lukem }
1615 1.1 lukem
1616 1.1 lukem for (k = 0; k < ls->trans_used; k++)
1617 1.1 lukem _ure_add_equiv(ls->trans[k].rhs, rs->trans[k].rhs, b);
1618 1.1 lukem }
1619 1.1 lukem if (done == 0)
1620 1.1 lukem break;
1621 1.1 lukem }
1622 1.1 lukem for (eq = 0; j < i && eq < b->equiv_used; eq++)
1623 1.1 lukem b->states.states[b->equiv[eq].r].id =
1624 1.1 lukem b->states.states[b->equiv[eq].l].id;
1625 1.1 lukem }
1626 1.1 lukem
1627 1.1 lukem /*
1628 1.1 lukem * Renumber the states appropriately.
1629 1.1 lukem */
1630 1.1 lukem for (i = eq = 0, sp1 = b->states.states; i < b->states.states_used;
1631 1.1 lukem sp1++, i++)
1632 1.1 lukem sp1->id = (sp1->id == i) ? eq++ : b->states.states[sp1->id].id;
1633 1.1 lukem }
1634 1.1 lukem
1635 1.1 lukem /*************************************************************************
1636 1.1 lukem *
1637 1.1 lukem * API.
1638 1.1 lukem *
1639 1.1 lukem *************************************************************************/
1640 1.1 lukem
1641 1.1 lukem ure_buffer_t
1642 1.1 lukem ure_buffer_create(void)
1643 1.1 lukem {
1644 1.1 lukem ure_buffer_t b;
1645 1.1 lukem
1646 1.1 lukem b = (ure_buffer_t) calloc(1, sizeof(_ure_buffer_t));
1647 1.1 lukem
1648 1.1 lukem return b;
1649 1.1 lukem }
1650 1.1 lukem
1651 1.1 lukem void
1652 1.1 lukem ure_buffer_free(ure_buffer_t buf)
1653 1.1 lukem {
1654 1.1 lukem unsigned long i;
1655 1.1 lukem
1656 1.1 lukem if (buf == 0)
1657 1.1 lukem return;
1658 1.1 lukem
1659 1.1 lukem if (buf->stack.slist_size > 0)
1660 1.1 lukem free((char *) buf->stack.slist);
1661 1.1 lukem
1662 1.1 lukem if (buf->expr_size > 0)
1663 1.1 lukem free((char *) buf->expr);
1664 1.1 lukem
1665 1.1 lukem for (i = 0; i < buf->symtab_size; i++) {
1666 1.1 lukem if (buf->symtab[i].states.slist_size > 0)
1667 1.1 lukem free((char *) buf->symtab[i].states.slist);
1668 1.1 lukem }
1669 1.1 lukem
1670 1.1 lukem if (buf->symtab_size > 0)
1671 1.1 lukem free((char *) buf->symtab);
1672 1.1 lukem
1673 1.1 lukem for (i = 0; i < buf->states.states_size; i++) {
1674 1.1 lukem if (buf->states.states[i].trans_size > 0)
1675 1.1 lukem free((char *) buf->states.states[i].trans);
1676 1.1 lukem if (buf->states.states[i].st.slist_size > 0)
1677 1.1 lukem free((char *) buf->states.states[i].st.slist);
1678 1.1 lukem }
1679 1.1 lukem
1680 1.1 lukem if (buf->states.states_size > 0)
1681 1.1 lukem free((char *) buf->states.states);
1682 1.1 lukem
1683 1.1 lukem if (buf->equiv_size > 0)
1684 1.1 lukem free((char *) buf->equiv);
1685 1.1 lukem
1686 1.1 lukem free((char *) buf);
1687 1.1 lukem }
1688 1.1 lukem
1689 1.1 lukem ure_dfa_t
1690 1.1 lukem ure_compile(ucs2_t *re, unsigned long relen, int casefold, ure_buffer_t buf)
1691 1.1 lukem {
1692 1.1 lukem ucs2_t i, j, state;
1693 1.1 lukem _ure_state_t *sp;
1694 1.1 lukem _ure_dstate_t *dsp;
1695 1.1 lukem _ure_trans_t *tp;
1696 1.1 lukem ure_dfa_t dfa;
1697 1.1 lukem
1698 1.1 lukem if (re == 0 || *re == 0 || relen == 0 || buf == 0)
1699 1.1 lukem return 0;
1700 1.1 lukem
1701 1.1 lukem /*
1702 1.1 lukem * Reset the various fields of the compilation buffer. Default the flags
1703 1.1 lukem * to indicate the presense of the "^$" pattern. If any other pattern
1704 1.1 lukem * occurs, then this flag will be removed. This is done to catch this
1705 1.1 lukem * special pattern and handle it specially when matching.
1706 1.1 lukem */
1707 1.1 lukem buf->flags = _URE_DFA_BLANKLINE | ((casefold) ? _URE_DFA_CASEFOLD : 0);
1708 1.1 lukem buf->reducing = 0;
1709 1.1 lukem buf->stack.slist_used = 0;
1710 1.1 lukem buf->expr_used = 0;
1711 1.1 lukem
1712 1.1 lukem for (i = 0; i < buf->symtab_used; i++)
1713 1.1 lukem buf->symtab[i].states.slist_used = 0;
1714 1.1 lukem buf->symtab_used = 0;
1715 1.1 lukem
1716 1.1 lukem for (i = 0; i < buf->states.states_used; i++) {
1717 1.1 lukem buf->states.states[i].st.slist_used = 0;
1718 1.1 lukem buf->states.states[i].trans_used = 0;
1719 1.1 lukem }
1720 1.1 lukem buf->states.states_used = 0;
1721 1.1 lukem
1722 1.1 lukem /*
1723 1.1 lukem * Construct the NFA. If this stage returns a 0, then an error occured or
1724 1.1 lukem * an empty expression was passed.
1725 1.1 lukem */
1726 1.1 lukem if ((state = _ure_re2nfa(re, relen, buf)) == _URE_NOOP)
1727 1.1 lukem return 0;
1728 1.1 lukem
1729 1.1 lukem /*
1730 1.1 lukem * Do the expression reduction to get the initial DFA.
1731 1.1 lukem */
1732 1.1 lukem _ure_reduce(state, buf);
1733 1.1 lukem
1734 1.1 lukem /*
1735 1.1 lukem * Merge all the equivalent DFA states.
1736 1.1 lukem */
1737 1.1 lukem _ure_merge_equiv(buf);
1738 1.1 lukem
1739 1.1 lukem /*
1740 1.1 lukem * Construct the minimal DFA.
1741 1.1 lukem */
1742 1.1 lukem dfa = (ure_dfa_t) malloc(sizeof(_ure_dfa_t));
1743 1.1 lukem (void) memset((char *) dfa, '\0', sizeof(_ure_dfa_t));
1744 1.1 lukem
1745 1.1 lukem dfa->flags = buf->flags & (_URE_DFA_CASEFOLD|_URE_DFA_BLANKLINE);
1746 1.1 lukem
1747 1.1 lukem /*
1748 1.1 lukem * Free up the NFA state groups and transfer the symbols from the buffer
1749 1.1 lukem * to the DFA.
1750 1.1 lukem */
1751 1.1 lukem for (i = 0; i < buf->symtab_size; i++) {
1752 1.1 lukem if (buf->symtab[i].states.slist_size > 0)
1753 1.1 lukem free((char *) buf->symtab[i].states.slist);
1754 1.1 lukem }
1755 1.1 lukem dfa->syms = buf->symtab;
1756 1.1 lukem dfa->nsyms = buf->symtab_used;
1757 1.1 lukem
1758 1.1 lukem buf->symtab_used = buf->symtab_size = 0;
1759 1.1 lukem
1760 1.1 lukem /*
1761 1.1 lukem * Collect the total number of states and transitions needed for the DFA.
1762 1.1 lukem */
1763 1.1 lukem for (i = state = 0, sp = buf->states.states; i < buf->states.states_used;
1764 1.1 lukem i++, sp++) {
1765 1.1 lukem if (sp->id == state) {
1766 1.1 lukem dfa->nstates++;
1767 1.1 lukem dfa->ntrans += sp->trans_used;
1768 1.1 lukem state++;
1769 1.1 lukem }
1770 1.1 lukem }
1771 1.1 lukem
1772 1.1 lukem /*
1773 1.1 lukem * Allocate enough space for the states and transitions.
1774 1.1 lukem */
1775 1.1 lukem dfa->states = (_ure_dstate_t *) malloc(sizeof(_ure_dstate_t) *
1776 1.1 lukem dfa->nstates);
1777 1.1 lukem dfa->trans = (_ure_trans_t *) malloc(sizeof(_ure_trans_t) * dfa->ntrans);
1778 1.1 lukem
1779 1.1 lukem /*
1780 1.1 lukem * Actually transfer the DFA states from the buffer.
1781 1.1 lukem */
1782 1.1 lukem dsp = dfa->states;
1783 1.1 lukem tp = dfa->trans;
1784 1.1 lukem for (i = state = 0, sp = buf->states.states; i < buf->states.states_used;
1785 1.1 lukem i++, sp++) {
1786 1.1 lukem if (sp->id == state) {
1787 1.1 lukem dsp->trans = tp;
1788 1.1 lukem dsp->ntrans = sp->trans_used;
1789 1.1 lukem dsp->accepting = sp->accepting;
1790 1.1 lukem
1791 1.1 lukem /*
1792 1.1 lukem * Add the transitions for the state.
1793 1.1 lukem */
1794 1.1 lukem for (j = 0; j < dsp->ntrans; j++, tp++) {
1795 1.1 lukem tp->symbol = sp->trans[j].lhs;
1796 1.1 lukem tp->next_state = buf->states.states[sp->trans[j].rhs].id;
1797 1.1 lukem }
1798 1.1 lukem
1799 1.1 lukem dsp++;
1800 1.1 lukem state++;
1801 1.1 lukem }
1802 1.1 lukem }
1803 1.1 lukem
1804 1.1 lukem return dfa;
1805 1.1 lukem }
1806 1.1 lukem
1807 1.1 lukem void
1808 1.1 lukem ure_dfa_free(ure_dfa_t dfa)
1809 1.1 lukem {
1810 1.1 lukem ucs2_t i;
1811 1.1 lukem
1812 1.1 lukem if (dfa == 0)
1813 1.1 lukem return;
1814 1.1 lukem
1815 1.1 lukem for (i = 0; i < dfa->nsyms; i++) {
1816 1.1 lukem if ((dfa->syms[i].type == _URE_CCLASS ||
1817 1.1 lukem dfa->syms[i].type == _URE_NCCLASS) &&
1818 1.1 lukem dfa->syms[i].sym.ccl.ranges_size > 0)
1819 1.1 lukem free((char *) dfa->syms[i].sym.ccl.ranges);
1820 1.1 lukem }
1821 1.1 lukem if (dfa->nsyms > 0)
1822 1.1 lukem free((char *) dfa->syms);
1823 1.1 lukem
1824 1.1 lukem if (dfa->nstates > 0)
1825 1.1 lukem free((char *) dfa->states);
1826 1.1 lukem if (dfa->ntrans > 0)
1827 1.1 lukem free((char *) dfa->trans);
1828 1.1 lukem free((char *) dfa);
1829 1.1 lukem }
1830 1.1 lukem
1831 1.1 lukem void
1832 1.1 lukem ure_write_dfa(ure_dfa_t dfa, FILE *out)
1833 1.1 lukem {
1834 1.1 lukem ucs2_t i, j, k, h, l;
1835 1.1 lukem _ure_dstate_t *sp;
1836 1.1 lukem _ure_symtab_t *sym;
1837 1.1 lukem _ure_range_t *rp;
1838 1.1 lukem
1839 1.1 lukem if (dfa == 0 || out == 0)
1840 1.1 lukem return;
1841 1.1 lukem
1842 1.1 lukem /*
1843 1.1 lukem * Write all the different character classes.
1844 1.1 lukem */
1845 1.1 lukem for (i = 0, sym = dfa->syms; i < dfa->nsyms; i++, sym++) {
1846 1.1 lukem if (sym->type == _URE_CCLASS || sym->type == _URE_NCCLASS) {
1847 1.1 lukem fprintf(out, "C%hd = ", sym->id);
1848 1.1 lukem if (sym->sym.ccl.ranges_used > 0) {
1849 1.1 lukem putc('[', out);
1850 1.1 lukem if (sym->type == _URE_NCCLASS)
1851 1.1 lukem putc('^', out);
1852 1.1 lukem }
1853 1.1 lukem if (sym->props != 0) {
1854 1.1 lukem if (sym->type == _URE_NCCLASS)
1855 1.1 lukem fprintf(out, "\\P");
1856 1.1 lukem else
1857 1.1 lukem fprintf(out, "\\p");
1858 1.1 lukem for (k = h = 0; k < 32; k++) {
1859 1.1 lukem if (sym->props & (1 << k)) {
1860 1.1 lukem if (h != 0)
1861 1.1 lukem putc(',', out);
1862 1.1 lukem fprintf(out, "%hd", k + 1);
1863 1.1 lukem h = 1;
1864 1.1 lukem }
1865 1.1 lukem }
1866 1.1 lukem }
1867 1.1 lukem /*
1868 1.1 lukem * Dump the ranges.
1869 1.1 lukem */
1870 1.1 lukem for (k = 0, rp = sym->sym.ccl.ranges;
1871 1.1 lukem k < sym->sym.ccl.ranges_used; k++, rp++) {
1872 1.1 lukem /*
1873 1.1 lukem * Check for UTF16 characters.
1874 1.1 lukem */
1875 1.1 lukem if (0x10000 <= rp->min_code &&
1876 1.1 lukem rp->min_code <= 0x10ffff) {
1877 1.1 lukem h = (ucs2_t) (((rp->min_code - 0x10000) >> 10) + 0xd800);
1878 1.1 lukem l = (ucs2_t) (((rp->min_code - 0x10000) & 1023) + 0xdc00);
1879 1.1 lukem fprintf(out, "\\x%04hX\\x%04hX", h, l);
1880 1.1 lukem } else
1881 1.1 lukem fprintf(out, "\\x%04lX", rp->min_code & 0xffff);
1882 1.1 lukem if (rp->max_code != rp->min_code) {
1883 1.1 lukem putc('-', out);
1884 1.1 lukem if (rp->max_code >= 0x10000 &&
1885 1.1 lukem rp->max_code <= 0x10ffff) {
1886 1.1 lukem h = (ucs2_t) (((rp->max_code - 0x10000) >> 10) + 0xd800);
1887 1.1 lukem l = (ucs2_t) (((rp->max_code - 0x10000) & 1023) + 0xdc00);
1888 1.1 lukem fprintf(out, "\\x%04hX\\x%04hX", h, l);
1889 1.1 lukem } else
1890 1.1 lukem fprintf(out, "\\x%04lX", rp->max_code & 0xffff);
1891 1.1 lukem }
1892 1.1 lukem }
1893 1.1 lukem if (sym->sym.ccl.ranges_used > 0)
1894 1.1 lukem putc(']', out);
1895 1.1 lukem putc('\n', out);
1896 1.1 lukem }
1897 1.1 lukem }
1898 1.1 lukem
1899 1.1 lukem for (i = 0, sp = dfa->states; i < dfa->nstates; i++, sp++) {
1900 1.1 lukem fprintf(out, "S%hd = ", i);
1901 1.1 lukem if (sp->accepting) {
1902 1.1 lukem fprintf(out, "1 ");
1903 1.1 lukem if (sp->ntrans)
1904 1.1 lukem fprintf(out, "| ");
1905 1.1 lukem }
1906 1.1 lukem for (j = 0; j < sp->ntrans; j++) {
1907 1.1 lukem if (j > 0)
1908 1.1 lukem fprintf(out, "| ");
1909 1.1 lukem
1910 1.1 lukem sym = dfa->syms + sp->trans[j].symbol;
1911 1.1 lukem switch (sym->type) {
1912 1.1 lukem case _URE_CHAR:
1913 1.1 lukem if (0x10000 <= sym->sym.chr && sym->sym.chr <= 0x10ffff) {
1914 1.1 lukem /*
1915 1.1 lukem * Take care of UTF16 characters.
1916 1.1 lukem */
1917 1.1 lukem h = (ucs2_t) (((sym->sym.chr - 0x10000) >> 10) + 0xd800);
1918 1.1 lukem l = (ucs2_t) (((sym->sym.chr - 0x10000) & 1023) + 0xdc00);
1919 1.1 lukem fprintf(out, "\\x%04hX\\x%04hX ", h, l);
1920 1.1 lukem } else
1921 1.1 lukem fprintf(out, "\\x%04lX ", sym->sym.chr & 0xffff);
1922 1.1 lukem break;
1923 1.1 lukem case _URE_ANY_CHAR:
1924 1.1 lukem fprintf(out, "<any> ");
1925 1.1 lukem break;
1926 1.1 lukem case _URE_BOL_ANCHOR:
1927 1.1 lukem fprintf(out, "<bol-anchor> ");
1928 1.1 lukem break;
1929 1.1 lukem case _URE_EOL_ANCHOR:
1930 1.1 lukem fprintf(out, "<eol-anchor> ");
1931 1.1 lukem break;
1932 1.1 lukem case _URE_CCLASS:
1933 1.1 lukem case _URE_NCCLASS:
1934 1.1 lukem fprintf(out, "[C%hd] ", sym->id);
1935 1.1 lukem break;
1936 1.1 lukem }
1937 1.1 lukem fprintf(out, "S%hd", sp->trans[j].next_state);
1938 1.1 lukem if (j + 1 < sp->ntrans)
1939 1.1 lukem putc(' ', out);
1940 1.1 lukem }
1941 1.1 lukem putc('\n', out);
1942 1.1 lukem }
1943 1.1 lukem }
1944 1.1 lukem
1945 1.1 lukem #define _ure_issep(cc) ((cc) == '\n' || (cc) == '\r' || (cc) == 0x2028 ||\
1946 1.1 lukem (cc) == 0x2029)
1947 1.1 lukem
1948 1.1 lukem int
1949 1.1 lukem ure_exec(ure_dfa_t dfa, int flags, ucs2_t *text, unsigned long textlen,
1950 1.1 lukem unsigned long *match_start, unsigned long *match_end)
1951 1.1 lukem {
1952 1.1 lukem int i, j, matched, found, skip;
1953 1.1 lukem unsigned long ms, me;
1954 1.1 lukem ucs4_t c;
1955 1.1 lukem ucs2_t *sp, *ep, *lp;
1956 1.1 lukem _ure_dstate_t *stp;
1957 1.1 lukem _ure_symtab_t *sym;
1958 1.1 lukem _ure_range_t *rp;
1959 1.1 lukem
1960 1.1 lukem if (dfa == 0 || text == 0)
1961 1.1 lukem return 0;
1962 1.1 lukem
1963 1.1 lukem /*
1964 1.1 lukem * Handle the special case of an empty string matching the "^$" pattern.
1965 1.1 lukem */
1966 1.1 lukem if (textlen == 0 && (dfa->flags & _URE_DFA_BLANKLINE)) {
1967 1.1 lukem *match_start = *match_end = 0;
1968 1.1 lukem return 1;
1969 1.1 lukem }
1970 1.1 lukem
1971 1.1 lukem sp = text;
1972 1.1 lukem ep = sp + textlen;
1973 1.1 lukem
1974 1.1 lukem ms = me = ~0;
1975 1.1 lukem
1976 1.1 lukem stp = dfa->states;
1977 1.1 lukem
1978 1.1 lukem for (found = skip = 0; found == 0 && sp < ep; ) {
1979 1.1 lukem lp = sp;
1980 1.1 lukem c = *sp++;
1981 1.1 lukem
1982 1.1 lukem /*
1983 1.1 lukem * Check to see if this is a high surrogate that should be
1984 1.1 lukem * combined with a following low surrogate.
1985 1.1 lukem */
1986 1.1 lukem if (sp < ep && 0xd800 <= c && c <= 0xdbff &&
1987 1.1 lukem 0xdc00 <= *sp && *sp <= 0xdfff)
1988 1.1 lukem c = 0x10000 + (((c & 0x03ff) << 10) | (*sp++ & 0x03ff));
1989 1.1 lukem
1990 1.1 lukem /*
1991 1.1 lukem * Determine if the character is non-spacing and should be skipped.
1992 1.1 lukem */
1993 1.1 lukem if (_ure_matches_properties(_URE_NONSPACING, c) &&
1994 1.1 lukem (flags & URE_IGNORE_NONSPACING)) {
1995 1.1 lukem sp++;
1996 1.1 lukem continue;
1997 1.1 lukem }
1998 1.1 lukem
1999 1.1 lukem if (dfa->flags & _URE_DFA_CASEFOLD)
2000 1.1 lukem c = _ure_tolower(c);
2001 1.1 lukem
2002 1.1 lukem /*
2003 1.1 lukem * See if one of the transitions matches.
2004 1.1 lukem */
2005 1.1 lukem for (i = 0, matched = 0; matched == 0 && i < stp->ntrans; i++) {
2006 1.1 lukem sym = dfa->syms + stp->trans[i].symbol;
2007 1.1 lukem switch (sym->type) {
2008 1.1 lukem case _URE_ANY_CHAR:
2009 1.1 lukem if ((flags & URE_DOT_MATCHES_SEPARATORS) ||
2010 1.1 lukem !_ure_issep(c))
2011 1.1 lukem matched = 1;
2012 1.1 lukem break;
2013 1.1 lukem case _URE_CHAR:
2014 1.1 lukem if (c == sym->sym.chr)
2015 1.1 lukem matched = 1;
2016 1.1 lukem break;
2017 1.1 lukem case _URE_BOL_ANCHOR:
2018 1.1 lukem if (lp == text) {
2019 1.1 lukem sp = lp;
2020 1.1 lukem matched = 1;
2021 1.1 lukem } else if (_ure_issep(c)) {
2022 1.1 lukem if (c == '\r' && sp < ep && *sp == '\n')
2023 1.1 lukem sp++;
2024 1.1 lukem lp = sp;
2025 1.1 lukem matched = 1;
2026 1.1 lukem }
2027 1.1 lukem break;
2028 1.1 lukem case _URE_EOL_ANCHOR:
2029 1.1 lukem if (_ure_issep(c)) {
2030 1.1 lukem /*
2031 1.1 lukem * Put the pointer back before the separator so the match
2032 1.1 lukem * end position will be correct. This case will also
2033 1.1 lukem * cause the `sp' pointer to be advanced over the current
2034 1.1 lukem * separator once the match end point has been recorded.
2035 1.1 lukem */
2036 1.1 lukem sp = lp;
2037 1.1 lukem matched = 1;
2038 1.1 lukem }
2039 1.1 lukem break;
2040 1.1 lukem case _URE_CCLASS:
2041 1.1 lukem case _URE_NCCLASS:
2042 1.1 lukem if (sym->props != 0)
2043 1.1 lukem matched = _ure_matches_properties(sym->props, c);
2044 1.1 lukem for (j = 0, rp = sym->sym.ccl.ranges;
2045 1.1 lukem j < sym->sym.ccl.ranges_used; j++, rp++) {
2046 1.1 lukem if (rp->min_code <= c && c <= rp->max_code)
2047 1.1 lukem matched = 1;
2048 1.1 lukem }
2049 1.1 lukem if (sym->type == _URE_NCCLASS)
2050 1.1 lukem matched = !matched;
2051 1.1 lukem break;
2052 1.1 lukem }
2053 1.1 lukem
2054 1.1 lukem if (matched) {
2055 1.1 lukem if (ms == ~0UL)
2056 1.1 lukem ms = lp - text;
2057 1.1 lukem else
2058 1.1 lukem me = sp - text;
2059 1.1 lukem stp = dfa->states + stp->trans[i].next_state;
2060 1.1 lukem
2061 1.1 lukem /*
2062 1.1 lukem * If the match was an EOL anchor, adjust the pointer past the
2063 1.1 lukem * separator that caused the match. The correct match
2064 1.1 lukem * position has been recorded already.
2065 1.1 lukem */
2066 1.1 lukem if (sym->type == _URE_EOL_ANCHOR) {
2067 1.1 lukem /*
2068 1.1 lukem * Skip the character that caused the match.
2069 1.1 lukem */
2070 1.1 lukem sp++;
2071 1.1 lukem
2072 1.1 lukem /*
2073 1.1 lukem * Handle the infamous CRLF situation.
2074 1.1 lukem */
2075 1.1 lukem if (sp < ep && c == '\r' && *sp == '\n')
2076 1.1 lukem sp++;
2077 1.1 lukem }
2078 1.1 lukem }
2079 1.1 lukem }
2080 1.1 lukem
2081 1.1 lukem if (matched == 0) {
2082 1.1 lukem if (stp->accepting == 0) {
2083 1.1 lukem /*
2084 1.1 lukem * If the last state was not accepting, then reset
2085 1.1 lukem * and start over.
2086 1.1 lukem */
2087 1.1 lukem stp = dfa->states;
2088 1.1 lukem ms = me = ~0;
2089 1.1 lukem } else
2090 1.1 lukem /*
2091 1.1 lukem * The last state was accepting, so terminate the matching
2092 1.1 lukem * loop to avoid more work.
2093 1.1 lukem */
2094 1.1 lukem found = 1;
2095 1.1 lukem } else if (sp == ep) {
2096 1.1 lukem if (!stp->accepting) {
2097 1.1 lukem /*
2098 1.1 lukem * This ugly hack is to make sure the end-of-line anchors
2099 1.1 lukem * match when the source text hits the end. This is only done
2100 1.1 lukem * if the last subexpression matches.
2101 1.1 lukem */
2102 1.1 lukem for (i = 0; found == 0 && i < stp->ntrans; i++) {
2103 1.1 lukem sym = dfa->syms + stp->trans[i].symbol;
2104 1.1 lukem if (sym->type ==_URE_EOL_ANCHOR) {
2105 1.1 lukem stp = dfa->states + stp->trans[i].next_state;
2106 1.1 lukem if (stp->accepting) {
2107 1.1 lukem me = sp - text;
2108 1.1 lukem found = 1;
2109 1.1 lukem } else
2110 1.1 lukem break;
2111 1.1 lukem }
2112 1.1 lukem }
2113 1.1 lukem } else {
2114 1.1 lukem /*
2115 1.1 lukem * Make sure any conditions that match all the way to the end
2116 1.1 lukem * of the string match.
2117 1.1 lukem */
2118 1.1 lukem found = 1;
2119 1.1 lukem me = sp - text;
2120 1.1 lukem }
2121 1.1 lukem }
2122 1.1 lukem }
2123 1.1 lukem
2124 1.1 lukem if (found == 0)
2125 1.1 lukem ms = me = ~0;
2126 1.1 lukem
2127 1.1 lukem *match_start = ms;
2128 1.1 lukem *match_end = me;
2129 1.1 lukem
2130 1.1 lukem return (ms != ~0UL) ? 1 : 0;
2131 1.1 lukem }
2132