tre-match-approx.c revision 1.5 1 1.1 agc /*
2 1.1 agc tre-match-approx.c - TRE approximate regex matching engine
3 1.1 agc
4 1.1 agc This software is released under a BSD-style license.
5 1.1 agc See the file LICENSE for details and copyright.
6 1.1 agc
7 1.1 agc */
8 1.1 agc #ifdef HAVE_CONFIG_H
9 1.1 agc #include <config.h>
10 1.1 agc #endif /* HAVE_CONFIG_H */
11 1.1 agc
12 1.2 christos #include "tre-alloca.h"
13 1.1 agc
14 1.1 agc #define __USE_STRING_INLINES
15 1.1 agc #undef __NO_INLINE__
16 1.1 agc
17 1.1 agc #include <assert.h>
18 1.1 agc #include <stdlib.h>
19 1.1 agc #include <string.h>
20 1.1 agc #include <limits.h>
21 1.1 agc #ifdef HAVE_WCHAR_H
22 1.1 agc #include <wchar.h>
23 1.1 agc #endif /* HAVE_WCHAR_H */
24 1.1 agc #ifdef HAVE_WCTYPE_H
25 1.1 agc #include <wctype.h>
26 1.1 agc #endif /* HAVE_WCTYPE_H */
27 1.1 agc #ifndef TRE_WCHAR
28 1.1 agc #include <ctype.h>
29 1.1 agc #endif /* !TRE_WCHAR */
30 1.1 agc #ifdef HAVE_MALLOC_H
31 1.1 agc #include <malloc.h>
32 1.1 agc #endif /* HAVE_MALLOC_H */
33 1.1 agc
34 1.1 agc #include "tre-internal.h"
35 1.1 agc #include "tre-match-utils.h"
36 1.1 agc #include "tre.h"
37 1.1 agc #include "xmalloc.h"
38 1.1 agc
39 1.1 agc #define TRE_M_COST 0
40 1.1 agc #define TRE_M_NUM_INS 1
41 1.1 agc #define TRE_M_NUM_DEL 2
42 1.1 agc #define TRE_M_NUM_SUBST 3
43 1.1 agc #define TRE_M_NUM_ERR 4
44 1.1 agc #define TRE_M_LAST 5
45 1.1 agc
46 1.1 agc #define TRE_M_MAX_DEPTH 3
47 1.1 agc
48 1.1 agc typedef struct {
49 1.1 agc /* State in the TNFA transition table. */
50 1.1 agc tre_tnfa_transition_t *state;
51 1.1 agc /* Position in input string. */
52 1.1 agc int pos;
53 1.1 agc /* Tag values. */
54 1.1 agc int *tags;
55 1.1 agc /* Matching parameters. */
56 1.1 agc regaparams_t params;
57 1.1 agc /* Nesting depth of parameters. This is used as an index in
58 1.1 agc the `costs' array. */
59 1.1 agc int depth;
60 1.1 agc /* Costs and counter values for different parameter nesting depths. */
61 1.1 agc int costs[TRE_M_MAX_DEPTH + 1][TRE_M_LAST];
62 1.1 agc } tre_tnfa_approx_reach_t;
63 1.1 agc
64 1.1 agc
65 1.1 agc #ifdef TRE_DEBUG
66 1.1 agc /* Prints the `reach' array in a readable fashion with DPRINT. */
67 1.1 agc static void
68 1.1 agc tre_print_reach(const tre_tnfa_t *tnfa, tre_tnfa_approx_reach_t *reach,
69 1.2 christos int pos, size_t num_tags)
70 1.1 agc {
71 1.1 agc int id;
72 1.1 agc
73 1.1 agc /* Print each state on one line. */
74 1.1 agc DPRINT((" reach:\n"));
75 1.1 agc for (id = 0; id < tnfa->num_states; id++)
76 1.1 agc {
77 1.1 agc int i, j;
78 1.1 agc if (reach[id].pos < pos)
79 1.1 agc continue; /* Not reached. */
80 1.1 agc DPRINT((" %03d, costs ", id));
81 1.1 agc for (i = 0; i <= reach[id].depth; i++)
82 1.1 agc {
83 1.1 agc DPRINT(("["));
84 1.1 agc for (j = 0; j < TRE_M_LAST; j++)
85 1.1 agc {
86 1.1 agc DPRINT(("%2d", reach[id].costs[i][j]));
87 1.1 agc if (j + 1 < TRE_M_LAST)
88 1.1 agc DPRINT((","));
89 1.1 agc }
90 1.1 agc DPRINT(("]"));
91 1.1 agc if (i + 1 <= reach[id].depth)
92 1.1 agc DPRINT((", "));
93 1.1 agc }
94 1.1 agc DPRINT(("\n tags "));
95 1.1 agc for (i = 0; i < num_tags; i++)
96 1.1 agc {
97 1.1 agc DPRINT(("%02d", reach[id].tags[i]));
98 1.1 agc if (i + 1 < num_tags)
99 1.1 agc DPRINT((","));
100 1.1 agc }
101 1.1 agc DPRINT(("\n"));
102 1.1 agc }
103 1.1 agc DPRINT(("\n"));
104 1.1 agc }
105 1.1 agc #endif /* TRE_DEBUG */
106 1.1 agc
107 1.1 agc
108 1.1 agc /* Sets the matching parameters in `reach' to the ones defined in the `pa'
109 1.1 agc array. If `pa' specifies default values, they are taken from
110 1.1 agc `default_params'. */
111 1.1 agc inline static void
112 1.1 agc tre_set_params(tre_tnfa_approx_reach_t *reach,
113 1.1 agc int *pa, regaparams_t default_params)
114 1.1 agc {
115 1.1 agc int value;
116 1.1 agc
117 1.1 agc /* If depth is increased reset costs and counters to zero for the
118 1.1 agc new levels. */
119 1.1 agc value = pa[TRE_PARAM_DEPTH];
120 1.1 agc assert(value <= TRE_M_MAX_DEPTH);
121 1.1 agc if (value > reach->depth)
122 1.1 agc {
123 1.1 agc int i, j;
124 1.1 agc for (i = reach->depth + 1; i <= value; i++)
125 1.1 agc for (j = 0; j < TRE_M_LAST; j++)
126 1.1 agc reach->costs[i][j] = 0;
127 1.1 agc }
128 1.1 agc reach->depth = value;
129 1.1 agc
130 1.1 agc /* Set insert cost. */
131 1.1 agc value = pa[TRE_PARAM_COST_INS];
132 1.1 agc if (value == TRE_PARAM_DEFAULT)
133 1.1 agc reach->params.cost_ins = default_params.cost_ins;
134 1.1 agc else if (value != TRE_PARAM_UNSET)
135 1.1 agc reach->params.cost_ins = value;
136 1.1 agc
137 1.1 agc /* Set delete cost. */
138 1.1 agc value = pa[TRE_PARAM_COST_DEL];
139 1.1 agc if (value == TRE_PARAM_DEFAULT)
140 1.1 agc reach->params.cost_del = default_params.cost_del;
141 1.1 agc else if (value != TRE_PARAM_UNSET)
142 1.1 agc reach->params.cost_del = value;
143 1.1 agc
144 1.1 agc /* Set substitute cost. */
145 1.1 agc value = pa[TRE_PARAM_COST_SUBST];
146 1.1 agc if (value == TRE_PARAM_DEFAULT)
147 1.1 agc reach->params.cost_subst = default_params.cost_subst;
148 1.1 agc else
149 1.1 agc reach->params.cost_subst = value;
150 1.1 agc
151 1.1 agc /* Set maximum cost. */
152 1.1 agc value = pa[TRE_PARAM_COST_MAX];
153 1.1 agc if (value == TRE_PARAM_DEFAULT)
154 1.1 agc reach->params.max_cost = default_params.max_cost;
155 1.1 agc else if (value != TRE_PARAM_UNSET)
156 1.1 agc reach->params.max_cost = value;
157 1.1 agc
158 1.1 agc /* Set maximum inserts. */
159 1.1 agc value = pa[TRE_PARAM_MAX_INS];
160 1.1 agc if (value == TRE_PARAM_DEFAULT)
161 1.1 agc reach->params.max_ins = default_params.max_ins;
162 1.1 agc else if (value != TRE_PARAM_UNSET)
163 1.1 agc reach->params.max_ins = value;
164 1.1 agc
165 1.1 agc /* Set maximum deletes. */
166 1.1 agc value = pa[TRE_PARAM_MAX_DEL];
167 1.1 agc if (value == TRE_PARAM_DEFAULT)
168 1.1 agc reach->params.max_del = default_params.max_del;
169 1.1 agc else if (value != TRE_PARAM_UNSET)
170 1.1 agc reach->params.max_del = value;
171 1.1 agc
172 1.1 agc /* Set maximum substitutes. */
173 1.1 agc value = pa[TRE_PARAM_MAX_SUBST];
174 1.1 agc if (value == TRE_PARAM_DEFAULT)
175 1.1 agc reach->params.max_subst = default_params.max_subst;
176 1.1 agc else if (value != TRE_PARAM_UNSET)
177 1.1 agc reach->params.max_subst = value;
178 1.1 agc
179 1.1 agc /* Set maximum number of errors. */
180 1.1 agc value = pa[TRE_PARAM_MAX_ERR];
181 1.1 agc if (value == TRE_PARAM_DEFAULT)
182 1.1 agc reach->params.max_err = default_params.max_err;
183 1.1 agc else if (value != TRE_PARAM_UNSET)
184 1.1 agc reach->params.max_err = value;
185 1.1 agc }
186 1.1 agc
187 1.1 agc reg_errcode_t
188 1.1 agc tre_tnfa_run_approx(const tre_tnfa_t *tnfa, const void *string, int len,
189 1.1 agc tre_str_type_t type, int *match_tags,
190 1.1 agc regamatch_t *match, regaparams_t default_params,
191 1.1 agc int eflags, int *match_end_ofs)
192 1.1 agc {
193 1.1 agc /* State variables required by GET_NEXT_WCHAR. */
194 1.1 agc tre_char_t prev_c = 0, next_c = 0;
195 1.1 agc const char *str_byte = string;
196 1.1 agc int pos = -1;
197 1.1 agc unsigned int pos_add_next = 1;
198 1.1 agc #ifdef TRE_WCHAR
199 1.1 agc const wchar_t *str_wide = string;
200 1.1 agc #ifdef TRE_MBSTATE
201 1.1 agc mbstate_t mbstate;
202 1.1 agc #endif /* !TRE_WCHAR */
203 1.1 agc #endif /* TRE_WCHAR */
204 1.1 agc int reg_notbol = eflags & REG_NOTBOL;
205 1.1 agc int reg_noteol = eflags & REG_NOTEOL;
206 1.1 agc int reg_newline = tnfa->cflags & REG_NEWLINE;
207 1.2 christos size_t str_user_end = 0;
208 1.1 agc
209 1.1 agc int prev_pos;
210 1.1 agc
211 1.1 agc /* Number of tags. */
212 1.2 christos size_t num_tags;
213 1.1 agc /* The reach tables. */
214 1.1 agc tre_tnfa_approx_reach_t *reach, *reach_next;
215 1.1 agc /* Tag array for temporary use. */
216 1.1 agc int *tmp_tags;
217 1.1 agc
218 1.1 agc /* End offset of best match so far, or -1 if no match found yet. */
219 1.1 agc int match_eo = -1;
220 1.1 agc /* Costs of the match. */
221 1.1 agc int match_costs[TRE_M_LAST];
222 1.1 agc
223 1.1 agc /* Space for temporary data required for matching. */
224 1.1 agc unsigned char *buf;
225 1.1 agc
226 1.2 christos size_t i, id;
227 1.1 agc
228 1.4 rin reg_errcode_t ret;
229 1.4 rin
230 1.1 agc if (!match_tags)
231 1.1 agc num_tags = 0;
232 1.1 agc else
233 1.1 agc num_tags = tnfa->num_tags;
234 1.1 agc
235 1.1 agc #ifdef TRE_MBSTATE
236 1.1 agc memset(&mbstate, '\0', sizeof(mbstate));
237 1.1 agc #endif /* TRE_MBSTATE */
238 1.1 agc
239 1.1 agc DPRINT(("tre_tnfa_run_approx, input type %d, len %d, eflags %d, "
240 1.1 agc "match_tags %p\n",
241 1.1 agc type, len, eflags,
242 1.1 agc match_tags));
243 1.1 agc DPRINT(("max cost %d, ins %d, del %d, subst %d\n",
244 1.1 agc default_params.max_cost,
245 1.1 agc default_params.cost_ins,
246 1.1 agc default_params.cost_del,
247 1.1 agc default_params.cost_subst));
248 1.1 agc
249 1.1 agc /* Allocate memory for temporary data required for matching. This needs to
250 1.1 agc be done for every matching operation to be thread safe. This allocates
251 1.1 agc everything in a single large block from the stack frame using alloca()
252 1.1 agc or with malloc() if alloca is unavailable. */
253 1.1 agc {
254 1.1 agc unsigned char *buf_cursor;
255 1.5 rin
256 1.5 rin /* Ensure that tag_bytes*num_states cannot overflow, and that it don't
257 1.5 rin * contribute more than 1/8 of SIZE_MAX to total_bytes. */
258 1.5 rin if (num_tags > SIZE_MAX/(8 * sizeof(*tmp_tags) * tnfa->num_states))
259 1.5 rin return REG_ESPACE;
260 1.5 rin
261 1.5 rin /* Likewise check reach_bytes. */
262 1.5 rin if (tnfa->num_states > SIZE_MAX/(8 * sizeof(*reach_next)))
263 1.5 rin return REG_ESPACE;
264 1.5 rin
265 1.1 agc /* Space needed for one array of tags. */
266 1.2 christos size_t tag_bytes = sizeof(*tmp_tags) * num_tags;
267 1.1 agc /* Space needed for one reach table. */
268 1.2 christos size_t reach_bytes = sizeof(*reach_next) * tnfa->num_states;
269 1.1 agc /* Total space needed. */
270 1.2 christos size_t total_bytes = reach_bytes * 2 + (tnfa->num_states * 2 + 1 ) * tag_bytes;
271 1.1 agc /* Add some extra to make sure we can align the pointers. The multiplier
272 1.1 agc used here must be equal to the number of ALIGN calls below. */
273 1.1 agc total_bytes += (sizeof(long) - 1) * 3;
274 1.1 agc
275 1.1 agc /* Allocate the memory. */
276 1.1 agc #ifdef TRE_USE_ALLOCA
277 1.1 agc buf = alloca(total_bytes);
278 1.1 agc #else /* !TRE_USE_ALLOCA */
279 1.1 agc buf = xmalloc((unsigned)total_bytes);
280 1.1 agc #endif /* !TRE_USE_ALLOCA */
281 1.1 agc if (!buf)
282 1.1 agc return REG_ESPACE;
283 1.1 agc memset(buf, 0, (size_t)total_bytes);
284 1.1 agc
285 1.1 agc /* Allocate `tmp_tags' from `buf'. */
286 1.1 agc tmp_tags = (void *)buf;
287 1.1 agc buf_cursor = buf + tag_bytes;
288 1.1 agc buf_cursor += ALIGN(buf_cursor, long);
289 1.1 agc
290 1.1 agc /* Allocate `reach' from `buf'. */
291 1.1 agc reach = (void *)buf_cursor;
292 1.1 agc buf_cursor += reach_bytes;
293 1.1 agc buf_cursor += ALIGN(buf_cursor, long);
294 1.1 agc
295 1.1 agc /* Allocate `reach_next' from `buf'. */
296 1.1 agc reach_next = (void *)buf_cursor;
297 1.1 agc buf_cursor += reach_bytes;
298 1.1 agc buf_cursor += ALIGN(buf_cursor, long);
299 1.1 agc
300 1.1 agc /* Allocate tag arrays for `reach' and `reach_next' from `buf'. */
301 1.1 agc for (i = 0; i < tnfa->num_states; i++)
302 1.1 agc {
303 1.1 agc reach[i].tags = (void *)buf_cursor;
304 1.1 agc buf_cursor += tag_bytes;
305 1.1 agc reach_next[i].tags = (void *)buf_cursor;
306 1.1 agc buf_cursor += tag_bytes;
307 1.1 agc }
308 1.1 agc assert(buf_cursor <= buf + total_bytes);
309 1.1 agc }
310 1.1 agc
311 1.1 agc for (i = 0; i < TRE_M_LAST; i++)
312 1.1 agc match_costs[i] = INT_MAX;
313 1.1 agc
314 1.1 agc /* Mark the reach arrays empty. */
315 1.1 agc for (i = 0; i < tnfa->num_states; i++)
316 1.1 agc reach[i].pos = reach_next[i].pos = -2;
317 1.1 agc
318 1.1 agc prev_pos = pos;
319 1.1 agc GET_NEXT_WCHAR();
320 1.1 agc pos = 0;
321 1.1 agc
322 1.3 rin while (/*CONSTCOND*/(void)1,1)
323 1.1 agc {
324 1.1 agc DPRINT(("%03d:%2lc/%05d\n", pos, (tre_cint_t)next_c, (int)next_c));
325 1.1 agc
326 1.1 agc /* Add initial states to `reach_next' if an exact match has not yet
327 1.1 agc been found. */
328 1.1 agc if (match_costs[TRE_M_COST] > 0)
329 1.1 agc {
330 1.1 agc tre_tnfa_transition_t *trans;
331 1.1 agc DPRINT((" init"));
332 1.1 agc for (trans = tnfa->initial; trans->state; trans++)
333 1.1 agc {
334 1.1 agc int stateid = trans->state_id;
335 1.1 agc
336 1.1 agc /* If this state is not currently in `reach_next', add it
337 1.1 agc there. */
338 1.1 agc if (reach_next[stateid].pos < pos)
339 1.1 agc {
340 1.1 agc if (trans->assertions && CHECK_ASSERTIONS(trans->assertions))
341 1.1 agc {
342 1.1 agc /* Assertions failed, don't add this state. */
343 1.1 agc DPRINT((" !%d (assert)", stateid));
344 1.1 agc continue;
345 1.1 agc }
346 1.1 agc DPRINT((" %d", stateid));
347 1.1 agc reach_next[stateid].state = trans->state;
348 1.1 agc reach_next[stateid].pos = pos;
349 1.1 agc
350 1.1 agc /* Compute tag values after this transition. */
351 1.1 agc for (i = 0; i < num_tags; i++)
352 1.1 agc reach_next[stateid].tags[i] = -1;
353 1.1 agc
354 1.1 agc if (trans->tags)
355 1.1 agc for (i = 0; trans->tags[i] >= 0; i++)
356 1.2 christos if ((size_t)trans->tags[i] < num_tags)
357 1.1 agc reach_next[stateid].tags[trans->tags[i]] = pos;
358 1.1 agc
359 1.1 agc /* Set the parameters, depth, and costs. */
360 1.1 agc reach_next[stateid].params = default_params;
361 1.1 agc reach_next[stateid].depth = 0;
362 1.1 agc for (i = 0; i < TRE_M_LAST; i++)
363 1.1 agc reach_next[stateid].costs[0][i] = 0;
364 1.1 agc if (trans->params)
365 1.1 agc tre_set_params(&reach_next[stateid], trans->params,
366 1.1 agc default_params);
367 1.1 agc
368 1.1 agc /* If this is the final state, mark the exact match. */
369 1.1 agc if (trans->state == tnfa->final)
370 1.1 agc {
371 1.1 agc match_eo = pos;
372 1.1 agc for (i = 0; i < num_tags; i++)
373 1.1 agc match_tags[i] = reach_next[stateid].tags[i];
374 1.1 agc for (i = 0; i < TRE_M_LAST; i++)
375 1.1 agc match_costs[i] = 0;
376 1.1 agc }
377 1.1 agc }
378 1.1 agc }
379 1.1 agc DPRINT(("\n"));
380 1.1 agc }
381 1.1 agc
382 1.1 agc
383 1.1 agc /* Handle inserts. This is done by pretending there's an epsilon
384 1.1 agc transition from each state in `reach' back to the same state.
385 1.1 agc We don't need to worry about the final state here; this will never
386 1.1 agc give a better match than what we already have. */
387 1.1 agc for (id = 0; id < tnfa->num_states; id++)
388 1.1 agc {
389 1.1 agc int depth;
390 1.1 agc int cost, cost0;
391 1.1 agc
392 1.1 agc if (reach[id].pos != prev_pos)
393 1.1 agc {
394 1.1 agc DPRINT((" insert: %d not reached\n", id));
395 1.1 agc continue; /* Not reached. */
396 1.1 agc }
397 1.1 agc
398 1.1 agc depth = reach[id].depth;
399 1.1 agc
400 1.1 agc /* Compute and check cost at current depth. */
401 1.1 agc cost = reach[id].costs[depth][TRE_M_COST];
402 1.1 agc if (reach[id].params.cost_ins != TRE_PARAM_UNSET)
403 1.1 agc cost += reach[id].params.cost_ins;
404 1.1 agc if (cost > reach[id].params.max_cost)
405 1.1 agc continue; /* Cost too large. */
406 1.1 agc
407 1.1 agc /* Check number of inserts at current depth. */
408 1.1 agc if (reach[id].costs[depth][TRE_M_NUM_INS] + 1
409 1.1 agc > reach[id].params.max_ins)
410 1.1 agc continue; /* Too many inserts. */
411 1.1 agc
412 1.1 agc /* Check total number of errors at current depth. */
413 1.1 agc if (reach[id].costs[depth][TRE_M_NUM_ERR] + 1
414 1.1 agc > reach[id].params.max_err)
415 1.1 agc continue; /* Too many errors. */
416 1.1 agc
417 1.1 agc /* Compute overall cost. */
418 1.1 agc cost0 = cost;
419 1.1 agc if (depth > 0)
420 1.1 agc {
421 1.1 agc cost0 = reach[id].costs[0][TRE_M_COST];
422 1.1 agc if (reach[id].params.cost_ins != TRE_PARAM_UNSET)
423 1.1 agc cost0 += reach[id].params.cost_ins;
424 1.1 agc else
425 1.1 agc cost0 += default_params.cost_ins;
426 1.1 agc }
427 1.1 agc
428 1.1 agc DPRINT((" insert: from %d to %d, cost %d: ", id, id,
429 1.1 agc reach[id].costs[depth][TRE_M_COST]));
430 1.1 agc if (reach_next[id].pos == pos
431 1.1 agc && (cost0 >= reach_next[id].costs[0][TRE_M_COST]))
432 1.1 agc {
433 1.1 agc DPRINT(("lose\n"));
434 1.1 agc continue;
435 1.1 agc }
436 1.1 agc DPRINT(("win\n"));
437 1.1 agc
438 1.1 agc /* Copy state, position, tags, parameters, and depth. */
439 1.1 agc reach_next[id].state = reach[id].state;
440 1.1 agc reach_next[id].pos = pos;
441 1.1 agc for (i = 0; i < num_tags; i++)
442 1.1 agc reach_next[id].tags[i] = reach[id].tags[i];
443 1.1 agc reach_next[id].params = reach[id].params;
444 1.1 agc reach_next[id].depth = reach[id].depth;
445 1.1 agc
446 1.1 agc /* Set the costs after this transition. */
447 1.1 agc memcpy(reach_next[id].costs, reach[id].costs,
448 1.1 agc sizeof(reach_next[id].costs[0][0])
449 1.1 agc * TRE_M_LAST * (depth + 1));
450 1.1 agc reach_next[id].costs[depth][TRE_M_COST] = cost;
451 1.1 agc reach_next[id].costs[depth][TRE_M_NUM_INS]++;
452 1.1 agc reach_next[id].costs[depth][TRE_M_NUM_ERR]++;
453 1.1 agc if (depth > 0)
454 1.1 agc {
455 1.1 agc reach_next[id].costs[0][TRE_M_COST] = cost0;
456 1.1 agc reach_next[id].costs[0][TRE_M_NUM_INS]++;
457 1.1 agc reach_next[id].costs[0][TRE_M_NUM_ERR]++;
458 1.1 agc }
459 1.1 agc
460 1.1 agc }
461 1.1 agc
462 1.1 agc
463 1.1 agc /* Handle deletes. This is done by traversing through the whole TNFA
464 1.1 agc pretending that all transitions are epsilon transitions, until
465 1.1 agc no more states can be reached with better costs. */
466 1.1 agc {
467 1.1 agc /* XXX - dynamic ringbuffer size */
468 1.1 agc tre_tnfa_approx_reach_t *ringbuffer[512];
469 1.1 agc tre_tnfa_approx_reach_t **deque_start, **deque_end;
470 1.1 agc
471 1.1 agc deque_start = deque_end = ringbuffer;
472 1.1 agc
473 1.1 agc /* Add all states in `reach_next' to the deque. */
474 1.1 agc for (id = 0; id < tnfa->num_states; id++)
475 1.1 agc {
476 1.1 agc if (reach_next[id].pos != pos)
477 1.1 agc continue;
478 1.1 agc *deque_end = &reach_next[id];
479 1.1 agc deque_end++;
480 1.1 agc assert(deque_end != deque_start);
481 1.1 agc }
482 1.1 agc
483 1.1 agc /* Repeat until the deque is empty. */
484 1.1 agc while (deque_end != deque_start)
485 1.1 agc {
486 1.1 agc tre_tnfa_approx_reach_t *reach_p;
487 1.1 agc int depth;
488 1.1 agc int cost, cost0;
489 1.1 agc tre_tnfa_transition_t *trans;
490 1.1 agc
491 1.1 agc /* Pop the first item off the deque. */
492 1.1 agc reach_p = *deque_start;
493 1.1 agc id = reach_p - reach_next;
494 1.1 agc depth = reach_p->depth;
495 1.1 agc
496 1.1 agc /* Compute cost at current depth. */
497 1.1 agc cost = reach_p->costs[depth][TRE_M_COST];
498 1.1 agc if (reach_p->params.cost_del != TRE_PARAM_UNSET)
499 1.1 agc cost += reach_p->params.cost_del;
500 1.1 agc
501 1.1 agc /* Check cost, number of deletes, and total number of errors
502 1.1 agc at current depth. */
503 1.1 agc if (cost > reach_p->params.max_cost
504 1.1 agc || (reach_p->costs[depth][TRE_M_NUM_DEL] + 1
505 1.1 agc > reach_p->params.max_del)
506 1.1 agc || (reach_p->costs[depth][TRE_M_NUM_ERR] + 1
507 1.1 agc > reach_p->params.max_err))
508 1.1 agc {
509 1.1 agc /* Too many errors or cost too large. */
510 1.1 agc DPRINT((" delete: from %03d: cost too large\n", id));
511 1.1 agc deque_start++;
512 1.1 agc if (deque_start >= (ringbuffer + 512))
513 1.1 agc deque_start = ringbuffer;
514 1.1 agc continue;
515 1.1 agc }
516 1.1 agc
517 1.1 agc /* Compute overall cost. */
518 1.1 agc cost0 = cost;
519 1.1 agc if (depth > 0)
520 1.1 agc {
521 1.1 agc cost0 = reach_p->costs[0][TRE_M_COST];
522 1.1 agc if (reach_p->params.cost_del != TRE_PARAM_UNSET)
523 1.1 agc cost0 += reach_p->params.cost_del;
524 1.1 agc else
525 1.1 agc cost0 += default_params.cost_del;
526 1.1 agc }
527 1.1 agc
528 1.1 agc for (trans = reach_p->state; trans->state; trans++)
529 1.1 agc {
530 1.1 agc int dest_id = trans->state_id;
531 1.1 agc DPRINT((" delete: from %03d to %03d, cost %d (%d): ",
532 1.1 agc id, dest_id, cost0, reach_p->params.max_cost));
533 1.1 agc
534 1.1 agc if (trans->assertions && CHECK_ASSERTIONS(trans->assertions))
535 1.1 agc {
536 1.1 agc DPRINT(("assertion failed\n"));
537 1.1 agc continue;
538 1.1 agc }
539 1.1 agc
540 1.1 agc /* Compute tag values after this transition. */
541 1.1 agc for (i = 0; i < num_tags; i++)
542 1.1 agc tmp_tags[i] = reach_p->tags[i];
543 1.1 agc if (trans->tags)
544 1.1 agc for (i = 0; trans->tags[i] >= 0; i++)
545 1.2 christos if ((size_t)trans->tags[i] < num_tags)
546 1.1 agc tmp_tags[trans->tags[i]] = pos;
547 1.1 agc
548 1.1 agc /* If another path has also reached this state, choose the one
549 1.1 agc with the smallest cost or best tags if costs are equal. */
550 1.1 agc if (reach_next[dest_id].pos == pos
551 1.1 agc && (cost0 > reach_next[dest_id].costs[0][TRE_M_COST]
552 1.1 agc || (cost0 == reach_next[dest_id].costs[0][TRE_M_COST]
553 1.1 agc && (!match_tags
554 1.1 agc || !tre_tag_order(num_tags,
555 1.1 agc tnfa->tag_directions,
556 1.1 agc tmp_tags,
557 1.1 agc reach_next[dest_id].tags)))))
558 1.1 agc {
559 1.1 agc DPRINT(("lose, cost0 %d, have %d\n",
560 1.1 agc cost0, reach_next[dest_id].costs[0][TRE_M_COST]));
561 1.1 agc continue;
562 1.1 agc }
563 1.1 agc DPRINT(("win\n"));
564 1.1 agc
565 1.1 agc /* Set state, position, tags, parameters, depth, and costs. */
566 1.1 agc reach_next[dest_id].state = trans->state;
567 1.1 agc reach_next[dest_id].pos = pos;
568 1.1 agc for (i = 0; i < num_tags; i++)
569 1.1 agc reach_next[dest_id].tags[i] = tmp_tags[i];
570 1.1 agc
571 1.1 agc reach_next[dest_id].params = reach_p->params;
572 1.1 agc if (trans->params)
573 1.1 agc tre_set_params(&reach_next[dest_id], trans->params,
574 1.1 agc default_params);
575 1.1 agc
576 1.1 agc reach_next[dest_id].depth = reach_p->depth;
577 1.1 agc memcpy(&reach_next[dest_id].costs,
578 1.1 agc reach_p->costs,
579 1.1 agc sizeof(reach_p->costs[0][0])
580 1.1 agc * TRE_M_LAST * (depth + 1));
581 1.1 agc reach_next[dest_id].costs[depth][TRE_M_COST] = cost;
582 1.1 agc reach_next[dest_id].costs[depth][TRE_M_NUM_DEL]++;
583 1.1 agc reach_next[dest_id].costs[depth][TRE_M_NUM_ERR]++;
584 1.1 agc if (depth > 0)
585 1.1 agc {
586 1.1 agc reach_next[dest_id].costs[0][TRE_M_COST] = cost0;
587 1.1 agc reach_next[dest_id].costs[0][TRE_M_NUM_DEL]++;
588 1.1 agc reach_next[dest_id].costs[0][TRE_M_NUM_ERR]++;
589 1.1 agc }
590 1.1 agc
591 1.1 agc if (trans->state == tnfa->final
592 1.1 agc && (match_eo < 0
593 1.1 agc || match_costs[TRE_M_COST] > cost0
594 1.1 agc || (match_costs[TRE_M_COST] == cost0
595 1.1 agc && (num_tags > 0
596 1.1 agc && tmp_tags[0] <= match_tags[0]))))
597 1.1 agc {
598 1.1 agc DPRINT((" setting new match at %d, cost %d\n",
599 1.1 agc pos, cost0));
600 1.1 agc match_eo = pos;
601 1.1 agc memcpy(match_costs, reach_next[dest_id].costs[0],
602 1.1 agc sizeof(match_costs[0]) * TRE_M_LAST);
603 1.1 agc for (i = 0; i < num_tags; i++)
604 1.1 agc match_tags[i] = tmp_tags[i];
605 1.1 agc }
606 1.1 agc
607 1.1 agc /* Add to the end of the deque. */
608 1.1 agc *deque_end = &reach_next[dest_id];
609 1.1 agc deque_end++;
610 1.1 agc if (deque_end >= (ringbuffer + 512))
611 1.1 agc deque_end = ringbuffer;
612 1.1 agc assert(deque_end != deque_start);
613 1.1 agc }
614 1.1 agc deque_start++;
615 1.1 agc if (deque_start >= (ringbuffer + 512))
616 1.1 agc deque_start = ringbuffer;
617 1.1 agc }
618 1.1 agc
619 1.1 agc }
620 1.1 agc
621 1.1 agc #ifdef TRE_DEBUG
622 1.1 agc tre_print_reach(tnfa, reach_next, pos, num_tags);
623 1.1 agc #endif /* TRE_DEBUG */
624 1.1 agc
625 1.1 agc /* Check for end of string. */
626 1.1 agc if (len < 0)
627 1.1 agc {
628 1.1 agc if (type == STR_USER)
629 1.1 agc {
630 1.1 agc if (str_user_end)
631 1.1 agc break;
632 1.1 agc }
633 1.1 agc else if (next_c == L'\0')
634 1.1 agc break;
635 1.1 agc }
636 1.1 agc else
637 1.1 agc {
638 1.1 agc if (pos >= len)
639 1.1 agc break;
640 1.1 agc }
641 1.1 agc
642 1.1 agc prev_pos = pos;
643 1.1 agc GET_NEXT_WCHAR();
644 1.1 agc
645 1.1 agc /* Swap `reach' and `reach_next'. */
646 1.1 agc {
647 1.1 agc tre_tnfa_approx_reach_t *tmp;
648 1.1 agc tmp = reach;
649 1.1 agc reach = reach_next;
650 1.1 agc reach_next = tmp;
651 1.1 agc }
652 1.1 agc
653 1.1 agc /* Handle exact matches and substitutions. */
654 1.1 agc for (id = 0; id < tnfa->num_states; id++)
655 1.1 agc {
656 1.1 agc tre_tnfa_transition_t *trans;
657 1.1 agc
658 1.1 agc if (reach[id].pos < prev_pos)
659 1.1 agc continue; /* Not reached. */
660 1.1 agc for (trans = reach[id].state; trans->state; trans++)
661 1.1 agc {
662 1.1 agc int dest_id;
663 1.1 agc int depth;
664 1.1 agc int cost, cost0, err;
665 1.1 agc
666 1.1 agc if (trans->assertions
667 1.1 agc && (CHECK_ASSERTIONS(trans->assertions)
668 1.1 agc || CHECK_CHAR_CLASSES(trans, tnfa, eflags)))
669 1.1 agc {
670 1.1 agc DPRINT((" exact, from %d: assert failed\n", id));
671 1.1 agc continue;
672 1.1 agc }
673 1.1 agc
674 1.1 agc depth = reach[id].depth;
675 1.1 agc dest_id = trans->state_id;
676 1.1 agc
677 1.1 agc cost = reach[id].costs[depth][TRE_M_COST];
678 1.1 agc cost0 = reach[id].costs[0][TRE_M_COST];
679 1.1 agc err = 0;
680 1.1 agc
681 1.1 agc if (trans->code_min > (tre_cint_t)prev_c
682 1.1 agc || trans->code_max < (tre_cint_t)prev_c)
683 1.1 agc {
684 1.1 agc /* Handle substitutes. The required character was not in
685 1.1 agc the string, so match it in place of whatever was supposed
686 1.1 agc to be there and increase costs accordingly. */
687 1.1 agc err = 1;
688 1.1 agc
689 1.1 agc /* Compute and check cost at current depth. */
690 1.1 agc cost = reach[id].costs[depth][TRE_M_COST];
691 1.1 agc if (reach[id].params.cost_subst != TRE_PARAM_UNSET)
692 1.1 agc cost += reach[id].params.cost_subst;
693 1.1 agc if (cost > reach[id].params.max_cost)
694 1.1 agc continue; /* Cost too large. */
695 1.1 agc
696 1.1 agc /* Check number of substitutes at current depth. */
697 1.1 agc if (reach[id].costs[depth][TRE_M_NUM_SUBST] + 1
698 1.1 agc > reach[id].params.max_subst)
699 1.1 agc continue; /* Too many substitutes. */
700 1.1 agc
701 1.1 agc /* Check total number of errors at current depth. */
702 1.1 agc if (reach[id].costs[depth][TRE_M_NUM_ERR] + 1
703 1.1 agc > reach[id].params.max_err)
704 1.1 agc continue; /* Too many errors. */
705 1.1 agc
706 1.1 agc /* Compute overall cost. */
707 1.1 agc cost0 = cost;
708 1.1 agc if (depth > 0)
709 1.1 agc {
710 1.1 agc cost0 = reach[id].costs[0][TRE_M_COST];
711 1.1 agc if (reach[id].params.cost_subst != TRE_PARAM_UNSET)
712 1.1 agc cost0 += reach[id].params.cost_subst;
713 1.1 agc else
714 1.1 agc cost0 += default_params.cost_subst;
715 1.1 agc }
716 1.1 agc DPRINT((" subst, from %03d to %03d, cost %d: ",
717 1.1 agc id, dest_id, cost0));
718 1.1 agc }
719 1.1 agc else
720 1.1 agc DPRINT((" exact, from %03d to %03d, cost %d: ",
721 1.1 agc id, dest_id, cost0));
722 1.1 agc
723 1.1 agc /* Compute tag values after this transition. */
724 1.1 agc for (i = 0; i < num_tags; i++)
725 1.1 agc tmp_tags[i] = reach[id].tags[i];
726 1.1 agc if (trans->tags)
727 1.1 agc for (i = 0; trans->tags[i] >= 0; i++)
728 1.2 christos if ((size_t)trans->tags[i] < num_tags)
729 1.1 agc tmp_tags[trans->tags[i]] = pos;
730 1.1 agc
731 1.1 agc /* If another path has also reached this state, choose the
732 1.1 agc one with the smallest cost or best tags if costs are equal. */
733 1.1 agc if (reach_next[dest_id].pos == pos
734 1.1 agc && (cost0 > reach_next[dest_id].costs[0][TRE_M_COST]
735 1.1 agc || (cost0 == reach_next[dest_id].costs[0][TRE_M_COST]
736 1.1 agc && !tre_tag_order(num_tags, tnfa->tag_directions,
737 1.1 agc tmp_tags,
738 1.1 agc reach_next[dest_id].tags))))
739 1.1 agc {
740 1.1 agc DPRINT(("lose\n"));
741 1.1 agc continue;
742 1.1 agc }
743 1.1 agc DPRINT(("win %d %d\n",
744 1.1 agc reach_next[dest_id].pos,
745 1.1 agc reach_next[dest_id].costs[0][TRE_M_COST]));
746 1.1 agc
747 1.1 agc /* Set state, position, tags, and depth. */
748 1.1 agc reach_next[dest_id].state = trans->state;
749 1.1 agc reach_next[dest_id].pos = pos;
750 1.1 agc for (i = 0; i < num_tags; i++)
751 1.1 agc reach_next[dest_id].tags[i] = tmp_tags[i];
752 1.1 agc reach_next[dest_id].depth = reach[id].depth;
753 1.1 agc
754 1.1 agc /* Set parameters. */
755 1.1 agc reach_next[dest_id].params = reach[id].params;
756 1.1 agc if (trans->params)
757 1.1 agc tre_set_params(&reach_next[dest_id], trans->params,
758 1.1 agc default_params);
759 1.1 agc
760 1.1 agc /* Set the costs after this transition. */
761 1.1 agc memcpy(&reach_next[dest_id].costs,
762 1.1 agc reach[id].costs,
763 1.1 agc sizeof(reach[id].costs[0][0])
764 1.1 agc * TRE_M_LAST * (depth + 1));
765 1.1 agc reach_next[dest_id].costs[depth][TRE_M_COST] = cost;
766 1.1 agc reach_next[dest_id].costs[depth][TRE_M_NUM_SUBST] += err;
767 1.1 agc reach_next[dest_id].costs[depth][TRE_M_NUM_ERR] += err;
768 1.1 agc if (depth > 0)
769 1.1 agc {
770 1.1 agc reach_next[dest_id].costs[0][TRE_M_COST] = cost0;
771 1.1 agc reach_next[dest_id].costs[0][TRE_M_NUM_SUBST] += err;
772 1.1 agc reach_next[dest_id].costs[0][TRE_M_NUM_ERR] += err;
773 1.1 agc }
774 1.1 agc
775 1.1 agc if (trans->state == tnfa->final
776 1.1 agc && (match_eo < 0
777 1.1 agc || cost0 < match_costs[TRE_M_COST]
778 1.1 agc || (cost0 == match_costs[TRE_M_COST]
779 1.1 agc && num_tags > 0 && tmp_tags[0] <= match_tags[0])))
780 1.1 agc {
781 1.1 agc DPRINT((" setting new match at %d, cost %d\n",
782 1.1 agc pos, cost0));
783 1.1 agc match_eo = pos;
784 1.1 agc for (i = 0; i < TRE_M_LAST; i++)
785 1.1 agc match_costs[i] = reach_next[dest_id].costs[0][i];
786 1.1 agc for (i = 0; i < num_tags; i++)
787 1.1 agc match_tags[i] = tmp_tags[i];
788 1.1 agc }
789 1.1 agc }
790 1.1 agc }
791 1.1 agc }
792 1.1 agc
793 1.1 agc DPRINT(("match end offset = %d, match cost = %d\n", match_eo,
794 1.1 agc match_costs[TRE_M_COST]));
795 1.1 agc
796 1.1 agc match->cost = match_costs[TRE_M_COST];
797 1.1 agc match->num_ins = match_costs[TRE_M_NUM_INS];
798 1.1 agc match->num_del = match_costs[TRE_M_NUM_DEL];
799 1.1 agc match->num_subst = match_costs[TRE_M_NUM_SUBST];
800 1.1 agc *match_end_ofs = match_eo;
801 1.1 agc
802 1.4 rin ret = match_eo >= 0 ? REG_OK : REG_NOMATCH;
803 1.4 rin error_exit:
804 1.4 rin #ifndef TRE_USE_ALLOCA
805 1.4 rin if (buf)
806 1.4 rin xfree(buf);
807 1.4 rin #endif /* !TRE_USE_ALLOCA */
808 1.4 rin return ret;
809 1.1 agc }
810