tre-match-approx.c revision 1.7 1 1.1 agc /*
2 1.1 agc tre-match-approx.c - TRE approximate regex matching engine
3 1.1 agc
4 1.1 agc This software is released under a BSD-style license.
5 1.1 agc See the file LICENSE for details and copyright.
6 1.1 agc
7 1.1 agc */
8 1.1 agc #ifdef HAVE_CONFIG_H
9 1.1 agc #include <config.h>
10 1.1 agc #endif /* HAVE_CONFIG_H */
11 1.1 agc
12 1.2 christos #include "tre-alloca.h"
13 1.1 agc
14 1.1 agc #define __USE_STRING_INLINES
15 1.1 agc #undef __NO_INLINE__
16 1.1 agc
17 1.1 agc #include <assert.h>
18 1.1 agc #include <stdlib.h>
19 1.1 agc #include <string.h>
20 1.1 agc #include <limits.h>
21 1.1 agc #ifdef HAVE_WCHAR_H
22 1.1 agc #include <wchar.h>
23 1.1 agc #endif /* HAVE_WCHAR_H */
24 1.1 agc #ifdef HAVE_WCTYPE_H
25 1.1 agc #include <wctype.h>
26 1.1 agc #endif /* HAVE_WCTYPE_H */
27 1.1 agc #ifndef TRE_WCHAR
28 1.1 agc #include <ctype.h>
29 1.1 agc #endif /* !TRE_WCHAR */
30 1.1 agc #ifdef HAVE_MALLOC_H
31 1.1 agc #include <malloc.h>
32 1.1 agc #endif /* HAVE_MALLOC_H */
33 1.6 rin #include <stdint.h>
34 1.1 agc
35 1.1 agc #include "tre-internal.h"
36 1.1 agc #include "tre-match-utils.h"
37 1.1 agc #include "tre.h"
38 1.1 agc #include "xmalloc.h"
39 1.1 agc
40 1.1 agc #define TRE_M_COST 0
41 1.1 agc #define TRE_M_NUM_INS 1
42 1.1 agc #define TRE_M_NUM_DEL 2
43 1.1 agc #define TRE_M_NUM_SUBST 3
44 1.1 agc #define TRE_M_NUM_ERR 4
45 1.1 agc #define TRE_M_LAST 5
46 1.1 agc
47 1.1 agc #define TRE_M_MAX_DEPTH 3
48 1.1 agc
49 1.1 agc typedef struct {
50 1.1 agc /* State in the TNFA transition table. */
51 1.1 agc tre_tnfa_transition_t *state;
52 1.1 agc /* Position in input string. */
53 1.1 agc int pos;
54 1.1 agc /* Tag values. */
55 1.1 agc int *tags;
56 1.1 agc /* Matching parameters. */
57 1.1 agc regaparams_t params;
58 1.1 agc /* Nesting depth of parameters. This is used as an index in
59 1.1 agc the `costs' array. */
60 1.1 agc int depth;
61 1.1 agc /* Costs and counter values for different parameter nesting depths. */
62 1.1 agc int costs[TRE_M_MAX_DEPTH + 1][TRE_M_LAST];
63 1.1 agc } tre_tnfa_approx_reach_t;
64 1.1 agc
65 1.1 agc
66 1.1 agc #ifdef TRE_DEBUG
67 1.1 agc /* Prints the `reach' array in a readable fashion with DPRINT. */
68 1.1 agc static void
69 1.1 agc tre_print_reach(const tre_tnfa_t *tnfa, tre_tnfa_approx_reach_t *reach,
70 1.2 christos int pos, size_t num_tags)
71 1.1 agc {
72 1.1 agc int id;
73 1.1 agc
74 1.1 agc /* Print each state on one line. */
75 1.1 agc DPRINT((" reach:\n"));
76 1.1 agc for (id = 0; id < tnfa->num_states; id++)
77 1.1 agc {
78 1.1 agc int i, j;
79 1.1 agc if (reach[id].pos < pos)
80 1.1 agc continue; /* Not reached. */
81 1.1 agc DPRINT((" %03d, costs ", id));
82 1.1 agc for (i = 0; i <= reach[id].depth; i++)
83 1.1 agc {
84 1.1 agc DPRINT(("["));
85 1.1 agc for (j = 0; j < TRE_M_LAST; j++)
86 1.1 agc {
87 1.1 agc DPRINT(("%2d", reach[id].costs[i][j]));
88 1.1 agc if (j + 1 < TRE_M_LAST)
89 1.1 agc DPRINT((","));
90 1.1 agc }
91 1.1 agc DPRINT(("]"));
92 1.1 agc if (i + 1 <= reach[id].depth)
93 1.1 agc DPRINT((", "));
94 1.1 agc }
95 1.1 agc DPRINT(("\n tags "));
96 1.1 agc for (i = 0; i < num_tags; i++)
97 1.1 agc {
98 1.1 agc DPRINT(("%02d", reach[id].tags[i]));
99 1.1 agc if (i + 1 < num_tags)
100 1.1 agc DPRINT((","));
101 1.1 agc }
102 1.1 agc DPRINT(("\n"));
103 1.1 agc }
104 1.1 agc DPRINT(("\n"));
105 1.1 agc }
106 1.1 agc #endif /* TRE_DEBUG */
107 1.1 agc
108 1.1 agc
109 1.1 agc /* Sets the matching parameters in `reach' to the ones defined in the `pa'
110 1.1 agc array. If `pa' specifies default values, they are taken from
111 1.1 agc `default_params'. */
112 1.1 agc inline static void
113 1.1 agc tre_set_params(tre_tnfa_approx_reach_t *reach,
114 1.1 agc int *pa, regaparams_t default_params)
115 1.1 agc {
116 1.1 agc int value;
117 1.1 agc
118 1.1 agc /* If depth is increased reset costs and counters to zero for the
119 1.1 agc new levels. */
120 1.1 agc value = pa[TRE_PARAM_DEPTH];
121 1.1 agc assert(value <= TRE_M_MAX_DEPTH);
122 1.1 agc if (value > reach->depth)
123 1.1 agc {
124 1.1 agc int i, j;
125 1.1 agc for (i = reach->depth + 1; i <= value; i++)
126 1.1 agc for (j = 0; j < TRE_M_LAST; j++)
127 1.1 agc reach->costs[i][j] = 0;
128 1.1 agc }
129 1.1 agc reach->depth = value;
130 1.1 agc
131 1.1 agc /* Set insert cost. */
132 1.1 agc value = pa[TRE_PARAM_COST_INS];
133 1.1 agc if (value == TRE_PARAM_DEFAULT)
134 1.1 agc reach->params.cost_ins = default_params.cost_ins;
135 1.1 agc else if (value != TRE_PARAM_UNSET)
136 1.1 agc reach->params.cost_ins = value;
137 1.1 agc
138 1.1 agc /* Set delete cost. */
139 1.1 agc value = pa[TRE_PARAM_COST_DEL];
140 1.1 agc if (value == TRE_PARAM_DEFAULT)
141 1.1 agc reach->params.cost_del = default_params.cost_del;
142 1.1 agc else if (value != TRE_PARAM_UNSET)
143 1.1 agc reach->params.cost_del = value;
144 1.1 agc
145 1.1 agc /* Set substitute cost. */
146 1.1 agc value = pa[TRE_PARAM_COST_SUBST];
147 1.1 agc if (value == TRE_PARAM_DEFAULT)
148 1.1 agc reach->params.cost_subst = default_params.cost_subst;
149 1.1 agc else
150 1.1 agc reach->params.cost_subst = value;
151 1.1 agc
152 1.1 agc /* Set maximum cost. */
153 1.1 agc value = pa[TRE_PARAM_COST_MAX];
154 1.1 agc if (value == TRE_PARAM_DEFAULT)
155 1.1 agc reach->params.max_cost = default_params.max_cost;
156 1.1 agc else if (value != TRE_PARAM_UNSET)
157 1.1 agc reach->params.max_cost = value;
158 1.1 agc
159 1.1 agc /* Set maximum inserts. */
160 1.1 agc value = pa[TRE_PARAM_MAX_INS];
161 1.1 agc if (value == TRE_PARAM_DEFAULT)
162 1.1 agc reach->params.max_ins = default_params.max_ins;
163 1.1 agc else if (value != TRE_PARAM_UNSET)
164 1.1 agc reach->params.max_ins = value;
165 1.1 agc
166 1.1 agc /* Set maximum deletes. */
167 1.1 agc value = pa[TRE_PARAM_MAX_DEL];
168 1.1 agc if (value == TRE_PARAM_DEFAULT)
169 1.1 agc reach->params.max_del = default_params.max_del;
170 1.1 agc else if (value != TRE_PARAM_UNSET)
171 1.1 agc reach->params.max_del = value;
172 1.1 agc
173 1.1 agc /* Set maximum substitutes. */
174 1.1 agc value = pa[TRE_PARAM_MAX_SUBST];
175 1.1 agc if (value == TRE_PARAM_DEFAULT)
176 1.1 agc reach->params.max_subst = default_params.max_subst;
177 1.1 agc else if (value != TRE_PARAM_UNSET)
178 1.1 agc reach->params.max_subst = value;
179 1.1 agc
180 1.1 agc /* Set maximum number of errors. */
181 1.1 agc value = pa[TRE_PARAM_MAX_ERR];
182 1.1 agc if (value == TRE_PARAM_DEFAULT)
183 1.1 agc reach->params.max_err = default_params.max_err;
184 1.1 agc else if (value != TRE_PARAM_UNSET)
185 1.1 agc reach->params.max_err = value;
186 1.1 agc }
187 1.1 agc
188 1.1 agc reg_errcode_t
189 1.1 agc tre_tnfa_run_approx(const tre_tnfa_t *tnfa, const void *string, int len,
190 1.1 agc tre_str_type_t type, int *match_tags,
191 1.1 agc regamatch_t *match, regaparams_t default_params,
192 1.1 agc int eflags, int *match_end_ofs)
193 1.1 agc {
194 1.1 agc /* State variables required by GET_NEXT_WCHAR. */
195 1.1 agc tre_char_t prev_c = 0, next_c = 0;
196 1.1 agc const char *str_byte = string;
197 1.1 agc int pos = -1;
198 1.1 agc unsigned int pos_add_next = 1;
199 1.1 agc #ifdef TRE_WCHAR
200 1.1 agc const wchar_t *str_wide = string;
201 1.1 agc #ifdef TRE_MBSTATE
202 1.1 agc mbstate_t mbstate;
203 1.1 agc #endif /* !TRE_WCHAR */
204 1.1 agc #endif /* TRE_WCHAR */
205 1.1 agc int reg_notbol = eflags & REG_NOTBOL;
206 1.1 agc int reg_noteol = eflags & REG_NOTEOL;
207 1.1 agc int reg_newline = tnfa->cflags & REG_NEWLINE;
208 1.2 christos size_t str_user_end = 0;
209 1.1 agc
210 1.1 agc int prev_pos;
211 1.1 agc
212 1.1 agc /* Number of tags. */
213 1.2 christos size_t num_tags;
214 1.1 agc /* The reach tables. */
215 1.1 agc tre_tnfa_approx_reach_t *reach, *reach_next;
216 1.1 agc /* Tag array for temporary use. */
217 1.1 agc int *tmp_tags;
218 1.1 agc
219 1.1 agc /* End offset of best match so far, or -1 if no match found yet. */
220 1.1 agc int match_eo = -1;
221 1.1 agc /* Costs of the match. */
222 1.1 agc int match_costs[TRE_M_LAST];
223 1.1 agc
224 1.1 agc /* Space for temporary data required for matching. */
225 1.1 agc unsigned char *buf;
226 1.1 agc
227 1.2 christos size_t i, id;
228 1.1 agc
229 1.4 rin reg_errcode_t ret;
230 1.4 rin
231 1.1 agc if (!match_tags)
232 1.1 agc num_tags = 0;
233 1.1 agc else
234 1.1 agc num_tags = tnfa->num_tags;
235 1.1 agc
236 1.1 agc #ifdef TRE_MBSTATE
237 1.1 agc memset(&mbstate, '\0', sizeof(mbstate));
238 1.1 agc #endif /* TRE_MBSTATE */
239 1.1 agc
240 1.1 agc DPRINT(("tre_tnfa_run_approx, input type %d, len %d, eflags %d, "
241 1.1 agc "match_tags %p\n",
242 1.1 agc type, len, eflags,
243 1.1 agc match_tags));
244 1.1 agc DPRINT(("max cost %d, ins %d, del %d, subst %d\n",
245 1.1 agc default_params.max_cost,
246 1.1 agc default_params.cost_ins,
247 1.1 agc default_params.cost_del,
248 1.1 agc default_params.cost_subst));
249 1.1 agc
250 1.1 agc /* Allocate memory for temporary data required for matching. This needs to
251 1.1 agc be done for every matching operation to be thread safe. This allocates
252 1.1 agc everything in a single large block from the stack frame using alloca()
253 1.1 agc or with malloc() if alloca is unavailable. */
254 1.1 agc {
255 1.1 agc unsigned char *buf_cursor;
256 1.5 rin
257 1.5 rin /* Ensure that tag_bytes*num_states cannot overflow, and that it don't
258 1.5 rin * contribute more than 1/8 of SIZE_MAX to total_bytes. */
259 1.5 rin if (num_tags > SIZE_MAX/(8 * sizeof(*tmp_tags) * tnfa->num_states))
260 1.5 rin return REG_ESPACE;
261 1.5 rin
262 1.5 rin /* Likewise check reach_bytes. */
263 1.5 rin if (tnfa->num_states > SIZE_MAX/(8 * sizeof(*reach_next)))
264 1.5 rin return REG_ESPACE;
265 1.5 rin
266 1.1 agc /* Space needed for one array of tags. */
267 1.2 christos size_t tag_bytes = sizeof(*tmp_tags) * num_tags;
268 1.1 agc /* Space needed for one reach table. */
269 1.2 christos size_t reach_bytes = sizeof(*reach_next) * tnfa->num_states;
270 1.1 agc /* Total space needed. */
271 1.2 christos size_t total_bytes = reach_bytes * 2 + (tnfa->num_states * 2 + 1 ) * tag_bytes;
272 1.1 agc /* Add some extra to make sure we can align the pointers. The multiplier
273 1.1 agc used here must be equal to the number of ALIGN calls below. */
274 1.1 agc total_bytes += (sizeof(long) - 1) * 3;
275 1.1 agc
276 1.1 agc /* Allocate the memory. */
277 1.1 agc #ifdef TRE_USE_ALLOCA
278 1.1 agc buf = alloca(total_bytes);
279 1.1 agc #else /* !TRE_USE_ALLOCA */
280 1.1 agc buf = xmalloc((unsigned)total_bytes);
281 1.1 agc #endif /* !TRE_USE_ALLOCA */
282 1.1 agc if (!buf)
283 1.1 agc return REG_ESPACE;
284 1.1 agc memset(buf, 0, (size_t)total_bytes);
285 1.1 agc
286 1.1 agc /* Allocate `tmp_tags' from `buf'. */
287 1.1 agc tmp_tags = (void *)buf;
288 1.1 agc buf_cursor = buf + tag_bytes;
289 1.1 agc buf_cursor += ALIGN(buf_cursor, long);
290 1.1 agc
291 1.1 agc /* Allocate `reach' from `buf'. */
292 1.1 agc reach = (void *)buf_cursor;
293 1.1 agc buf_cursor += reach_bytes;
294 1.1 agc buf_cursor += ALIGN(buf_cursor, long);
295 1.1 agc
296 1.1 agc /* Allocate `reach_next' from `buf'. */
297 1.1 agc reach_next = (void *)buf_cursor;
298 1.1 agc buf_cursor += reach_bytes;
299 1.1 agc buf_cursor += ALIGN(buf_cursor, long);
300 1.1 agc
301 1.1 agc /* Allocate tag arrays for `reach' and `reach_next' from `buf'. */
302 1.1 agc for (i = 0; i < tnfa->num_states; i++)
303 1.1 agc {
304 1.1 agc reach[i].tags = (void *)buf_cursor;
305 1.1 agc buf_cursor += tag_bytes;
306 1.1 agc reach_next[i].tags = (void *)buf_cursor;
307 1.1 agc buf_cursor += tag_bytes;
308 1.1 agc }
309 1.1 agc assert(buf_cursor <= buf + total_bytes);
310 1.1 agc }
311 1.1 agc
312 1.1 agc for (i = 0; i < TRE_M_LAST; i++)
313 1.1 agc match_costs[i] = INT_MAX;
314 1.1 agc
315 1.1 agc /* Mark the reach arrays empty. */
316 1.1 agc for (i = 0; i < tnfa->num_states; i++)
317 1.1 agc reach[i].pos = reach_next[i].pos = -2;
318 1.1 agc
319 1.1 agc prev_pos = pos;
320 1.1 agc GET_NEXT_WCHAR();
321 1.1 agc pos = 0;
322 1.1 agc
323 1.3 rin while (/*CONSTCOND*/(void)1,1)
324 1.1 agc {
325 1.1 agc DPRINT(("%03d:%2lc/%05d\n", pos, (tre_cint_t)next_c, (int)next_c));
326 1.1 agc
327 1.1 agc /* Add initial states to `reach_next' if an exact match has not yet
328 1.1 agc been found. */
329 1.1 agc if (match_costs[TRE_M_COST] > 0)
330 1.1 agc {
331 1.1 agc tre_tnfa_transition_t *trans;
332 1.1 agc DPRINT((" init"));
333 1.1 agc for (trans = tnfa->initial; trans->state; trans++)
334 1.1 agc {
335 1.1 agc int stateid = trans->state_id;
336 1.1 agc
337 1.1 agc /* If this state is not currently in `reach_next', add it
338 1.1 agc there. */
339 1.1 agc if (reach_next[stateid].pos < pos)
340 1.1 agc {
341 1.1 agc if (trans->assertions && CHECK_ASSERTIONS(trans->assertions))
342 1.1 agc {
343 1.1 agc /* Assertions failed, don't add this state. */
344 1.1 agc DPRINT((" !%d (assert)", stateid));
345 1.1 agc continue;
346 1.1 agc }
347 1.1 agc DPRINT((" %d", stateid));
348 1.1 agc reach_next[stateid].state = trans->state;
349 1.1 agc reach_next[stateid].pos = pos;
350 1.1 agc
351 1.1 agc /* Compute tag values after this transition. */
352 1.1 agc for (i = 0; i < num_tags; i++)
353 1.1 agc reach_next[stateid].tags[i] = -1;
354 1.1 agc
355 1.1 agc if (trans->tags)
356 1.1 agc for (i = 0; trans->tags[i] >= 0; i++)
357 1.2 christos if ((size_t)trans->tags[i] < num_tags)
358 1.1 agc reach_next[stateid].tags[trans->tags[i]] = pos;
359 1.1 agc
360 1.1 agc /* Set the parameters, depth, and costs. */
361 1.1 agc reach_next[stateid].params = default_params;
362 1.1 agc reach_next[stateid].depth = 0;
363 1.1 agc for (i = 0; i < TRE_M_LAST; i++)
364 1.1 agc reach_next[stateid].costs[0][i] = 0;
365 1.1 agc if (trans->params)
366 1.1 agc tre_set_params(&reach_next[stateid], trans->params,
367 1.1 agc default_params);
368 1.1 agc
369 1.1 agc /* If this is the final state, mark the exact match. */
370 1.1 agc if (trans->state == tnfa->final)
371 1.1 agc {
372 1.1 agc match_eo = pos;
373 1.1 agc for (i = 0; i < num_tags; i++)
374 1.1 agc match_tags[i] = reach_next[stateid].tags[i];
375 1.1 agc for (i = 0; i < TRE_M_LAST; i++)
376 1.1 agc match_costs[i] = 0;
377 1.1 agc }
378 1.1 agc }
379 1.1 agc }
380 1.1 agc DPRINT(("\n"));
381 1.1 agc }
382 1.1 agc
383 1.1 agc
384 1.1 agc /* Handle inserts. This is done by pretending there's an epsilon
385 1.1 agc transition from each state in `reach' back to the same state.
386 1.1 agc We don't need to worry about the final state here; this will never
387 1.1 agc give a better match than what we already have. */
388 1.1 agc for (id = 0; id < tnfa->num_states; id++)
389 1.1 agc {
390 1.1 agc int depth;
391 1.1 agc int cost, cost0;
392 1.1 agc
393 1.1 agc if (reach[id].pos != prev_pos)
394 1.1 agc {
395 1.1 agc DPRINT((" insert: %d not reached\n", id));
396 1.1 agc continue; /* Not reached. */
397 1.1 agc }
398 1.1 agc
399 1.1 agc depth = reach[id].depth;
400 1.1 agc
401 1.1 agc /* Compute and check cost at current depth. */
402 1.1 agc cost = reach[id].costs[depth][TRE_M_COST];
403 1.1 agc if (reach[id].params.cost_ins != TRE_PARAM_UNSET)
404 1.1 agc cost += reach[id].params.cost_ins;
405 1.1 agc if (cost > reach[id].params.max_cost)
406 1.1 agc continue; /* Cost too large. */
407 1.1 agc
408 1.1 agc /* Check number of inserts at current depth. */
409 1.1 agc if (reach[id].costs[depth][TRE_M_NUM_INS] + 1
410 1.1 agc > reach[id].params.max_ins)
411 1.1 agc continue; /* Too many inserts. */
412 1.1 agc
413 1.1 agc /* Check total number of errors at current depth. */
414 1.1 agc if (reach[id].costs[depth][TRE_M_NUM_ERR] + 1
415 1.1 agc > reach[id].params.max_err)
416 1.1 agc continue; /* Too many errors. */
417 1.1 agc
418 1.1 agc /* Compute overall cost. */
419 1.1 agc cost0 = cost;
420 1.1 agc if (depth > 0)
421 1.1 agc {
422 1.1 agc cost0 = reach[id].costs[0][TRE_M_COST];
423 1.1 agc if (reach[id].params.cost_ins != TRE_PARAM_UNSET)
424 1.1 agc cost0 += reach[id].params.cost_ins;
425 1.1 agc else
426 1.1 agc cost0 += default_params.cost_ins;
427 1.1 agc }
428 1.1 agc
429 1.1 agc DPRINT((" insert: from %d to %d, cost %d: ", id, id,
430 1.1 agc reach[id].costs[depth][TRE_M_COST]));
431 1.1 agc if (reach_next[id].pos == pos
432 1.1 agc && (cost0 >= reach_next[id].costs[0][TRE_M_COST]))
433 1.1 agc {
434 1.1 agc DPRINT(("lose\n"));
435 1.1 agc continue;
436 1.1 agc }
437 1.1 agc DPRINT(("win\n"));
438 1.1 agc
439 1.1 agc /* Copy state, position, tags, parameters, and depth. */
440 1.1 agc reach_next[id].state = reach[id].state;
441 1.1 agc reach_next[id].pos = pos;
442 1.1 agc for (i = 0; i < num_tags; i++)
443 1.1 agc reach_next[id].tags[i] = reach[id].tags[i];
444 1.1 agc reach_next[id].params = reach[id].params;
445 1.1 agc reach_next[id].depth = reach[id].depth;
446 1.1 agc
447 1.1 agc /* Set the costs after this transition. */
448 1.1 agc memcpy(reach_next[id].costs, reach[id].costs,
449 1.1 agc sizeof(reach_next[id].costs[0][0])
450 1.1 agc * TRE_M_LAST * (depth + 1));
451 1.1 agc reach_next[id].costs[depth][TRE_M_COST] = cost;
452 1.1 agc reach_next[id].costs[depth][TRE_M_NUM_INS]++;
453 1.1 agc reach_next[id].costs[depth][TRE_M_NUM_ERR]++;
454 1.1 agc if (depth > 0)
455 1.1 agc {
456 1.1 agc reach_next[id].costs[0][TRE_M_COST] = cost0;
457 1.1 agc reach_next[id].costs[0][TRE_M_NUM_INS]++;
458 1.1 agc reach_next[id].costs[0][TRE_M_NUM_ERR]++;
459 1.1 agc }
460 1.1 agc
461 1.1 agc }
462 1.1 agc
463 1.1 agc
464 1.1 agc /* Handle deletes. This is done by traversing through the whole TNFA
465 1.1 agc pretending that all transitions are epsilon transitions, until
466 1.1 agc no more states can be reached with better costs. */
467 1.1 agc {
468 1.1 agc /* XXX - dynamic ringbuffer size */
469 1.1 agc tre_tnfa_approx_reach_t *ringbuffer[512];
470 1.1 agc tre_tnfa_approx_reach_t **deque_start, **deque_end;
471 1.1 agc
472 1.1 agc deque_start = deque_end = ringbuffer;
473 1.1 agc
474 1.1 agc /* Add all states in `reach_next' to the deque. */
475 1.1 agc for (id = 0; id < tnfa->num_states; id++)
476 1.1 agc {
477 1.1 agc if (reach_next[id].pos != pos)
478 1.1 agc continue;
479 1.1 agc *deque_end = &reach_next[id];
480 1.1 agc deque_end++;
481 1.1 agc assert(deque_end != deque_start);
482 1.1 agc }
483 1.1 agc
484 1.1 agc /* Repeat until the deque is empty. */
485 1.1 agc while (deque_end != deque_start)
486 1.1 agc {
487 1.1 agc tre_tnfa_approx_reach_t *reach_p;
488 1.1 agc int depth;
489 1.1 agc int cost, cost0;
490 1.1 agc tre_tnfa_transition_t *trans;
491 1.1 agc
492 1.1 agc /* Pop the first item off the deque. */
493 1.1 agc reach_p = *deque_start;
494 1.1 agc id = reach_p - reach_next;
495 1.1 agc depth = reach_p->depth;
496 1.1 agc
497 1.1 agc /* Compute cost at current depth. */
498 1.1 agc cost = reach_p->costs[depth][TRE_M_COST];
499 1.1 agc if (reach_p->params.cost_del != TRE_PARAM_UNSET)
500 1.1 agc cost += reach_p->params.cost_del;
501 1.1 agc
502 1.1 agc /* Check cost, number of deletes, and total number of errors
503 1.1 agc at current depth. */
504 1.1 agc if (cost > reach_p->params.max_cost
505 1.1 agc || (reach_p->costs[depth][TRE_M_NUM_DEL] + 1
506 1.1 agc > reach_p->params.max_del)
507 1.1 agc || (reach_p->costs[depth][TRE_M_NUM_ERR] + 1
508 1.1 agc > reach_p->params.max_err))
509 1.1 agc {
510 1.1 agc /* Too many errors or cost too large. */
511 1.1 agc DPRINT((" delete: from %03d: cost too large\n", id));
512 1.1 agc deque_start++;
513 1.1 agc if (deque_start >= (ringbuffer + 512))
514 1.1 agc deque_start = ringbuffer;
515 1.1 agc continue;
516 1.1 agc }
517 1.1 agc
518 1.1 agc /* Compute overall cost. */
519 1.1 agc cost0 = cost;
520 1.1 agc if (depth > 0)
521 1.1 agc {
522 1.1 agc cost0 = reach_p->costs[0][TRE_M_COST];
523 1.1 agc if (reach_p->params.cost_del != TRE_PARAM_UNSET)
524 1.1 agc cost0 += reach_p->params.cost_del;
525 1.1 agc else
526 1.1 agc cost0 += default_params.cost_del;
527 1.1 agc }
528 1.1 agc
529 1.1 agc for (trans = reach_p->state; trans->state; trans++)
530 1.1 agc {
531 1.1 agc int dest_id = trans->state_id;
532 1.1 agc DPRINT((" delete: from %03d to %03d, cost %d (%d): ",
533 1.1 agc id, dest_id, cost0, reach_p->params.max_cost));
534 1.1 agc
535 1.1 agc if (trans->assertions && CHECK_ASSERTIONS(trans->assertions))
536 1.1 agc {
537 1.1 agc DPRINT(("assertion failed\n"));
538 1.1 agc continue;
539 1.1 agc }
540 1.1 agc
541 1.1 agc /* Compute tag values after this transition. */
542 1.1 agc for (i = 0; i < num_tags; i++)
543 1.1 agc tmp_tags[i] = reach_p->tags[i];
544 1.1 agc if (trans->tags)
545 1.1 agc for (i = 0; trans->tags[i] >= 0; i++)
546 1.2 christos if ((size_t)trans->tags[i] < num_tags)
547 1.1 agc tmp_tags[trans->tags[i]] = pos;
548 1.1 agc
549 1.1 agc /* If another path has also reached this state, choose the one
550 1.1 agc with the smallest cost or best tags if costs are equal. */
551 1.1 agc if (reach_next[dest_id].pos == pos
552 1.1 agc && (cost0 > reach_next[dest_id].costs[0][TRE_M_COST]
553 1.1 agc || (cost0 == reach_next[dest_id].costs[0][TRE_M_COST]
554 1.1 agc && (!match_tags
555 1.1 agc || !tre_tag_order(num_tags,
556 1.1 agc tnfa->tag_directions,
557 1.1 agc tmp_tags,
558 1.1 agc reach_next[dest_id].tags)))))
559 1.1 agc {
560 1.1 agc DPRINT(("lose, cost0 %d, have %d\n",
561 1.1 agc cost0, reach_next[dest_id].costs[0][TRE_M_COST]));
562 1.1 agc continue;
563 1.1 agc }
564 1.1 agc DPRINT(("win\n"));
565 1.1 agc
566 1.1 agc /* Set state, position, tags, parameters, depth, and costs. */
567 1.1 agc reach_next[dest_id].state = trans->state;
568 1.1 agc reach_next[dest_id].pos = pos;
569 1.1 agc for (i = 0; i < num_tags; i++)
570 1.1 agc reach_next[dest_id].tags[i] = tmp_tags[i];
571 1.1 agc
572 1.1 agc reach_next[dest_id].params = reach_p->params;
573 1.1 agc if (trans->params)
574 1.1 agc tre_set_params(&reach_next[dest_id], trans->params,
575 1.1 agc default_params);
576 1.1 agc
577 1.1 agc reach_next[dest_id].depth = reach_p->depth;
578 1.1 agc memcpy(&reach_next[dest_id].costs,
579 1.1 agc reach_p->costs,
580 1.1 agc sizeof(reach_p->costs[0][0])
581 1.1 agc * TRE_M_LAST * (depth + 1));
582 1.1 agc reach_next[dest_id].costs[depth][TRE_M_COST] = cost;
583 1.1 agc reach_next[dest_id].costs[depth][TRE_M_NUM_DEL]++;
584 1.1 agc reach_next[dest_id].costs[depth][TRE_M_NUM_ERR]++;
585 1.1 agc if (depth > 0)
586 1.1 agc {
587 1.1 agc reach_next[dest_id].costs[0][TRE_M_COST] = cost0;
588 1.1 agc reach_next[dest_id].costs[0][TRE_M_NUM_DEL]++;
589 1.1 agc reach_next[dest_id].costs[0][TRE_M_NUM_ERR]++;
590 1.1 agc }
591 1.1 agc
592 1.1 agc if (trans->state == tnfa->final
593 1.1 agc && (match_eo < 0
594 1.1 agc || match_costs[TRE_M_COST] > cost0
595 1.1 agc || (match_costs[TRE_M_COST] == cost0
596 1.1 agc && (num_tags > 0
597 1.1 agc && tmp_tags[0] <= match_tags[0]))))
598 1.1 agc {
599 1.1 agc DPRINT((" setting new match at %d, cost %d\n",
600 1.1 agc pos, cost0));
601 1.1 agc match_eo = pos;
602 1.1 agc memcpy(match_costs, reach_next[dest_id].costs[0],
603 1.1 agc sizeof(match_costs[0]) * TRE_M_LAST);
604 1.1 agc for (i = 0; i < num_tags; i++)
605 1.1 agc match_tags[i] = tmp_tags[i];
606 1.1 agc }
607 1.1 agc
608 1.1 agc /* Add to the end of the deque. */
609 1.1 agc *deque_end = &reach_next[dest_id];
610 1.1 agc deque_end++;
611 1.1 agc if (deque_end >= (ringbuffer + 512))
612 1.1 agc deque_end = ringbuffer;
613 1.1 agc assert(deque_end != deque_start);
614 1.1 agc }
615 1.1 agc deque_start++;
616 1.1 agc if (deque_start >= (ringbuffer + 512))
617 1.1 agc deque_start = ringbuffer;
618 1.1 agc }
619 1.1 agc
620 1.1 agc }
621 1.1 agc
622 1.1 agc #ifdef TRE_DEBUG
623 1.1 agc tre_print_reach(tnfa, reach_next, pos, num_tags);
624 1.1 agc #endif /* TRE_DEBUG */
625 1.1 agc
626 1.1 agc /* Check for end of string. */
627 1.1 agc if (len < 0)
628 1.1 agc {
629 1.1 agc if (type == STR_USER)
630 1.1 agc {
631 1.1 agc if (str_user_end)
632 1.1 agc break;
633 1.1 agc }
634 1.1 agc else if (next_c == L'\0')
635 1.1 agc break;
636 1.1 agc }
637 1.1 agc else
638 1.1 agc {
639 1.1 agc if (pos >= len)
640 1.1 agc break;
641 1.1 agc }
642 1.1 agc
643 1.1 agc prev_pos = pos;
644 1.1 agc GET_NEXT_WCHAR();
645 1.1 agc
646 1.1 agc /* Swap `reach' and `reach_next'. */
647 1.1 agc {
648 1.1 agc tre_tnfa_approx_reach_t *tmp;
649 1.1 agc tmp = reach;
650 1.1 agc reach = reach_next;
651 1.1 agc reach_next = tmp;
652 1.1 agc }
653 1.1 agc
654 1.1 agc /* Handle exact matches and substitutions. */
655 1.1 agc for (id = 0; id < tnfa->num_states; id++)
656 1.1 agc {
657 1.1 agc tre_tnfa_transition_t *trans;
658 1.1 agc
659 1.1 agc if (reach[id].pos < prev_pos)
660 1.1 agc continue; /* Not reached. */
661 1.1 agc for (trans = reach[id].state; trans->state; trans++)
662 1.1 agc {
663 1.1 agc int dest_id;
664 1.1 agc int depth;
665 1.1 agc int cost, cost0, err;
666 1.1 agc
667 1.1 agc if (trans->assertions
668 1.1 agc && (CHECK_ASSERTIONS(trans->assertions)
669 1.1 agc || CHECK_CHAR_CLASSES(trans, tnfa, eflags)))
670 1.1 agc {
671 1.1 agc DPRINT((" exact, from %d: assert failed\n", id));
672 1.1 agc continue;
673 1.1 agc }
674 1.1 agc
675 1.1 agc depth = reach[id].depth;
676 1.1 agc dest_id = trans->state_id;
677 1.1 agc
678 1.1 agc cost = reach[id].costs[depth][TRE_M_COST];
679 1.1 agc cost0 = reach[id].costs[0][TRE_M_COST];
680 1.1 agc err = 0;
681 1.1 agc
682 1.1 agc if (trans->code_min > (tre_cint_t)prev_c
683 1.1 agc || trans->code_max < (tre_cint_t)prev_c)
684 1.1 agc {
685 1.1 agc /* Handle substitutes. The required character was not in
686 1.1 agc the string, so match it in place of whatever was supposed
687 1.1 agc to be there and increase costs accordingly. */
688 1.1 agc err = 1;
689 1.1 agc
690 1.1 agc /* Compute and check cost at current depth. */
691 1.1 agc cost = reach[id].costs[depth][TRE_M_COST];
692 1.1 agc if (reach[id].params.cost_subst != TRE_PARAM_UNSET)
693 1.1 agc cost += reach[id].params.cost_subst;
694 1.1 agc if (cost > reach[id].params.max_cost)
695 1.1 agc continue; /* Cost too large. */
696 1.1 agc
697 1.1 agc /* Check number of substitutes at current depth. */
698 1.1 agc if (reach[id].costs[depth][TRE_M_NUM_SUBST] + 1
699 1.1 agc > reach[id].params.max_subst)
700 1.1 agc continue; /* Too many substitutes. */
701 1.1 agc
702 1.1 agc /* Check total number of errors at current depth. */
703 1.1 agc if (reach[id].costs[depth][TRE_M_NUM_ERR] + 1
704 1.1 agc > reach[id].params.max_err)
705 1.1 agc continue; /* Too many errors. */
706 1.1 agc
707 1.1 agc /* Compute overall cost. */
708 1.1 agc cost0 = cost;
709 1.1 agc if (depth > 0)
710 1.1 agc {
711 1.1 agc cost0 = reach[id].costs[0][TRE_M_COST];
712 1.1 agc if (reach[id].params.cost_subst != TRE_PARAM_UNSET)
713 1.1 agc cost0 += reach[id].params.cost_subst;
714 1.1 agc else
715 1.1 agc cost0 += default_params.cost_subst;
716 1.1 agc }
717 1.1 agc DPRINT((" subst, from %03d to %03d, cost %d: ",
718 1.1 agc id, dest_id, cost0));
719 1.1 agc }
720 1.1 agc else
721 1.1 agc DPRINT((" exact, from %03d to %03d, cost %d: ",
722 1.1 agc id, dest_id, cost0));
723 1.1 agc
724 1.1 agc /* Compute tag values after this transition. */
725 1.1 agc for (i = 0; i < num_tags; i++)
726 1.1 agc tmp_tags[i] = reach[id].tags[i];
727 1.1 agc if (trans->tags)
728 1.1 agc for (i = 0; trans->tags[i] >= 0; i++)
729 1.2 christos if ((size_t)trans->tags[i] < num_tags)
730 1.1 agc tmp_tags[trans->tags[i]] = pos;
731 1.1 agc
732 1.1 agc /* If another path has also reached this state, choose the
733 1.1 agc one with the smallest cost or best tags if costs are equal. */
734 1.1 agc if (reach_next[dest_id].pos == pos
735 1.1 agc && (cost0 > reach_next[dest_id].costs[0][TRE_M_COST]
736 1.1 agc || (cost0 == reach_next[dest_id].costs[0][TRE_M_COST]
737 1.1 agc && !tre_tag_order(num_tags, tnfa->tag_directions,
738 1.1 agc tmp_tags,
739 1.1 agc reach_next[dest_id].tags))))
740 1.1 agc {
741 1.1 agc DPRINT(("lose\n"));
742 1.1 agc continue;
743 1.1 agc }
744 1.1 agc DPRINT(("win %d %d\n",
745 1.1 agc reach_next[dest_id].pos,
746 1.1 agc reach_next[dest_id].costs[0][TRE_M_COST]));
747 1.1 agc
748 1.1 agc /* Set state, position, tags, and depth. */
749 1.1 agc reach_next[dest_id].state = trans->state;
750 1.1 agc reach_next[dest_id].pos = pos;
751 1.1 agc for (i = 0; i < num_tags; i++)
752 1.1 agc reach_next[dest_id].tags[i] = tmp_tags[i];
753 1.1 agc reach_next[dest_id].depth = reach[id].depth;
754 1.1 agc
755 1.1 agc /* Set parameters. */
756 1.1 agc reach_next[dest_id].params = reach[id].params;
757 1.1 agc if (trans->params)
758 1.1 agc tre_set_params(&reach_next[dest_id], trans->params,
759 1.1 agc default_params);
760 1.1 agc
761 1.1 agc /* Set the costs after this transition. */
762 1.7 joerg memcpy(&reach_next[dest_id].costs,
763 1.7 joerg reach[id].costs,
764 1.7 joerg sizeof(reach[id].costs[0][0])
765 1.7 joerg * TRE_M_LAST * (depth + 1));
766 1.1 agc reach_next[dest_id].costs[depth][TRE_M_COST] = cost;
767 1.1 agc reach_next[dest_id].costs[depth][TRE_M_NUM_SUBST] += err;
768 1.1 agc reach_next[dest_id].costs[depth][TRE_M_NUM_ERR] += err;
769 1.1 agc if (depth > 0)
770 1.1 agc {
771 1.1 agc reach_next[dest_id].costs[0][TRE_M_COST] = cost0;
772 1.1 agc reach_next[dest_id].costs[0][TRE_M_NUM_SUBST] += err;
773 1.1 agc reach_next[dest_id].costs[0][TRE_M_NUM_ERR] += err;
774 1.1 agc }
775 1.1 agc
776 1.1 agc if (trans->state == tnfa->final
777 1.1 agc && (match_eo < 0
778 1.1 agc || cost0 < match_costs[TRE_M_COST]
779 1.1 agc || (cost0 == match_costs[TRE_M_COST]
780 1.1 agc && num_tags > 0 && tmp_tags[0] <= match_tags[0])))
781 1.1 agc {
782 1.1 agc DPRINT((" setting new match at %d, cost %d\n",
783 1.1 agc pos, cost0));
784 1.1 agc match_eo = pos;
785 1.1 agc for (i = 0; i < TRE_M_LAST; i++)
786 1.1 agc match_costs[i] = reach_next[dest_id].costs[0][i];
787 1.1 agc for (i = 0; i < num_tags; i++)
788 1.1 agc match_tags[i] = tmp_tags[i];
789 1.1 agc }
790 1.1 agc }
791 1.1 agc }
792 1.1 agc }
793 1.1 agc
794 1.1 agc DPRINT(("match end offset = %d, match cost = %d\n", match_eo,
795 1.1 agc match_costs[TRE_M_COST]));
796 1.1 agc
797 1.1 agc match->cost = match_costs[TRE_M_COST];
798 1.1 agc match->num_ins = match_costs[TRE_M_NUM_INS];
799 1.1 agc match->num_del = match_costs[TRE_M_NUM_DEL];
800 1.1 agc match->num_subst = match_costs[TRE_M_NUM_SUBST];
801 1.1 agc *match_end_ofs = match_eo;
802 1.1 agc
803 1.4 rin ret = match_eo >= 0 ? REG_OK : REG_NOMATCH;
804 1.4 rin error_exit:
805 1.4 rin #ifndef TRE_USE_ALLOCA
806 1.4 rin if (buf)
807 1.4 rin xfree(buf);
808 1.4 rin #endif /* !TRE_USE_ALLOCA */
809 1.4 rin return ret;
810 1.1 agc }
811