tre-match-approx.c revision 1.4 1 1.1 agc /*
2 1.1 agc tre-match-approx.c - TRE approximate regex matching engine
3 1.1 agc
4 1.1 agc This software is released under a BSD-style license.
5 1.1 agc See the file LICENSE for details and copyright.
6 1.1 agc
7 1.1 agc */
8 1.1 agc #ifdef HAVE_CONFIG_H
9 1.1 agc #include <config.h>
10 1.1 agc #endif /* HAVE_CONFIG_H */
11 1.1 agc
12 1.2 christos #include "tre-alloca.h"
13 1.1 agc
14 1.1 agc #define __USE_STRING_INLINES
15 1.1 agc #undef __NO_INLINE__
16 1.1 agc
17 1.1 agc #include <assert.h>
18 1.1 agc #include <stdlib.h>
19 1.1 agc #include <string.h>
20 1.1 agc #include <limits.h>
21 1.1 agc #ifdef HAVE_WCHAR_H
22 1.1 agc #include <wchar.h>
23 1.1 agc #endif /* HAVE_WCHAR_H */
24 1.1 agc #ifdef HAVE_WCTYPE_H
25 1.1 agc #include <wctype.h>
26 1.1 agc #endif /* HAVE_WCTYPE_H */
27 1.1 agc #ifndef TRE_WCHAR
28 1.1 agc #include <ctype.h>
29 1.1 agc #endif /* !TRE_WCHAR */
30 1.1 agc #ifdef HAVE_MALLOC_H
31 1.1 agc #include <malloc.h>
32 1.1 agc #endif /* HAVE_MALLOC_H */
33 1.1 agc
34 1.1 agc #include "tre-internal.h"
35 1.1 agc #include "tre-match-utils.h"
36 1.1 agc #include "tre.h"
37 1.1 agc #include "xmalloc.h"
38 1.1 agc
39 1.1 agc #define TRE_M_COST 0
40 1.1 agc #define TRE_M_NUM_INS 1
41 1.1 agc #define TRE_M_NUM_DEL 2
42 1.1 agc #define TRE_M_NUM_SUBST 3
43 1.1 agc #define TRE_M_NUM_ERR 4
44 1.1 agc #define TRE_M_LAST 5
45 1.1 agc
46 1.1 agc #define TRE_M_MAX_DEPTH 3
47 1.1 agc
48 1.1 agc typedef struct {
49 1.1 agc /* State in the TNFA transition table. */
50 1.1 agc tre_tnfa_transition_t *state;
51 1.1 agc /* Position in input string. */
52 1.1 agc int pos;
53 1.1 agc /* Tag values. */
54 1.1 agc int *tags;
55 1.1 agc /* Matching parameters. */
56 1.1 agc regaparams_t params;
57 1.1 agc /* Nesting depth of parameters. This is used as an index in
58 1.1 agc the `costs' array. */
59 1.1 agc int depth;
60 1.1 agc /* Costs and counter values for different parameter nesting depths. */
61 1.1 agc int costs[TRE_M_MAX_DEPTH + 1][TRE_M_LAST];
62 1.1 agc } tre_tnfa_approx_reach_t;
63 1.1 agc
64 1.1 agc
65 1.1 agc #ifdef TRE_DEBUG
66 1.1 agc /* Prints the `reach' array in a readable fashion with DPRINT. */
67 1.1 agc static void
68 1.1 agc tre_print_reach(const tre_tnfa_t *tnfa, tre_tnfa_approx_reach_t *reach,
69 1.2 christos int pos, size_t num_tags)
70 1.1 agc {
71 1.1 agc int id;
72 1.1 agc
73 1.1 agc /* Print each state on one line. */
74 1.1 agc DPRINT((" reach:\n"));
75 1.1 agc for (id = 0; id < tnfa->num_states; id++)
76 1.1 agc {
77 1.1 agc int i, j;
78 1.1 agc if (reach[id].pos < pos)
79 1.1 agc continue; /* Not reached. */
80 1.1 agc DPRINT((" %03d, costs ", id));
81 1.1 agc for (i = 0; i <= reach[id].depth; i++)
82 1.1 agc {
83 1.1 agc DPRINT(("["));
84 1.1 agc for (j = 0; j < TRE_M_LAST; j++)
85 1.1 agc {
86 1.1 agc DPRINT(("%2d", reach[id].costs[i][j]));
87 1.1 agc if (j + 1 < TRE_M_LAST)
88 1.1 agc DPRINT((","));
89 1.1 agc }
90 1.1 agc DPRINT(("]"));
91 1.1 agc if (i + 1 <= reach[id].depth)
92 1.1 agc DPRINT((", "));
93 1.1 agc }
94 1.1 agc DPRINT(("\n tags "));
95 1.1 agc for (i = 0; i < num_tags; i++)
96 1.1 agc {
97 1.1 agc DPRINT(("%02d", reach[id].tags[i]));
98 1.1 agc if (i + 1 < num_tags)
99 1.1 agc DPRINT((","));
100 1.1 agc }
101 1.1 agc DPRINT(("\n"));
102 1.1 agc }
103 1.1 agc DPRINT(("\n"));
104 1.1 agc }
105 1.1 agc #endif /* TRE_DEBUG */
106 1.1 agc
107 1.1 agc
108 1.1 agc /* Sets the matching parameters in `reach' to the ones defined in the `pa'
109 1.1 agc array. If `pa' specifies default values, they are taken from
110 1.1 agc `default_params'. */
111 1.1 agc inline static void
112 1.1 agc tre_set_params(tre_tnfa_approx_reach_t *reach,
113 1.1 agc int *pa, regaparams_t default_params)
114 1.1 agc {
115 1.1 agc int value;
116 1.1 agc
117 1.1 agc /* If depth is increased reset costs and counters to zero for the
118 1.1 agc new levels. */
119 1.1 agc value = pa[TRE_PARAM_DEPTH];
120 1.1 agc assert(value <= TRE_M_MAX_DEPTH);
121 1.1 agc if (value > reach->depth)
122 1.1 agc {
123 1.1 agc int i, j;
124 1.1 agc for (i = reach->depth + 1; i <= value; i++)
125 1.1 agc for (j = 0; j < TRE_M_LAST; j++)
126 1.1 agc reach->costs[i][j] = 0;
127 1.1 agc }
128 1.1 agc reach->depth = value;
129 1.1 agc
130 1.1 agc /* Set insert cost. */
131 1.1 agc value = pa[TRE_PARAM_COST_INS];
132 1.1 agc if (value == TRE_PARAM_DEFAULT)
133 1.1 agc reach->params.cost_ins = default_params.cost_ins;
134 1.1 agc else if (value != TRE_PARAM_UNSET)
135 1.1 agc reach->params.cost_ins = value;
136 1.1 agc
137 1.1 agc /* Set delete cost. */
138 1.1 agc value = pa[TRE_PARAM_COST_DEL];
139 1.1 agc if (value == TRE_PARAM_DEFAULT)
140 1.1 agc reach->params.cost_del = default_params.cost_del;
141 1.1 agc else if (value != TRE_PARAM_UNSET)
142 1.1 agc reach->params.cost_del = value;
143 1.1 agc
144 1.1 agc /* Set substitute cost. */
145 1.1 agc value = pa[TRE_PARAM_COST_SUBST];
146 1.1 agc if (value == TRE_PARAM_DEFAULT)
147 1.1 agc reach->params.cost_subst = default_params.cost_subst;
148 1.1 agc else
149 1.1 agc reach->params.cost_subst = value;
150 1.1 agc
151 1.1 agc /* Set maximum cost. */
152 1.1 agc value = pa[TRE_PARAM_COST_MAX];
153 1.1 agc if (value == TRE_PARAM_DEFAULT)
154 1.1 agc reach->params.max_cost = default_params.max_cost;
155 1.1 agc else if (value != TRE_PARAM_UNSET)
156 1.1 agc reach->params.max_cost = value;
157 1.1 agc
158 1.1 agc /* Set maximum inserts. */
159 1.1 agc value = pa[TRE_PARAM_MAX_INS];
160 1.1 agc if (value == TRE_PARAM_DEFAULT)
161 1.1 agc reach->params.max_ins = default_params.max_ins;
162 1.1 agc else if (value != TRE_PARAM_UNSET)
163 1.1 agc reach->params.max_ins = value;
164 1.1 agc
165 1.1 agc /* Set maximum deletes. */
166 1.1 agc value = pa[TRE_PARAM_MAX_DEL];
167 1.1 agc if (value == TRE_PARAM_DEFAULT)
168 1.1 agc reach->params.max_del = default_params.max_del;
169 1.1 agc else if (value != TRE_PARAM_UNSET)
170 1.1 agc reach->params.max_del = value;
171 1.1 agc
172 1.1 agc /* Set maximum substitutes. */
173 1.1 agc value = pa[TRE_PARAM_MAX_SUBST];
174 1.1 agc if (value == TRE_PARAM_DEFAULT)
175 1.1 agc reach->params.max_subst = default_params.max_subst;
176 1.1 agc else if (value != TRE_PARAM_UNSET)
177 1.1 agc reach->params.max_subst = value;
178 1.1 agc
179 1.1 agc /* Set maximum number of errors. */
180 1.1 agc value = pa[TRE_PARAM_MAX_ERR];
181 1.1 agc if (value == TRE_PARAM_DEFAULT)
182 1.1 agc reach->params.max_err = default_params.max_err;
183 1.1 agc else if (value != TRE_PARAM_UNSET)
184 1.1 agc reach->params.max_err = value;
185 1.1 agc }
186 1.1 agc
187 1.1 agc reg_errcode_t
188 1.1 agc tre_tnfa_run_approx(const tre_tnfa_t *tnfa, const void *string, int len,
189 1.1 agc tre_str_type_t type, int *match_tags,
190 1.1 agc regamatch_t *match, regaparams_t default_params,
191 1.1 agc int eflags, int *match_end_ofs)
192 1.1 agc {
193 1.1 agc /* State variables required by GET_NEXT_WCHAR. */
194 1.1 agc tre_char_t prev_c = 0, next_c = 0;
195 1.1 agc const char *str_byte = string;
196 1.1 agc int pos = -1;
197 1.1 agc unsigned int pos_add_next = 1;
198 1.1 agc #ifdef TRE_WCHAR
199 1.1 agc const wchar_t *str_wide = string;
200 1.1 agc #ifdef TRE_MBSTATE
201 1.1 agc mbstate_t mbstate;
202 1.1 agc #endif /* !TRE_WCHAR */
203 1.1 agc #endif /* TRE_WCHAR */
204 1.1 agc int reg_notbol = eflags & REG_NOTBOL;
205 1.1 agc int reg_noteol = eflags & REG_NOTEOL;
206 1.1 agc int reg_newline = tnfa->cflags & REG_NEWLINE;
207 1.2 christos size_t str_user_end = 0;
208 1.1 agc
209 1.1 agc int prev_pos;
210 1.1 agc
211 1.1 agc /* Number of tags. */
212 1.2 christos size_t num_tags;
213 1.1 agc /* The reach tables. */
214 1.1 agc tre_tnfa_approx_reach_t *reach, *reach_next;
215 1.1 agc /* Tag array for temporary use. */
216 1.1 agc int *tmp_tags;
217 1.1 agc
218 1.1 agc /* End offset of best match so far, or -1 if no match found yet. */
219 1.1 agc int match_eo = -1;
220 1.1 agc /* Costs of the match. */
221 1.1 agc int match_costs[TRE_M_LAST];
222 1.1 agc
223 1.1 agc /* Space for temporary data required for matching. */
224 1.1 agc unsigned char *buf;
225 1.1 agc
226 1.2 christos size_t i, id;
227 1.1 agc
228 1.4 rin reg_errcode_t ret;
229 1.4 rin
230 1.1 agc if (!match_tags)
231 1.1 agc num_tags = 0;
232 1.1 agc else
233 1.1 agc num_tags = tnfa->num_tags;
234 1.1 agc
235 1.1 agc #ifdef TRE_MBSTATE
236 1.1 agc memset(&mbstate, '\0', sizeof(mbstate));
237 1.1 agc #endif /* TRE_MBSTATE */
238 1.1 agc
239 1.1 agc DPRINT(("tre_tnfa_run_approx, input type %d, len %d, eflags %d, "
240 1.1 agc "match_tags %p\n",
241 1.1 agc type, len, eflags,
242 1.1 agc match_tags));
243 1.1 agc DPRINT(("max cost %d, ins %d, del %d, subst %d\n",
244 1.1 agc default_params.max_cost,
245 1.1 agc default_params.cost_ins,
246 1.1 agc default_params.cost_del,
247 1.1 agc default_params.cost_subst));
248 1.1 agc
249 1.1 agc /* Allocate memory for temporary data required for matching. This needs to
250 1.1 agc be done for every matching operation to be thread safe. This allocates
251 1.1 agc everything in a single large block from the stack frame using alloca()
252 1.1 agc or with malloc() if alloca is unavailable. */
253 1.1 agc {
254 1.1 agc unsigned char *buf_cursor;
255 1.1 agc /* Space needed for one array of tags. */
256 1.2 christos size_t tag_bytes = sizeof(*tmp_tags) * num_tags;
257 1.1 agc /* Space needed for one reach table. */
258 1.2 christos size_t reach_bytes = sizeof(*reach_next) * tnfa->num_states;
259 1.1 agc /* Total space needed. */
260 1.2 christos size_t total_bytes = reach_bytes * 2 + (tnfa->num_states * 2 + 1 ) * tag_bytes;
261 1.1 agc /* Add some extra to make sure we can align the pointers. The multiplier
262 1.1 agc used here must be equal to the number of ALIGN calls below. */
263 1.1 agc total_bytes += (sizeof(long) - 1) * 3;
264 1.1 agc
265 1.1 agc /* Allocate the memory. */
266 1.1 agc #ifdef TRE_USE_ALLOCA
267 1.1 agc buf = alloca(total_bytes);
268 1.1 agc #else /* !TRE_USE_ALLOCA */
269 1.1 agc buf = xmalloc((unsigned)total_bytes);
270 1.1 agc #endif /* !TRE_USE_ALLOCA */
271 1.1 agc if (!buf)
272 1.1 agc return REG_ESPACE;
273 1.1 agc memset(buf, 0, (size_t)total_bytes);
274 1.1 agc
275 1.1 agc /* Allocate `tmp_tags' from `buf'. */
276 1.1 agc tmp_tags = (void *)buf;
277 1.1 agc buf_cursor = buf + tag_bytes;
278 1.1 agc buf_cursor += ALIGN(buf_cursor, long);
279 1.1 agc
280 1.1 agc /* Allocate `reach' from `buf'. */
281 1.1 agc reach = (void *)buf_cursor;
282 1.1 agc buf_cursor += reach_bytes;
283 1.1 agc buf_cursor += ALIGN(buf_cursor, long);
284 1.1 agc
285 1.1 agc /* Allocate `reach_next' from `buf'. */
286 1.1 agc reach_next = (void *)buf_cursor;
287 1.1 agc buf_cursor += reach_bytes;
288 1.1 agc buf_cursor += ALIGN(buf_cursor, long);
289 1.1 agc
290 1.1 agc /* Allocate tag arrays for `reach' and `reach_next' from `buf'. */
291 1.1 agc for (i = 0; i < tnfa->num_states; i++)
292 1.1 agc {
293 1.1 agc reach[i].tags = (void *)buf_cursor;
294 1.1 agc buf_cursor += tag_bytes;
295 1.1 agc reach_next[i].tags = (void *)buf_cursor;
296 1.1 agc buf_cursor += tag_bytes;
297 1.1 agc }
298 1.1 agc assert(buf_cursor <= buf + total_bytes);
299 1.1 agc }
300 1.1 agc
301 1.1 agc for (i = 0; i < TRE_M_LAST; i++)
302 1.1 agc match_costs[i] = INT_MAX;
303 1.1 agc
304 1.1 agc /* Mark the reach arrays empty. */
305 1.1 agc for (i = 0; i < tnfa->num_states; i++)
306 1.1 agc reach[i].pos = reach_next[i].pos = -2;
307 1.1 agc
308 1.1 agc prev_pos = pos;
309 1.1 agc GET_NEXT_WCHAR();
310 1.1 agc pos = 0;
311 1.1 agc
312 1.3 rin while (/*CONSTCOND*/(void)1,1)
313 1.1 agc {
314 1.1 agc DPRINT(("%03d:%2lc/%05d\n", pos, (tre_cint_t)next_c, (int)next_c));
315 1.1 agc
316 1.1 agc /* Add initial states to `reach_next' if an exact match has not yet
317 1.1 agc been found. */
318 1.1 agc if (match_costs[TRE_M_COST] > 0)
319 1.1 agc {
320 1.1 agc tre_tnfa_transition_t *trans;
321 1.1 agc DPRINT((" init"));
322 1.1 agc for (trans = tnfa->initial; trans->state; trans++)
323 1.1 agc {
324 1.1 agc int stateid = trans->state_id;
325 1.1 agc
326 1.1 agc /* If this state is not currently in `reach_next', add it
327 1.1 agc there. */
328 1.1 agc if (reach_next[stateid].pos < pos)
329 1.1 agc {
330 1.1 agc if (trans->assertions && CHECK_ASSERTIONS(trans->assertions))
331 1.1 agc {
332 1.1 agc /* Assertions failed, don't add this state. */
333 1.1 agc DPRINT((" !%d (assert)", stateid));
334 1.1 agc continue;
335 1.1 agc }
336 1.1 agc DPRINT((" %d", stateid));
337 1.1 agc reach_next[stateid].state = trans->state;
338 1.1 agc reach_next[stateid].pos = pos;
339 1.1 agc
340 1.1 agc /* Compute tag values after this transition. */
341 1.1 agc for (i = 0; i < num_tags; i++)
342 1.1 agc reach_next[stateid].tags[i] = -1;
343 1.1 agc
344 1.1 agc if (trans->tags)
345 1.1 agc for (i = 0; trans->tags[i] >= 0; i++)
346 1.2 christos if ((size_t)trans->tags[i] < num_tags)
347 1.1 agc reach_next[stateid].tags[trans->tags[i]] = pos;
348 1.1 agc
349 1.1 agc /* Set the parameters, depth, and costs. */
350 1.1 agc reach_next[stateid].params = default_params;
351 1.1 agc reach_next[stateid].depth = 0;
352 1.1 agc for (i = 0; i < TRE_M_LAST; i++)
353 1.1 agc reach_next[stateid].costs[0][i] = 0;
354 1.1 agc if (trans->params)
355 1.1 agc tre_set_params(&reach_next[stateid], trans->params,
356 1.1 agc default_params);
357 1.1 agc
358 1.1 agc /* If this is the final state, mark the exact match. */
359 1.1 agc if (trans->state == tnfa->final)
360 1.1 agc {
361 1.1 agc match_eo = pos;
362 1.1 agc for (i = 0; i < num_tags; i++)
363 1.1 agc match_tags[i] = reach_next[stateid].tags[i];
364 1.1 agc for (i = 0; i < TRE_M_LAST; i++)
365 1.1 agc match_costs[i] = 0;
366 1.1 agc }
367 1.1 agc }
368 1.1 agc }
369 1.1 agc DPRINT(("\n"));
370 1.1 agc }
371 1.1 agc
372 1.1 agc
373 1.1 agc /* Handle inserts. This is done by pretending there's an epsilon
374 1.1 agc transition from each state in `reach' back to the same state.
375 1.1 agc We don't need to worry about the final state here; this will never
376 1.1 agc give a better match than what we already have. */
377 1.1 agc for (id = 0; id < tnfa->num_states; id++)
378 1.1 agc {
379 1.1 agc int depth;
380 1.1 agc int cost, cost0;
381 1.1 agc
382 1.1 agc if (reach[id].pos != prev_pos)
383 1.1 agc {
384 1.1 agc DPRINT((" insert: %d not reached\n", id));
385 1.1 agc continue; /* Not reached. */
386 1.1 agc }
387 1.1 agc
388 1.1 agc depth = reach[id].depth;
389 1.1 agc
390 1.1 agc /* Compute and check cost at current depth. */
391 1.1 agc cost = reach[id].costs[depth][TRE_M_COST];
392 1.1 agc if (reach[id].params.cost_ins != TRE_PARAM_UNSET)
393 1.1 agc cost += reach[id].params.cost_ins;
394 1.1 agc if (cost > reach[id].params.max_cost)
395 1.1 agc continue; /* Cost too large. */
396 1.1 agc
397 1.1 agc /* Check number of inserts at current depth. */
398 1.1 agc if (reach[id].costs[depth][TRE_M_NUM_INS] + 1
399 1.1 agc > reach[id].params.max_ins)
400 1.1 agc continue; /* Too many inserts. */
401 1.1 agc
402 1.1 agc /* Check total number of errors at current depth. */
403 1.1 agc if (reach[id].costs[depth][TRE_M_NUM_ERR] + 1
404 1.1 agc > reach[id].params.max_err)
405 1.1 agc continue; /* Too many errors. */
406 1.1 agc
407 1.1 agc /* Compute overall cost. */
408 1.1 agc cost0 = cost;
409 1.1 agc if (depth > 0)
410 1.1 agc {
411 1.1 agc cost0 = reach[id].costs[0][TRE_M_COST];
412 1.1 agc if (reach[id].params.cost_ins != TRE_PARAM_UNSET)
413 1.1 agc cost0 += reach[id].params.cost_ins;
414 1.1 agc else
415 1.1 agc cost0 += default_params.cost_ins;
416 1.1 agc }
417 1.1 agc
418 1.1 agc DPRINT((" insert: from %d to %d, cost %d: ", id, id,
419 1.1 agc reach[id].costs[depth][TRE_M_COST]));
420 1.1 agc if (reach_next[id].pos == pos
421 1.1 agc && (cost0 >= reach_next[id].costs[0][TRE_M_COST]))
422 1.1 agc {
423 1.1 agc DPRINT(("lose\n"));
424 1.1 agc continue;
425 1.1 agc }
426 1.1 agc DPRINT(("win\n"));
427 1.1 agc
428 1.1 agc /* Copy state, position, tags, parameters, and depth. */
429 1.1 agc reach_next[id].state = reach[id].state;
430 1.1 agc reach_next[id].pos = pos;
431 1.1 agc for (i = 0; i < num_tags; i++)
432 1.1 agc reach_next[id].tags[i] = reach[id].tags[i];
433 1.1 agc reach_next[id].params = reach[id].params;
434 1.1 agc reach_next[id].depth = reach[id].depth;
435 1.1 agc
436 1.1 agc /* Set the costs after this transition. */
437 1.1 agc memcpy(reach_next[id].costs, reach[id].costs,
438 1.1 agc sizeof(reach_next[id].costs[0][0])
439 1.1 agc * TRE_M_LAST * (depth + 1));
440 1.1 agc reach_next[id].costs[depth][TRE_M_COST] = cost;
441 1.1 agc reach_next[id].costs[depth][TRE_M_NUM_INS]++;
442 1.1 agc reach_next[id].costs[depth][TRE_M_NUM_ERR]++;
443 1.1 agc if (depth > 0)
444 1.1 agc {
445 1.1 agc reach_next[id].costs[0][TRE_M_COST] = cost0;
446 1.1 agc reach_next[id].costs[0][TRE_M_NUM_INS]++;
447 1.1 agc reach_next[id].costs[0][TRE_M_NUM_ERR]++;
448 1.1 agc }
449 1.1 agc
450 1.1 agc }
451 1.1 agc
452 1.1 agc
453 1.1 agc /* Handle deletes. This is done by traversing through the whole TNFA
454 1.1 agc pretending that all transitions are epsilon transitions, until
455 1.1 agc no more states can be reached with better costs. */
456 1.1 agc {
457 1.1 agc /* XXX - dynamic ringbuffer size */
458 1.1 agc tre_tnfa_approx_reach_t *ringbuffer[512];
459 1.1 agc tre_tnfa_approx_reach_t **deque_start, **deque_end;
460 1.1 agc
461 1.1 agc deque_start = deque_end = ringbuffer;
462 1.1 agc
463 1.1 agc /* Add all states in `reach_next' to the deque. */
464 1.1 agc for (id = 0; id < tnfa->num_states; id++)
465 1.1 agc {
466 1.1 agc if (reach_next[id].pos != pos)
467 1.1 agc continue;
468 1.1 agc *deque_end = &reach_next[id];
469 1.1 agc deque_end++;
470 1.1 agc assert(deque_end != deque_start);
471 1.1 agc }
472 1.1 agc
473 1.1 agc /* Repeat until the deque is empty. */
474 1.1 agc while (deque_end != deque_start)
475 1.1 agc {
476 1.1 agc tre_tnfa_approx_reach_t *reach_p;
477 1.1 agc int depth;
478 1.1 agc int cost, cost0;
479 1.1 agc tre_tnfa_transition_t *trans;
480 1.1 agc
481 1.1 agc /* Pop the first item off the deque. */
482 1.1 agc reach_p = *deque_start;
483 1.1 agc id = reach_p - reach_next;
484 1.1 agc depth = reach_p->depth;
485 1.1 agc
486 1.1 agc /* Compute cost at current depth. */
487 1.1 agc cost = reach_p->costs[depth][TRE_M_COST];
488 1.1 agc if (reach_p->params.cost_del != TRE_PARAM_UNSET)
489 1.1 agc cost += reach_p->params.cost_del;
490 1.1 agc
491 1.1 agc /* Check cost, number of deletes, and total number of errors
492 1.1 agc at current depth. */
493 1.1 agc if (cost > reach_p->params.max_cost
494 1.1 agc || (reach_p->costs[depth][TRE_M_NUM_DEL] + 1
495 1.1 agc > reach_p->params.max_del)
496 1.1 agc || (reach_p->costs[depth][TRE_M_NUM_ERR] + 1
497 1.1 agc > reach_p->params.max_err))
498 1.1 agc {
499 1.1 agc /* Too many errors or cost too large. */
500 1.1 agc DPRINT((" delete: from %03d: cost too large\n", id));
501 1.1 agc deque_start++;
502 1.1 agc if (deque_start >= (ringbuffer + 512))
503 1.1 agc deque_start = ringbuffer;
504 1.1 agc continue;
505 1.1 agc }
506 1.1 agc
507 1.1 agc /* Compute overall cost. */
508 1.1 agc cost0 = cost;
509 1.1 agc if (depth > 0)
510 1.1 agc {
511 1.1 agc cost0 = reach_p->costs[0][TRE_M_COST];
512 1.1 agc if (reach_p->params.cost_del != TRE_PARAM_UNSET)
513 1.1 agc cost0 += reach_p->params.cost_del;
514 1.1 agc else
515 1.1 agc cost0 += default_params.cost_del;
516 1.1 agc }
517 1.1 agc
518 1.1 agc for (trans = reach_p->state; trans->state; trans++)
519 1.1 agc {
520 1.1 agc int dest_id = trans->state_id;
521 1.1 agc DPRINT((" delete: from %03d to %03d, cost %d (%d): ",
522 1.1 agc id, dest_id, cost0, reach_p->params.max_cost));
523 1.1 agc
524 1.1 agc if (trans->assertions && CHECK_ASSERTIONS(trans->assertions))
525 1.1 agc {
526 1.1 agc DPRINT(("assertion failed\n"));
527 1.1 agc continue;
528 1.1 agc }
529 1.1 agc
530 1.1 agc /* Compute tag values after this transition. */
531 1.1 agc for (i = 0; i < num_tags; i++)
532 1.1 agc tmp_tags[i] = reach_p->tags[i];
533 1.1 agc if (trans->tags)
534 1.1 agc for (i = 0; trans->tags[i] >= 0; i++)
535 1.2 christos if ((size_t)trans->tags[i] < num_tags)
536 1.1 agc tmp_tags[trans->tags[i]] = pos;
537 1.1 agc
538 1.1 agc /* If another path has also reached this state, choose the one
539 1.1 agc with the smallest cost or best tags if costs are equal. */
540 1.1 agc if (reach_next[dest_id].pos == pos
541 1.1 agc && (cost0 > reach_next[dest_id].costs[0][TRE_M_COST]
542 1.1 agc || (cost0 == reach_next[dest_id].costs[0][TRE_M_COST]
543 1.1 agc && (!match_tags
544 1.1 agc || !tre_tag_order(num_tags,
545 1.1 agc tnfa->tag_directions,
546 1.1 agc tmp_tags,
547 1.1 agc reach_next[dest_id].tags)))))
548 1.1 agc {
549 1.1 agc DPRINT(("lose, cost0 %d, have %d\n",
550 1.1 agc cost0, reach_next[dest_id].costs[0][TRE_M_COST]));
551 1.1 agc continue;
552 1.1 agc }
553 1.1 agc DPRINT(("win\n"));
554 1.1 agc
555 1.1 agc /* Set state, position, tags, parameters, depth, and costs. */
556 1.1 agc reach_next[dest_id].state = trans->state;
557 1.1 agc reach_next[dest_id].pos = pos;
558 1.1 agc for (i = 0; i < num_tags; i++)
559 1.1 agc reach_next[dest_id].tags[i] = tmp_tags[i];
560 1.1 agc
561 1.1 agc reach_next[dest_id].params = reach_p->params;
562 1.1 agc if (trans->params)
563 1.1 agc tre_set_params(&reach_next[dest_id], trans->params,
564 1.1 agc default_params);
565 1.1 agc
566 1.1 agc reach_next[dest_id].depth = reach_p->depth;
567 1.1 agc memcpy(&reach_next[dest_id].costs,
568 1.1 agc reach_p->costs,
569 1.1 agc sizeof(reach_p->costs[0][0])
570 1.1 agc * TRE_M_LAST * (depth + 1));
571 1.1 agc reach_next[dest_id].costs[depth][TRE_M_COST] = cost;
572 1.1 agc reach_next[dest_id].costs[depth][TRE_M_NUM_DEL]++;
573 1.1 agc reach_next[dest_id].costs[depth][TRE_M_NUM_ERR]++;
574 1.1 agc if (depth > 0)
575 1.1 agc {
576 1.1 agc reach_next[dest_id].costs[0][TRE_M_COST] = cost0;
577 1.1 agc reach_next[dest_id].costs[0][TRE_M_NUM_DEL]++;
578 1.1 agc reach_next[dest_id].costs[0][TRE_M_NUM_ERR]++;
579 1.1 agc }
580 1.1 agc
581 1.1 agc if (trans->state == tnfa->final
582 1.1 agc && (match_eo < 0
583 1.1 agc || match_costs[TRE_M_COST] > cost0
584 1.1 agc || (match_costs[TRE_M_COST] == cost0
585 1.1 agc && (num_tags > 0
586 1.1 agc && tmp_tags[0] <= match_tags[0]))))
587 1.1 agc {
588 1.1 agc DPRINT((" setting new match at %d, cost %d\n",
589 1.1 agc pos, cost0));
590 1.1 agc match_eo = pos;
591 1.1 agc memcpy(match_costs, reach_next[dest_id].costs[0],
592 1.1 agc sizeof(match_costs[0]) * TRE_M_LAST);
593 1.1 agc for (i = 0; i < num_tags; i++)
594 1.1 agc match_tags[i] = tmp_tags[i];
595 1.1 agc }
596 1.1 agc
597 1.1 agc /* Add to the end of the deque. */
598 1.1 agc *deque_end = &reach_next[dest_id];
599 1.1 agc deque_end++;
600 1.1 agc if (deque_end >= (ringbuffer + 512))
601 1.1 agc deque_end = ringbuffer;
602 1.1 agc assert(deque_end != deque_start);
603 1.1 agc }
604 1.1 agc deque_start++;
605 1.1 agc if (deque_start >= (ringbuffer + 512))
606 1.1 agc deque_start = ringbuffer;
607 1.1 agc }
608 1.1 agc
609 1.1 agc }
610 1.1 agc
611 1.1 agc #ifdef TRE_DEBUG
612 1.1 agc tre_print_reach(tnfa, reach_next, pos, num_tags);
613 1.1 agc #endif /* TRE_DEBUG */
614 1.1 agc
615 1.1 agc /* Check for end of string. */
616 1.1 agc if (len < 0)
617 1.1 agc {
618 1.1 agc if (type == STR_USER)
619 1.1 agc {
620 1.1 agc if (str_user_end)
621 1.1 agc break;
622 1.1 agc }
623 1.1 agc else if (next_c == L'\0')
624 1.1 agc break;
625 1.1 agc }
626 1.1 agc else
627 1.1 agc {
628 1.1 agc if (pos >= len)
629 1.1 agc break;
630 1.1 agc }
631 1.1 agc
632 1.1 agc prev_pos = pos;
633 1.1 agc GET_NEXT_WCHAR();
634 1.1 agc
635 1.1 agc /* Swap `reach' and `reach_next'. */
636 1.1 agc {
637 1.1 agc tre_tnfa_approx_reach_t *tmp;
638 1.1 agc tmp = reach;
639 1.1 agc reach = reach_next;
640 1.1 agc reach_next = tmp;
641 1.1 agc }
642 1.1 agc
643 1.1 agc /* Handle exact matches and substitutions. */
644 1.1 agc for (id = 0; id < tnfa->num_states; id++)
645 1.1 agc {
646 1.1 agc tre_tnfa_transition_t *trans;
647 1.1 agc
648 1.1 agc if (reach[id].pos < prev_pos)
649 1.1 agc continue; /* Not reached. */
650 1.1 agc for (trans = reach[id].state; trans->state; trans++)
651 1.1 agc {
652 1.1 agc int dest_id;
653 1.1 agc int depth;
654 1.1 agc int cost, cost0, err;
655 1.1 agc
656 1.1 agc if (trans->assertions
657 1.1 agc && (CHECK_ASSERTIONS(trans->assertions)
658 1.1 agc || CHECK_CHAR_CLASSES(trans, tnfa, eflags)))
659 1.1 agc {
660 1.1 agc DPRINT((" exact, from %d: assert failed\n", id));
661 1.1 agc continue;
662 1.1 agc }
663 1.1 agc
664 1.1 agc depth = reach[id].depth;
665 1.1 agc dest_id = trans->state_id;
666 1.1 agc
667 1.1 agc cost = reach[id].costs[depth][TRE_M_COST];
668 1.1 agc cost0 = reach[id].costs[0][TRE_M_COST];
669 1.1 agc err = 0;
670 1.1 agc
671 1.1 agc if (trans->code_min > (tre_cint_t)prev_c
672 1.1 agc || trans->code_max < (tre_cint_t)prev_c)
673 1.1 agc {
674 1.1 agc /* Handle substitutes. The required character was not in
675 1.1 agc the string, so match it in place of whatever was supposed
676 1.1 agc to be there and increase costs accordingly. */
677 1.1 agc err = 1;
678 1.1 agc
679 1.1 agc /* Compute and check cost at current depth. */
680 1.1 agc cost = reach[id].costs[depth][TRE_M_COST];
681 1.1 agc if (reach[id].params.cost_subst != TRE_PARAM_UNSET)
682 1.1 agc cost += reach[id].params.cost_subst;
683 1.1 agc if (cost > reach[id].params.max_cost)
684 1.1 agc continue; /* Cost too large. */
685 1.1 agc
686 1.1 agc /* Check number of substitutes at current depth. */
687 1.1 agc if (reach[id].costs[depth][TRE_M_NUM_SUBST] + 1
688 1.1 agc > reach[id].params.max_subst)
689 1.1 agc continue; /* Too many substitutes. */
690 1.1 agc
691 1.1 agc /* Check total number of errors at current depth. */
692 1.1 agc if (reach[id].costs[depth][TRE_M_NUM_ERR] + 1
693 1.1 agc > reach[id].params.max_err)
694 1.1 agc continue; /* Too many errors. */
695 1.1 agc
696 1.1 agc /* Compute overall cost. */
697 1.1 agc cost0 = cost;
698 1.1 agc if (depth > 0)
699 1.1 agc {
700 1.1 agc cost0 = reach[id].costs[0][TRE_M_COST];
701 1.1 agc if (reach[id].params.cost_subst != TRE_PARAM_UNSET)
702 1.1 agc cost0 += reach[id].params.cost_subst;
703 1.1 agc else
704 1.1 agc cost0 += default_params.cost_subst;
705 1.1 agc }
706 1.1 agc DPRINT((" subst, from %03d to %03d, cost %d: ",
707 1.1 agc id, dest_id, cost0));
708 1.1 agc }
709 1.1 agc else
710 1.1 agc DPRINT((" exact, from %03d to %03d, cost %d: ",
711 1.1 agc id, dest_id, cost0));
712 1.1 agc
713 1.1 agc /* Compute tag values after this transition. */
714 1.1 agc for (i = 0; i < num_tags; i++)
715 1.1 agc tmp_tags[i] = reach[id].tags[i];
716 1.1 agc if (trans->tags)
717 1.1 agc for (i = 0; trans->tags[i] >= 0; i++)
718 1.2 christos if ((size_t)trans->tags[i] < num_tags)
719 1.1 agc tmp_tags[trans->tags[i]] = pos;
720 1.1 agc
721 1.1 agc /* If another path has also reached this state, choose the
722 1.1 agc one with the smallest cost or best tags if costs are equal. */
723 1.1 agc if (reach_next[dest_id].pos == pos
724 1.1 agc && (cost0 > reach_next[dest_id].costs[0][TRE_M_COST]
725 1.1 agc || (cost0 == reach_next[dest_id].costs[0][TRE_M_COST]
726 1.1 agc && !tre_tag_order(num_tags, tnfa->tag_directions,
727 1.1 agc tmp_tags,
728 1.1 agc reach_next[dest_id].tags))))
729 1.1 agc {
730 1.1 agc DPRINT(("lose\n"));
731 1.1 agc continue;
732 1.1 agc }
733 1.1 agc DPRINT(("win %d %d\n",
734 1.1 agc reach_next[dest_id].pos,
735 1.1 agc reach_next[dest_id].costs[0][TRE_M_COST]));
736 1.1 agc
737 1.1 agc /* Set state, position, tags, and depth. */
738 1.1 agc reach_next[dest_id].state = trans->state;
739 1.1 agc reach_next[dest_id].pos = pos;
740 1.1 agc for (i = 0; i < num_tags; i++)
741 1.1 agc reach_next[dest_id].tags[i] = tmp_tags[i];
742 1.1 agc reach_next[dest_id].depth = reach[id].depth;
743 1.1 agc
744 1.1 agc /* Set parameters. */
745 1.1 agc reach_next[dest_id].params = reach[id].params;
746 1.1 agc if (trans->params)
747 1.1 agc tre_set_params(&reach_next[dest_id], trans->params,
748 1.1 agc default_params);
749 1.1 agc
750 1.1 agc /* Set the costs after this transition. */
751 1.1 agc memcpy(&reach_next[dest_id].costs,
752 1.1 agc reach[id].costs,
753 1.1 agc sizeof(reach[id].costs[0][0])
754 1.1 agc * TRE_M_LAST * (depth + 1));
755 1.1 agc reach_next[dest_id].costs[depth][TRE_M_COST] = cost;
756 1.1 agc reach_next[dest_id].costs[depth][TRE_M_NUM_SUBST] += err;
757 1.1 agc reach_next[dest_id].costs[depth][TRE_M_NUM_ERR] += err;
758 1.1 agc if (depth > 0)
759 1.1 agc {
760 1.1 agc reach_next[dest_id].costs[0][TRE_M_COST] = cost0;
761 1.1 agc reach_next[dest_id].costs[0][TRE_M_NUM_SUBST] += err;
762 1.1 agc reach_next[dest_id].costs[0][TRE_M_NUM_ERR] += err;
763 1.1 agc }
764 1.1 agc
765 1.1 agc if (trans->state == tnfa->final
766 1.1 agc && (match_eo < 0
767 1.1 agc || cost0 < match_costs[TRE_M_COST]
768 1.1 agc || (cost0 == match_costs[TRE_M_COST]
769 1.1 agc && num_tags > 0 && tmp_tags[0] <= match_tags[0])))
770 1.1 agc {
771 1.1 agc DPRINT((" setting new match at %d, cost %d\n",
772 1.1 agc pos, cost0));
773 1.1 agc match_eo = pos;
774 1.1 agc for (i = 0; i < TRE_M_LAST; i++)
775 1.1 agc match_costs[i] = reach_next[dest_id].costs[0][i];
776 1.1 agc for (i = 0; i < num_tags; i++)
777 1.1 agc match_tags[i] = tmp_tags[i];
778 1.1 agc }
779 1.1 agc }
780 1.1 agc }
781 1.1 agc }
782 1.1 agc
783 1.1 agc DPRINT(("match end offset = %d, match cost = %d\n", match_eo,
784 1.1 agc match_costs[TRE_M_COST]));
785 1.1 agc
786 1.1 agc match->cost = match_costs[TRE_M_COST];
787 1.1 agc match->num_ins = match_costs[TRE_M_NUM_INS];
788 1.1 agc match->num_del = match_costs[TRE_M_NUM_DEL];
789 1.1 agc match->num_subst = match_costs[TRE_M_NUM_SUBST];
790 1.1 agc *match_end_ofs = match_eo;
791 1.1 agc
792 1.4 rin ret = match_eo >= 0 ? REG_OK : REG_NOMATCH;
793 1.4 rin error_exit:
794 1.4 rin #ifndef TRE_USE_ALLOCA
795 1.4 rin if (buf)
796 1.4 rin xfree(buf);
797 1.4 rin #endif /* !TRE_USE_ALLOCA */
798 1.4 rin return ret;
799 1.1 agc }
800