tre-match-approx.c revision 1.3 1 1.1 agc /*
2 1.1 agc tre-match-approx.c - TRE approximate regex matching engine
3 1.1 agc
4 1.1 agc This software is released under a BSD-style license.
5 1.1 agc See the file LICENSE for details and copyright.
6 1.1 agc
7 1.1 agc */
8 1.1 agc #ifdef HAVE_CONFIG_H
9 1.1 agc #include <config.h>
10 1.1 agc #endif /* HAVE_CONFIG_H */
11 1.1 agc
12 1.2 christos #include "tre-alloca.h"
13 1.1 agc
14 1.1 agc #define __USE_STRING_INLINES
15 1.1 agc #undef __NO_INLINE__
16 1.1 agc
17 1.1 agc #include <assert.h>
18 1.1 agc #include <stdlib.h>
19 1.1 agc #include <string.h>
20 1.1 agc #include <limits.h>
21 1.1 agc #ifdef HAVE_WCHAR_H
22 1.1 agc #include <wchar.h>
23 1.1 agc #endif /* HAVE_WCHAR_H */
24 1.1 agc #ifdef HAVE_WCTYPE_H
25 1.1 agc #include <wctype.h>
26 1.1 agc #endif /* HAVE_WCTYPE_H */
27 1.1 agc #ifndef TRE_WCHAR
28 1.1 agc #include <ctype.h>
29 1.1 agc #endif /* !TRE_WCHAR */
30 1.1 agc #ifdef HAVE_MALLOC_H
31 1.1 agc #include <malloc.h>
32 1.1 agc #endif /* HAVE_MALLOC_H */
33 1.1 agc
34 1.1 agc #include "tre-internal.h"
35 1.1 agc #include "tre-match-utils.h"
36 1.1 agc #include "tre.h"
37 1.1 agc #include "xmalloc.h"
38 1.1 agc
39 1.1 agc #define TRE_M_COST 0
40 1.1 agc #define TRE_M_NUM_INS 1
41 1.1 agc #define TRE_M_NUM_DEL 2
42 1.1 agc #define TRE_M_NUM_SUBST 3
43 1.1 agc #define TRE_M_NUM_ERR 4
44 1.1 agc #define TRE_M_LAST 5
45 1.1 agc
46 1.1 agc #define TRE_M_MAX_DEPTH 3
47 1.1 agc
48 1.1 agc typedef struct {
49 1.1 agc /* State in the TNFA transition table. */
50 1.1 agc tre_tnfa_transition_t *state;
51 1.1 agc /* Position in input string. */
52 1.1 agc int pos;
53 1.1 agc /* Tag values. */
54 1.1 agc int *tags;
55 1.1 agc /* Matching parameters. */
56 1.1 agc regaparams_t params;
57 1.1 agc /* Nesting depth of parameters. This is used as an index in
58 1.1 agc the `costs' array. */
59 1.1 agc int depth;
60 1.1 agc /* Costs and counter values for different parameter nesting depths. */
61 1.1 agc int costs[TRE_M_MAX_DEPTH + 1][TRE_M_LAST];
62 1.1 agc } tre_tnfa_approx_reach_t;
63 1.1 agc
64 1.1 agc
65 1.1 agc #ifdef TRE_DEBUG
66 1.1 agc /* Prints the `reach' array in a readable fashion with DPRINT. */
67 1.1 agc static void
68 1.1 agc tre_print_reach(const tre_tnfa_t *tnfa, tre_tnfa_approx_reach_t *reach,
69 1.2 christos int pos, size_t num_tags)
70 1.1 agc {
71 1.1 agc int id;
72 1.1 agc
73 1.1 agc /* Print each state on one line. */
74 1.1 agc DPRINT((" reach:\n"));
75 1.1 agc for (id = 0; id < tnfa->num_states; id++)
76 1.1 agc {
77 1.1 agc int i, j;
78 1.1 agc if (reach[id].pos < pos)
79 1.1 agc continue; /* Not reached. */
80 1.1 agc DPRINT((" %03d, costs ", id));
81 1.1 agc for (i = 0; i <= reach[id].depth; i++)
82 1.1 agc {
83 1.1 agc DPRINT(("["));
84 1.1 agc for (j = 0; j < TRE_M_LAST; j++)
85 1.1 agc {
86 1.1 agc DPRINT(("%2d", reach[id].costs[i][j]));
87 1.1 agc if (j + 1 < TRE_M_LAST)
88 1.1 agc DPRINT((","));
89 1.1 agc }
90 1.1 agc DPRINT(("]"));
91 1.1 agc if (i + 1 <= reach[id].depth)
92 1.1 agc DPRINT((", "));
93 1.1 agc }
94 1.1 agc DPRINT(("\n tags "));
95 1.1 agc for (i = 0; i < num_tags; i++)
96 1.1 agc {
97 1.1 agc DPRINT(("%02d", reach[id].tags[i]));
98 1.1 agc if (i + 1 < num_tags)
99 1.1 agc DPRINT((","));
100 1.1 agc }
101 1.1 agc DPRINT(("\n"));
102 1.1 agc }
103 1.1 agc DPRINT(("\n"));
104 1.1 agc }
105 1.1 agc #endif /* TRE_DEBUG */
106 1.1 agc
107 1.1 agc
108 1.1 agc /* Sets the matching parameters in `reach' to the ones defined in the `pa'
109 1.1 agc array. If `pa' specifies default values, they are taken from
110 1.1 agc `default_params'. */
111 1.1 agc inline static void
112 1.1 agc tre_set_params(tre_tnfa_approx_reach_t *reach,
113 1.1 agc int *pa, regaparams_t default_params)
114 1.1 agc {
115 1.1 agc int value;
116 1.1 agc
117 1.1 agc /* If depth is increased reset costs and counters to zero for the
118 1.1 agc new levels. */
119 1.1 agc value = pa[TRE_PARAM_DEPTH];
120 1.1 agc assert(value <= TRE_M_MAX_DEPTH);
121 1.1 agc if (value > reach->depth)
122 1.1 agc {
123 1.1 agc int i, j;
124 1.1 agc for (i = reach->depth + 1; i <= value; i++)
125 1.1 agc for (j = 0; j < TRE_M_LAST; j++)
126 1.1 agc reach->costs[i][j] = 0;
127 1.1 agc }
128 1.1 agc reach->depth = value;
129 1.1 agc
130 1.1 agc /* Set insert cost. */
131 1.1 agc value = pa[TRE_PARAM_COST_INS];
132 1.1 agc if (value == TRE_PARAM_DEFAULT)
133 1.1 agc reach->params.cost_ins = default_params.cost_ins;
134 1.1 agc else if (value != TRE_PARAM_UNSET)
135 1.1 agc reach->params.cost_ins = value;
136 1.1 agc
137 1.1 agc /* Set delete cost. */
138 1.1 agc value = pa[TRE_PARAM_COST_DEL];
139 1.1 agc if (value == TRE_PARAM_DEFAULT)
140 1.1 agc reach->params.cost_del = default_params.cost_del;
141 1.1 agc else if (value != TRE_PARAM_UNSET)
142 1.1 agc reach->params.cost_del = value;
143 1.1 agc
144 1.1 agc /* Set substitute cost. */
145 1.1 agc value = pa[TRE_PARAM_COST_SUBST];
146 1.1 agc if (value == TRE_PARAM_DEFAULT)
147 1.1 agc reach->params.cost_subst = default_params.cost_subst;
148 1.1 agc else
149 1.1 agc reach->params.cost_subst = value;
150 1.1 agc
151 1.1 agc /* Set maximum cost. */
152 1.1 agc value = pa[TRE_PARAM_COST_MAX];
153 1.1 agc if (value == TRE_PARAM_DEFAULT)
154 1.1 agc reach->params.max_cost = default_params.max_cost;
155 1.1 agc else if (value != TRE_PARAM_UNSET)
156 1.1 agc reach->params.max_cost = value;
157 1.1 agc
158 1.1 agc /* Set maximum inserts. */
159 1.1 agc value = pa[TRE_PARAM_MAX_INS];
160 1.1 agc if (value == TRE_PARAM_DEFAULT)
161 1.1 agc reach->params.max_ins = default_params.max_ins;
162 1.1 agc else if (value != TRE_PARAM_UNSET)
163 1.1 agc reach->params.max_ins = value;
164 1.1 agc
165 1.1 agc /* Set maximum deletes. */
166 1.1 agc value = pa[TRE_PARAM_MAX_DEL];
167 1.1 agc if (value == TRE_PARAM_DEFAULT)
168 1.1 agc reach->params.max_del = default_params.max_del;
169 1.1 agc else if (value != TRE_PARAM_UNSET)
170 1.1 agc reach->params.max_del = value;
171 1.1 agc
172 1.1 agc /* Set maximum substitutes. */
173 1.1 agc value = pa[TRE_PARAM_MAX_SUBST];
174 1.1 agc if (value == TRE_PARAM_DEFAULT)
175 1.1 agc reach->params.max_subst = default_params.max_subst;
176 1.1 agc else if (value != TRE_PARAM_UNSET)
177 1.1 agc reach->params.max_subst = value;
178 1.1 agc
179 1.1 agc /* Set maximum number of errors. */
180 1.1 agc value = pa[TRE_PARAM_MAX_ERR];
181 1.1 agc if (value == TRE_PARAM_DEFAULT)
182 1.1 agc reach->params.max_err = default_params.max_err;
183 1.1 agc else if (value != TRE_PARAM_UNSET)
184 1.1 agc reach->params.max_err = value;
185 1.1 agc }
186 1.1 agc
187 1.1 agc reg_errcode_t
188 1.1 agc tre_tnfa_run_approx(const tre_tnfa_t *tnfa, const void *string, int len,
189 1.1 agc tre_str_type_t type, int *match_tags,
190 1.1 agc regamatch_t *match, regaparams_t default_params,
191 1.1 agc int eflags, int *match_end_ofs)
192 1.1 agc {
193 1.1 agc /* State variables required by GET_NEXT_WCHAR. */
194 1.1 agc tre_char_t prev_c = 0, next_c = 0;
195 1.1 agc const char *str_byte = string;
196 1.1 agc int pos = -1;
197 1.1 agc unsigned int pos_add_next = 1;
198 1.1 agc #ifdef TRE_WCHAR
199 1.1 agc const wchar_t *str_wide = string;
200 1.1 agc #ifdef TRE_MBSTATE
201 1.1 agc mbstate_t mbstate;
202 1.1 agc #endif /* !TRE_WCHAR */
203 1.1 agc #endif /* TRE_WCHAR */
204 1.1 agc int reg_notbol = eflags & REG_NOTBOL;
205 1.1 agc int reg_noteol = eflags & REG_NOTEOL;
206 1.1 agc int reg_newline = tnfa->cflags & REG_NEWLINE;
207 1.2 christos size_t str_user_end = 0;
208 1.1 agc
209 1.1 agc int prev_pos;
210 1.1 agc
211 1.1 agc /* Number of tags. */
212 1.2 christos size_t num_tags;
213 1.1 agc /* The reach tables. */
214 1.1 agc tre_tnfa_approx_reach_t *reach, *reach_next;
215 1.1 agc /* Tag array for temporary use. */
216 1.1 agc int *tmp_tags;
217 1.1 agc
218 1.1 agc /* End offset of best match so far, or -1 if no match found yet. */
219 1.1 agc int match_eo = -1;
220 1.1 agc /* Costs of the match. */
221 1.1 agc int match_costs[TRE_M_LAST];
222 1.1 agc
223 1.1 agc /* Space for temporary data required for matching. */
224 1.1 agc unsigned char *buf;
225 1.1 agc
226 1.2 christos size_t i, id;
227 1.1 agc
228 1.1 agc if (!match_tags)
229 1.1 agc num_tags = 0;
230 1.1 agc else
231 1.1 agc num_tags = tnfa->num_tags;
232 1.1 agc
233 1.1 agc #ifdef TRE_MBSTATE
234 1.1 agc memset(&mbstate, '\0', sizeof(mbstate));
235 1.1 agc #endif /* TRE_MBSTATE */
236 1.1 agc
237 1.1 agc DPRINT(("tre_tnfa_run_approx, input type %d, len %d, eflags %d, "
238 1.1 agc "match_tags %p\n",
239 1.1 agc type, len, eflags,
240 1.1 agc match_tags));
241 1.1 agc DPRINT(("max cost %d, ins %d, del %d, subst %d\n",
242 1.1 agc default_params.max_cost,
243 1.1 agc default_params.cost_ins,
244 1.1 agc default_params.cost_del,
245 1.1 agc default_params.cost_subst));
246 1.1 agc
247 1.1 agc /* Allocate memory for temporary data required for matching. This needs to
248 1.1 agc be done for every matching operation to be thread safe. This allocates
249 1.1 agc everything in a single large block from the stack frame using alloca()
250 1.1 agc or with malloc() if alloca is unavailable. */
251 1.1 agc {
252 1.1 agc unsigned char *buf_cursor;
253 1.1 agc /* Space needed for one array of tags. */
254 1.2 christos size_t tag_bytes = sizeof(*tmp_tags) * num_tags;
255 1.1 agc /* Space needed for one reach table. */
256 1.2 christos size_t reach_bytes = sizeof(*reach_next) * tnfa->num_states;
257 1.1 agc /* Total space needed. */
258 1.2 christos size_t total_bytes = reach_bytes * 2 + (tnfa->num_states * 2 + 1 ) * tag_bytes;
259 1.1 agc /* Add some extra to make sure we can align the pointers. The multiplier
260 1.1 agc used here must be equal to the number of ALIGN calls below. */
261 1.1 agc total_bytes += (sizeof(long) - 1) * 3;
262 1.1 agc
263 1.1 agc /* Allocate the memory. */
264 1.1 agc #ifdef TRE_USE_ALLOCA
265 1.1 agc buf = alloca(total_bytes);
266 1.1 agc #else /* !TRE_USE_ALLOCA */
267 1.1 agc buf = xmalloc((unsigned)total_bytes);
268 1.1 agc #endif /* !TRE_USE_ALLOCA */
269 1.1 agc if (!buf)
270 1.1 agc return REG_ESPACE;
271 1.1 agc memset(buf, 0, (size_t)total_bytes);
272 1.1 agc
273 1.1 agc /* Allocate `tmp_tags' from `buf'. */
274 1.1 agc tmp_tags = (void *)buf;
275 1.1 agc buf_cursor = buf + tag_bytes;
276 1.1 agc buf_cursor += ALIGN(buf_cursor, long);
277 1.1 agc
278 1.1 agc /* Allocate `reach' from `buf'. */
279 1.1 agc reach = (void *)buf_cursor;
280 1.1 agc buf_cursor += reach_bytes;
281 1.1 agc buf_cursor += ALIGN(buf_cursor, long);
282 1.1 agc
283 1.1 agc /* Allocate `reach_next' from `buf'. */
284 1.1 agc reach_next = (void *)buf_cursor;
285 1.1 agc buf_cursor += reach_bytes;
286 1.1 agc buf_cursor += ALIGN(buf_cursor, long);
287 1.1 agc
288 1.1 agc /* Allocate tag arrays for `reach' and `reach_next' from `buf'. */
289 1.1 agc for (i = 0; i < tnfa->num_states; i++)
290 1.1 agc {
291 1.1 agc reach[i].tags = (void *)buf_cursor;
292 1.1 agc buf_cursor += tag_bytes;
293 1.1 agc reach_next[i].tags = (void *)buf_cursor;
294 1.1 agc buf_cursor += tag_bytes;
295 1.1 agc }
296 1.1 agc assert(buf_cursor <= buf + total_bytes);
297 1.1 agc }
298 1.1 agc
299 1.1 agc for (i = 0; i < TRE_M_LAST; i++)
300 1.1 agc match_costs[i] = INT_MAX;
301 1.1 agc
302 1.1 agc /* Mark the reach arrays empty. */
303 1.1 agc for (i = 0; i < tnfa->num_states; i++)
304 1.1 agc reach[i].pos = reach_next[i].pos = -2;
305 1.1 agc
306 1.1 agc prev_pos = pos;
307 1.1 agc GET_NEXT_WCHAR();
308 1.1 agc pos = 0;
309 1.1 agc
310 1.3 rin while (/*CONSTCOND*/(void)1,1)
311 1.1 agc {
312 1.1 agc DPRINT(("%03d:%2lc/%05d\n", pos, (tre_cint_t)next_c, (int)next_c));
313 1.1 agc
314 1.1 agc /* Add initial states to `reach_next' if an exact match has not yet
315 1.1 agc been found. */
316 1.1 agc if (match_costs[TRE_M_COST] > 0)
317 1.1 agc {
318 1.1 agc tre_tnfa_transition_t *trans;
319 1.1 agc DPRINT((" init"));
320 1.1 agc for (trans = tnfa->initial; trans->state; trans++)
321 1.1 agc {
322 1.1 agc int stateid = trans->state_id;
323 1.1 agc
324 1.1 agc /* If this state is not currently in `reach_next', add it
325 1.1 agc there. */
326 1.1 agc if (reach_next[stateid].pos < pos)
327 1.1 agc {
328 1.1 agc if (trans->assertions && CHECK_ASSERTIONS(trans->assertions))
329 1.1 agc {
330 1.1 agc /* Assertions failed, don't add this state. */
331 1.1 agc DPRINT((" !%d (assert)", stateid));
332 1.1 agc continue;
333 1.1 agc }
334 1.1 agc DPRINT((" %d", stateid));
335 1.1 agc reach_next[stateid].state = trans->state;
336 1.1 agc reach_next[stateid].pos = pos;
337 1.1 agc
338 1.1 agc /* Compute tag values after this transition. */
339 1.1 agc for (i = 0; i < num_tags; i++)
340 1.1 agc reach_next[stateid].tags[i] = -1;
341 1.1 agc
342 1.1 agc if (trans->tags)
343 1.1 agc for (i = 0; trans->tags[i] >= 0; i++)
344 1.2 christos if ((size_t)trans->tags[i] < num_tags)
345 1.1 agc reach_next[stateid].tags[trans->tags[i]] = pos;
346 1.1 agc
347 1.1 agc /* Set the parameters, depth, and costs. */
348 1.1 agc reach_next[stateid].params = default_params;
349 1.1 agc reach_next[stateid].depth = 0;
350 1.1 agc for (i = 0; i < TRE_M_LAST; i++)
351 1.1 agc reach_next[stateid].costs[0][i] = 0;
352 1.1 agc if (trans->params)
353 1.1 agc tre_set_params(&reach_next[stateid], trans->params,
354 1.1 agc default_params);
355 1.1 agc
356 1.1 agc /* If this is the final state, mark the exact match. */
357 1.1 agc if (trans->state == tnfa->final)
358 1.1 agc {
359 1.1 agc match_eo = pos;
360 1.1 agc for (i = 0; i < num_tags; i++)
361 1.1 agc match_tags[i] = reach_next[stateid].tags[i];
362 1.1 agc for (i = 0; i < TRE_M_LAST; i++)
363 1.1 agc match_costs[i] = 0;
364 1.1 agc }
365 1.1 agc }
366 1.1 agc }
367 1.1 agc DPRINT(("\n"));
368 1.1 agc }
369 1.1 agc
370 1.1 agc
371 1.1 agc /* Handle inserts. This is done by pretending there's an epsilon
372 1.1 agc transition from each state in `reach' back to the same state.
373 1.1 agc We don't need to worry about the final state here; this will never
374 1.1 agc give a better match than what we already have. */
375 1.1 agc for (id = 0; id < tnfa->num_states; id++)
376 1.1 agc {
377 1.1 agc int depth;
378 1.1 agc int cost, cost0;
379 1.1 agc
380 1.1 agc if (reach[id].pos != prev_pos)
381 1.1 agc {
382 1.1 agc DPRINT((" insert: %d not reached\n", id));
383 1.1 agc continue; /* Not reached. */
384 1.1 agc }
385 1.1 agc
386 1.1 agc depth = reach[id].depth;
387 1.1 agc
388 1.1 agc /* Compute and check cost at current depth. */
389 1.1 agc cost = reach[id].costs[depth][TRE_M_COST];
390 1.1 agc if (reach[id].params.cost_ins != TRE_PARAM_UNSET)
391 1.1 agc cost += reach[id].params.cost_ins;
392 1.1 agc if (cost > reach[id].params.max_cost)
393 1.1 agc continue; /* Cost too large. */
394 1.1 agc
395 1.1 agc /* Check number of inserts at current depth. */
396 1.1 agc if (reach[id].costs[depth][TRE_M_NUM_INS] + 1
397 1.1 agc > reach[id].params.max_ins)
398 1.1 agc continue; /* Too many inserts. */
399 1.1 agc
400 1.1 agc /* Check total number of errors at current depth. */
401 1.1 agc if (reach[id].costs[depth][TRE_M_NUM_ERR] + 1
402 1.1 agc > reach[id].params.max_err)
403 1.1 agc continue; /* Too many errors. */
404 1.1 agc
405 1.1 agc /* Compute overall cost. */
406 1.1 agc cost0 = cost;
407 1.1 agc if (depth > 0)
408 1.1 agc {
409 1.1 agc cost0 = reach[id].costs[0][TRE_M_COST];
410 1.1 agc if (reach[id].params.cost_ins != TRE_PARAM_UNSET)
411 1.1 agc cost0 += reach[id].params.cost_ins;
412 1.1 agc else
413 1.1 agc cost0 += default_params.cost_ins;
414 1.1 agc }
415 1.1 agc
416 1.1 agc DPRINT((" insert: from %d to %d, cost %d: ", id, id,
417 1.1 agc reach[id].costs[depth][TRE_M_COST]));
418 1.1 agc if (reach_next[id].pos == pos
419 1.1 agc && (cost0 >= reach_next[id].costs[0][TRE_M_COST]))
420 1.1 agc {
421 1.1 agc DPRINT(("lose\n"));
422 1.1 agc continue;
423 1.1 agc }
424 1.1 agc DPRINT(("win\n"));
425 1.1 agc
426 1.1 agc /* Copy state, position, tags, parameters, and depth. */
427 1.1 agc reach_next[id].state = reach[id].state;
428 1.1 agc reach_next[id].pos = pos;
429 1.1 agc for (i = 0; i < num_tags; i++)
430 1.1 agc reach_next[id].tags[i] = reach[id].tags[i];
431 1.1 agc reach_next[id].params = reach[id].params;
432 1.1 agc reach_next[id].depth = reach[id].depth;
433 1.1 agc
434 1.1 agc /* Set the costs after this transition. */
435 1.1 agc memcpy(reach_next[id].costs, reach[id].costs,
436 1.1 agc sizeof(reach_next[id].costs[0][0])
437 1.1 agc * TRE_M_LAST * (depth + 1));
438 1.1 agc reach_next[id].costs[depth][TRE_M_COST] = cost;
439 1.1 agc reach_next[id].costs[depth][TRE_M_NUM_INS]++;
440 1.1 agc reach_next[id].costs[depth][TRE_M_NUM_ERR]++;
441 1.1 agc if (depth > 0)
442 1.1 agc {
443 1.1 agc reach_next[id].costs[0][TRE_M_COST] = cost0;
444 1.1 agc reach_next[id].costs[0][TRE_M_NUM_INS]++;
445 1.1 agc reach_next[id].costs[0][TRE_M_NUM_ERR]++;
446 1.1 agc }
447 1.1 agc
448 1.1 agc }
449 1.1 agc
450 1.1 agc
451 1.1 agc /* Handle deletes. This is done by traversing through the whole TNFA
452 1.1 agc pretending that all transitions are epsilon transitions, until
453 1.1 agc no more states can be reached with better costs. */
454 1.1 agc {
455 1.1 agc /* XXX - dynamic ringbuffer size */
456 1.1 agc tre_tnfa_approx_reach_t *ringbuffer[512];
457 1.1 agc tre_tnfa_approx_reach_t **deque_start, **deque_end;
458 1.1 agc
459 1.1 agc deque_start = deque_end = ringbuffer;
460 1.1 agc
461 1.1 agc /* Add all states in `reach_next' to the deque. */
462 1.1 agc for (id = 0; id < tnfa->num_states; id++)
463 1.1 agc {
464 1.1 agc if (reach_next[id].pos != pos)
465 1.1 agc continue;
466 1.1 agc *deque_end = &reach_next[id];
467 1.1 agc deque_end++;
468 1.1 agc assert(deque_end != deque_start);
469 1.1 agc }
470 1.1 agc
471 1.1 agc /* Repeat until the deque is empty. */
472 1.1 agc while (deque_end != deque_start)
473 1.1 agc {
474 1.1 agc tre_tnfa_approx_reach_t *reach_p;
475 1.1 agc int depth;
476 1.1 agc int cost, cost0;
477 1.1 agc tre_tnfa_transition_t *trans;
478 1.1 agc
479 1.1 agc /* Pop the first item off the deque. */
480 1.1 agc reach_p = *deque_start;
481 1.1 agc id = reach_p - reach_next;
482 1.1 agc depth = reach_p->depth;
483 1.1 agc
484 1.1 agc /* Compute cost at current depth. */
485 1.1 agc cost = reach_p->costs[depth][TRE_M_COST];
486 1.1 agc if (reach_p->params.cost_del != TRE_PARAM_UNSET)
487 1.1 agc cost += reach_p->params.cost_del;
488 1.1 agc
489 1.1 agc /* Check cost, number of deletes, and total number of errors
490 1.1 agc at current depth. */
491 1.1 agc if (cost > reach_p->params.max_cost
492 1.1 agc || (reach_p->costs[depth][TRE_M_NUM_DEL] + 1
493 1.1 agc > reach_p->params.max_del)
494 1.1 agc || (reach_p->costs[depth][TRE_M_NUM_ERR] + 1
495 1.1 agc > reach_p->params.max_err))
496 1.1 agc {
497 1.1 agc /* Too many errors or cost too large. */
498 1.1 agc DPRINT((" delete: from %03d: cost too large\n", id));
499 1.1 agc deque_start++;
500 1.1 agc if (deque_start >= (ringbuffer + 512))
501 1.1 agc deque_start = ringbuffer;
502 1.1 agc continue;
503 1.1 agc }
504 1.1 agc
505 1.1 agc /* Compute overall cost. */
506 1.1 agc cost0 = cost;
507 1.1 agc if (depth > 0)
508 1.1 agc {
509 1.1 agc cost0 = reach_p->costs[0][TRE_M_COST];
510 1.1 agc if (reach_p->params.cost_del != TRE_PARAM_UNSET)
511 1.1 agc cost0 += reach_p->params.cost_del;
512 1.1 agc else
513 1.1 agc cost0 += default_params.cost_del;
514 1.1 agc }
515 1.1 agc
516 1.1 agc for (trans = reach_p->state; trans->state; trans++)
517 1.1 agc {
518 1.1 agc int dest_id = trans->state_id;
519 1.1 agc DPRINT((" delete: from %03d to %03d, cost %d (%d): ",
520 1.1 agc id, dest_id, cost0, reach_p->params.max_cost));
521 1.1 agc
522 1.1 agc if (trans->assertions && CHECK_ASSERTIONS(trans->assertions))
523 1.1 agc {
524 1.1 agc DPRINT(("assertion failed\n"));
525 1.1 agc continue;
526 1.1 agc }
527 1.1 agc
528 1.1 agc /* Compute tag values after this transition. */
529 1.1 agc for (i = 0; i < num_tags; i++)
530 1.1 agc tmp_tags[i] = reach_p->tags[i];
531 1.1 agc if (trans->tags)
532 1.1 agc for (i = 0; trans->tags[i] >= 0; i++)
533 1.2 christos if ((size_t)trans->tags[i] < num_tags)
534 1.1 agc tmp_tags[trans->tags[i]] = pos;
535 1.1 agc
536 1.1 agc /* If another path has also reached this state, choose the one
537 1.1 agc with the smallest cost or best tags if costs are equal. */
538 1.1 agc if (reach_next[dest_id].pos == pos
539 1.1 agc && (cost0 > reach_next[dest_id].costs[0][TRE_M_COST]
540 1.1 agc || (cost0 == reach_next[dest_id].costs[0][TRE_M_COST]
541 1.1 agc && (!match_tags
542 1.1 agc || !tre_tag_order(num_tags,
543 1.1 agc tnfa->tag_directions,
544 1.1 agc tmp_tags,
545 1.1 agc reach_next[dest_id].tags)))))
546 1.1 agc {
547 1.1 agc DPRINT(("lose, cost0 %d, have %d\n",
548 1.1 agc cost0, reach_next[dest_id].costs[0][TRE_M_COST]));
549 1.1 agc continue;
550 1.1 agc }
551 1.1 agc DPRINT(("win\n"));
552 1.1 agc
553 1.1 agc /* Set state, position, tags, parameters, depth, and costs. */
554 1.1 agc reach_next[dest_id].state = trans->state;
555 1.1 agc reach_next[dest_id].pos = pos;
556 1.1 agc for (i = 0; i < num_tags; i++)
557 1.1 agc reach_next[dest_id].tags[i] = tmp_tags[i];
558 1.1 agc
559 1.1 agc reach_next[dest_id].params = reach_p->params;
560 1.1 agc if (trans->params)
561 1.1 agc tre_set_params(&reach_next[dest_id], trans->params,
562 1.1 agc default_params);
563 1.1 agc
564 1.1 agc reach_next[dest_id].depth = reach_p->depth;
565 1.1 agc memcpy(&reach_next[dest_id].costs,
566 1.1 agc reach_p->costs,
567 1.1 agc sizeof(reach_p->costs[0][0])
568 1.1 agc * TRE_M_LAST * (depth + 1));
569 1.1 agc reach_next[dest_id].costs[depth][TRE_M_COST] = cost;
570 1.1 agc reach_next[dest_id].costs[depth][TRE_M_NUM_DEL]++;
571 1.1 agc reach_next[dest_id].costs[depth][TRE_M_NUM_ERR]++;
572 1.1 agc if (depth > 0)
573 1.1 agc {
574 1.1 agc reach_next[dest_id].costs[0][TRE_M_COST] = cost0;
575 1.1 agc reach_next[dest_id].costs[0][TRE_M_NUM_DEL]++;
576 1.1 agc reach_next[dest_id].costs[0][TRE_M_NUM_ERR]++;
577 1.1 agc }
578 1.1 agc
579 1.1 agc if (trans->state == tnfa->final
580 1.1 agc && (match_eo < 0
581 1.1 agc || match_costs[TRE_M_COST] > cost0
582 1.1 agc || (match_costs[TRE_M_COST] == cost0
583 1.1 agc && (num_tags > 0
584 1.1 agc && tmp_tags[0] <= match_tags[0]))))
585 1.1 agc {
586 1.1 agc DPRINT((" setting new match at %d, cost %d\n",
587 1.1 agc pos, cost0));
588 1.1 agc match_eo = pos;
589 1.1 agc memcpy(match_costs, reach_next[dest_id].costs[0],
590 1.1 agc sizeof(match_costs[0]) * TRE_M_LAST);
591 1.1 agc for (i = 0; i < num_tags; i++)
592 1.1 agc match_tags[i] = tmp_tags[i];
593 1.1 agc }
594 1.1 agc
595 1.1 agc /* Add to the end of the deque. */
596 1.1 agc *deque_end = &reach_next[dest_id];
597 1.1 agc deque_end++;
598 1.1 agc if (deque_end >= (ringbuffer + 512))
599 1.1 agc deque_end = ringbuffer;
600 1.1 agc assert(deque_end != deque_start);
601 1.1 agc }
602 1.1 agc deque_start++;
603 1.1 agc if (deque_start >= (ringbuffer + 512))
604 1.1 agc deque_start = ringbuffer;
605 1.1 agc }
606 1.1 agc
607 1.1 agc }
608 1.1 agc
609 1.1 agc #ifdef TRE_DEBUG
610 1.1 agc tre_print_reach(tnfa, reach_next, pos, num_tags);
611 1.1 agc #endif /* TRE_DEBUG */
612 1.1 agc
613 1.1 agc /* Check for end of string. */
614 1.1 agc if (len < 0)
615 1.1 agc {
616 1.1 agc if (type == STR_USER)
617 1.1 agc {
618 1.1 agc if (str_user_end)
619 1.1 agc break;
620 1.1 agc }
621 1.1 agc else if (next_c == L'\0')
622 1.1 agc break;
623 1.1 agc }
624 1.1 agc else
625 1.1 agc {
626 1.1 agc if (pos >= len)
627 1.1 agc break;
628 1.1 agc }
629 1.1 agc
630 1.1 agc prev_pos = pos;
631 1.1 agc GET_NEXT_WCHAR();
632 1.1 agc
633 1.1 agc /* Swap `reach' and `reach_next'. */
634 1.1 agc {
635 1.1 agc tre_tnfa_approx_reach_t *tmp;
636 1.1 agc tmp = reach;
637 1.1 agc reach = reach_next;
638 1.1 agc reach_next = tmp;
639 1.1 agc }
640 1.1 agc
641 1.1 agc /* Handle exact matches and substitutions. */
642 1.1 agc for (id = 0; id < tnfa->num_states; id++)
643 1.1 agc {
644 1.1 agc tre_tnfa_transition_t *trans;
645 1.1 agc
646 1.1 agc if (reach[id].pos < prev_pos)
647 1.1 agc continue; /* Not reached. */
648 1.1 agc for (trans = reach[id].state; trans->state; trans++)
649 1.1 agc {
650 1.1 agc int dest_id;
651 1.1 agc int depth;
652 1.1 agc int cost, cost0, err;
653 1.1 agc
654 1.1 agc if (trans->assertions
655 1.1 agc && (CHECK_ASSERTIONS(trans->assertions)
656 1.1 agc || CHECK_CHAR_CLASSES(trans, tnfa, eflags)))
657 1.1 agc {
658 1.1 agc DPRINT((" exact, from %d: assert failed\n", id));
659 1.1 agc continue;
660 1.1 agc }
661 1.1 agc
662 1.1 agc depth = reach[id].depth;
663 1.1 agc dest_id = trans->state_id;
664 1.1 agc
665 1.1 agc cost = reach[id].costs[depth][TRE_M_COST];
666 1.1 agc cost0 = reach[id].costs[0][TRE_M_COST];
667 1.1 agc err = 0;
668 1.1 agc
669 1.1 agc if (trans->code_min > (tre_cint_t)prev_c
670 1.1 agc || trans->code_max < (tre_cint_t)prev_c)
671 1.1 agc {
672 1.1 agc /* Handle substitutes. The required character was not in
673 1.1 agc the string, so match it in place of whatever was supposed
674 1.1 agc to be there and increase costs accordingly. */
675 1.1 agc err = 1;
676 1.1 agc
677 1.1 agc /* Compute and check cost at current depth. */
678 1.1 agc cost = reach[id].costs[depth][TRE_M_COST];
679 1.1 agc if (reach[id].params.cost_subst != TRE_PARAM_UNSET)
680 1.1 agc cost += reach[id].params.cost_subst;
681 1.1 agc if (cost > reach[id].params.max_cost)
682 1.1 agc continue; /* Cost too large. */
683 1.1 agc
684 1.1 agc /* Check number of substitutes at current depth. */
685 1.1 agc if (reach[id].costs[depth][TRE_M_NUM_SUBST] + 1
686 1.1 agc > reach[id].params.max_subst)
687 1.1 agc continue; /* Too many substitutes. */
688 1.1 agc
689 1.1 agc /* Check total number of errors at current depth. */
690 1.1 agc if (reach[id].costs[depth][TRE_M_NUM_ERR] + 1
691 1.1 agc > reach[id].params.max_err)
692 1.1 agc continue; /* Too many errors. */
693 1.1 agc
694 1.1 agc /* Compute overall cost. */
695 1.1 agc cost0 = cost;
696 1.1 agc if (depth > 0)
697 1.1 agc {
698 1.1 agc cost0 = reach[id].costs[0][TRE_M_COST];
699 1.1 agc if (reach[id].params.cost_subst != TRE_PARAM_UNSET)
700 1.1 agc cost0 += reach[id].params.cost_subst;
701 1.1 agc else
702 1.1 agc cost0 += default_params.cost_subst;
703 1.1 agc }
704 1.1 agc DPRINT((" subst, from %03d to %03d, cost %d: ",
705 1.1 agc id, dest_id, cost0));
706 1.1 agc }
707 1.1 agc else
708 1.1 agc DPRINT((" exact, from %03d to %03d, cost %d: ",
709 1.1 agc id, dest_id, cost0));
710 1.1 agc
711 1.1 agc /* Compute tag values after this transition. */
712 1.1 agc for (i = 0; i < num_tags; i++)
713 1.1 agc tmp_tags[i] = reach[id].tags[i];
714 1.1 agc if (trans->tags)
715 1.1 agc for (i = 0; trans->tags[i] >= 0; i++)
716 1.2 christos if ((size_t)trans->tags[i] < num_tags)
717 1.1 agc tmp_tags[trans->tags[i]] = pos;
718 1.1 agc
719 1.1 agc /* If another path has also reached this state, choose the
720 1.1 agc one with the smallest cost or best tags if costs are equal. */
721 1.1 agc if (reach_next[dest_id].pos == pos
722 1.1 agc && (cost0 > reach_next[dest_id].costs[0][TRE_M_COST]
723 1.1 agc || (cost0 == reach_next[dest_id].costs[0][TRE_M_COST]
724 1.1 agc && !tre_tag_order(num_tags, tnfa->tag_directions,
725 1.1 agc tmp_tags,
726 1.1 agc reach_next[dest_id].tags))))
727 1.1 agc {
728 1.1 agc DPRINT(("lose\n"));
729 1.1 agc continue;
730 1.1 agc }
731 1.1 agc DPRINT(("win %d %d\n",
732 1.1 agc reach_next[dest_id].pos,
733 1.1 agc reach_next[dest_id].costs[0][TRE_M_COST]));
734 1.1 agc
735 1.1 agc /* Set state, position, tags, and depth. */
736 1.1 agc reach_next[dest_id].state = trans->state;
737 1.1 agc reach_next[dest_id].pos = pos;
738 1.1 agc for (i = 0; i < num_tags; i++)
739 1.1 agc reach_next[dest_id].tags[i] = tmp_tags[i];
740 1.1 agc reach_next[dest_id].depth = reach[id].depth;
741 1.1 agc
742 1.1 agc /* Set parameters. */
743 1.1 agc reach_next[dest_id].params = reach[id].params;
744 1.1 agc if (trans->params)
745 1.1 agc tre_set_params(&reach_next[dest_id], trans->params,
746 1.1 agc default_params);
747 1.1 agc
748 1.1 agc /* Set the costs after this transition. */
749 1.1 agc memcpy(&reach_next[dest_id].costs,
750 1.1 agc reach[id].costs,
751 1.1 agc sizeof(reach[id].costs[0][0])
752 1.1 agc * TRE_M_LAST * (depth + 1));
753 1.1 agc reach_next[dest_id].costs[depth][TRE_M_COST] = cost;
754 1.1 agc reach_next[dest_id].costs[depth][TRE_M_NUM_SUBST] += err;
755 1.1 agc reach_next[dest_id].costs[depth][TRE_M_NUM_ERR] += err;
756 1.1 agc if (depth > 0)
757 1.1 agc {
758 1.1 agc reach_next[dest_id].costs[0][TRE_M_COST] = cost0;
759 1.1 agc reach_next[dest_id].costs[0][TRE_M_NUM_SUBST] += err;
760 1.1 agc reach_next[dest_id].costs[0][TRE_M_NUM_ERR] += err;
761 1.1 agc }
762 1.1 agc
763 1.1 agc if (trans->state == tnfa->final
764 1.1 agc && (match_eo < 0
765 1.1 agc || cost0 < match_costs[TRE_M_COST]
766 1.1 agc || (cost0 == match_costs[TRE_M_COST]
767 1.1 agc && num_tags > 0 && tmp_tags[0] <= match_tags[0])))
768 1.1 agc {
769 1.1 agc DPRINT((" setting new match at %d, cost %d\n",
770 1.1 agc pos, cost0));
771 1.1 agc match_eo = pos;
772 1.1 agc for (i = 0; i < TRE_M_LAST; i++)
773 1.1 agc match_costs[i] = reach_next[dest_id].costs[0][i];
774 1.1 agc for (i = 0; i < num_tags; i++)
775 1.1 agc match_tags[i] = tmp_tags[i];
776 1.1 agc }
777 1.1 agc }
778 1.1 agc }
779 1.1 agc }
780 1.1 agc
781 1.1 agc DPRINT(("match end offset = %d, match cost = %d\n", match_eo,
782 1.1 agc match_costs[TRE_M_COST]));
783 1.1 agc
784 1.1 agc #ifndef TRE_USE_ALLOCA
785 1.1 agc if (buf)
786 1.1 agc xfree(buf);
787 1.1 agc #endif /* !TRE_USE_ALLOCA */
788 1.1 agc
789 1.1 agc match->cost = match_costs[TRE_M_COST];
790 1.1 agc match->num_ins = match_costs[TRE_M_NUM_INS];
791 1.1 agc match->num_del = match_costs[TRE_M_NUM_DEL];
792 1.1 agc match->num_subst = match_costs[TRE_M_NUM_SUBST];
793 1.1 agc *match_end_ofs = match_eo;
794 1.1 agc
795 1.1 agc return match_eo >= 0 ? REG_OK : REG_NOMATCH;
796 1.1 agc }
797