tre-match-approx.c revision 1.1 1 1.1 agc /*
2 1.1 agc tre-match-approx.c - TRE approximate regex matching engine
3 1.1 agc
4 1.1 agc This software is released under a BSD-style license.
5 1.1 agc See the file LICENSE for details and copyright.
6 1.1 agc
7 1.1 agc */
8 1.1 agc
9 1.1 agc #ifdef HAVE_CONFIG_H
10 1.1 agc #include <config.h>
11 1.1 agc #endif /* HAVE_CONFIG_H */
12 1.1 agc
13 1.1 agc /* AIX requires this to be the first thing in the file. */
14 1.1 agc #ifdef TRE_USE_ALLOCA
15 1.1 agc #ifndef __GNUC__
16 1.1 agc # if HAVE_ALLOCA_H
17 1.1 agc # include <alloca.h>
18 1.1 agc # else
19 1.1 agc # ifdef _AIX
20 1.1 agc #pragma alloca
21 1.1 agc # else
22 1.1 agc # ifndef alloca /* predefined by HP cc +Olibcalls */
23 1.1 agc char *alloca ();
24 1.1 agc # endif
25 1.1 agc # endif
26 1.1 agc # endif
27 1.1 agc #endif
28 1.1 agc #endif /* TRE_USE_ALLOCA */
29 1.1 agc
30 1.1 agc #define __USE_STRING_INLINES
31 1.1 agc #undef __NO_INLINE__
32 1.1 agc
33 1.1 agc #include <assert.h>
34 1.1 agc #include <stdlib.h>
35 1.1 agc #include <string.h>
36 1.1 agc #include <limits.h>
37 1.1 agc #ifdef HAVE_WCHAR_H
38 1.1 agc #include <wchar.h>
39 1.1 agc #endif /* HAVE_WCHAR_H */
40 1.1 agc #ifdef HAVE_WCTYPE_H
41 1.1 agc #include <wctype.h>
42 1.1 agc #endif /* HAVE_WCTYPE_H */
43 1.1 agc #ifndef TRE_WCHAR
44 1.1 agc #include <ctype.h>
45 1.1 agc #endif /* !TRE_WCHAR */
46 1.1 agc #ifdef HAVE_MALLOC_H
47 1.1 agc #include <malloc.h>
48 1.1 agc #endif /* HAVE_MALLOC_H */
49 1.1 agc
50 1.1 agc #include "tre-internal.h"
51 1.1 agc #include "tre-match-utils.h"
52 1.1 agc #include "tre.h"
53 1.1 agc #include "xmalloc.h"
54 1.1 agc
55 1.1 agc #define TRE_M_COST 0
56 1.1 agc #define TRE_M_NUM_INS 1
57 1.1 agc #define TRE_M_NUM_DEL 2
58 1.1 agc #define TRE_M_NUM_SUBST 3
59 1.1 agc #define TRE_M_NUM_ERR 4
60 1.1 agc #define TRE_M_LAST 5
61 1.1 agc
62 1.1 agc #define TRE_M_MAX_DEPTH 3
63 1.1 agc
64 1.1 agc typedef struct {
65 1.1 agc /* State in the TNFA transition table. */
66 1.1 agc tre_tnfa_transition_t *state;
67 1.1 agc /* Position in input string. */
68 1.1 agc int pos;
69 1.1 agc /* Tag values. */
70 1.1 agc int *tags;
71 1.1 agc /* Matching parameters. */
72 1.1 agc regaparams_t params;
73 1.1 agc /* Nesting depth of parameters. This is used as an index in
74 1.1 agc the `costs' array. */
75 1.1 agc int depth;
76 1.1 agc /* Costs and counter values for different parameter nesting depths. */
77 1.1 agc int costs[TRE_M_MAX_DEPTH + 1][TRE_M_LAST];
78 1.1 agc } tre_tnfa_approx_reach_t;
79 1.1 agc
80 1.1 agc
81 1.1 agc #ifdef TRE_DEBUG
82 1.1 agc /* Prints the `reach' array in a readable fashion with DPRINT. */
83 1.1 agc static void
84 1.1 agc tre_print_reach(const tre_tnfa_t *tnfa, tre_tnfa_approx_reach_t *reach,
85 1.1 agc int pos, int num_tags)
86 1.1 agc {
87 1.1 agc int id;
88 1.1 agc
89 1.1 agc /* Print each state on one line. */
90 1.1 agc DPRINT((" reach:\n"));
91 1.1 agc for (id = 0; id < tnfa->num_states; id++)
92 1.1 agc {
93 1.1 agc int i, j;
94 1.1 agc if (reach[id].pos < pos)
95 1.1 agc continue; /* Not reached. */
96 1.1 agc DPRINT((" %03d, costs ", id));
97 1.1 agc for (i = 0; i <= reach[id].depth; i++)
98 1.1 agc {
99 1.1 agc DPRINT(("["));
100 1.1 agc for (j = 0; j < TRE_M_LAST; j++)
101 1.1 agc {
102 1.1 agc DPRINT(("%2d", reach[id].costs[i][j]));
103 1.1 agc if (j + 1 < TRE_M_LAST)
104 1.1 agc DPRINT((","));
105 1.1 agc }
106 1.1 agc DPRINT(("]"));
107 1.1 agc if (i + 1 <= reach[id].depth)
108 1.1 agc DPRINT((", "));
109 1.1 agc }
110 1.1 agc DPRINT(("\n tags "));
111 1.1 agc for (i = 0; i < num_tags; i++)
112 1.1 agc {
113 1.1 agc DPRINT(("%02d", reach[id].tags[i]));
114 1.1 agc if (i + 1 < num_tags)
115 1.1 agc DPRINT((","));
116 1.1 agc }
117 1.1 agc DPRINT(("\n"));
118 1.1 agc }
119 1.1 agc DPRINT(("\n"));
120 1.1 agc }
121 1.1 agc #endif /* TRE_DEBUG */
122 1.1 agc
123 1.1 agc
124 1.1 agc /* Sets the matching parameters in `reach' to the ones defined in the `pa'
125 1.1 agc array. If `pa' specifies default values, they are taken from
126 1.1 agc `default_params'. */
127 1.1 agc inline static void
128 1.1 agc tre_set_params(tre_tnfa_approx_reach_t *reach,
129 1.1 agc int *pa, regaparams_t default_params)
130 1.1 agc {
131 1.1 agc int value;
132 1.1 agc
133 1.1 agc /* If depth is increased reset costs and counters to zero for the
134 1.1 agc new levels. */
135 1.1 agc value = pa[TRE_PARAM_DEPTH];
136 1.1 agc assert(value <= TRE_M_MAX_DEPTH);
137 1.1 agc if (value > reach->depth)
138 1.1 agc {
139 1.1 agc int i, j;
140 1.1 agc for (i = reach->depth + 1; i <= value; i++)
141 1.1 agc for (j = 0; j < TRE_M_LAST; j++)
142 1.1 agc reach->costs[i][j] = 0;
143 1.1 agc }
144 1.1 agc reach->depth = value;
145 1.1 agc
146 1.1 agc /* Set insert cost. */
147 1.1 agc value = pa[TRE_PARAM_COST_INS];
148 1.1 agc if (value == TRE_PARAM_DEFAULT)
149 1.1 agc reach->params.cost_ins = default_params.cost_ins;
150 1.1 agc else if (value != TRE_PARAM_UNSET)
151 1.1 agc reach->params.cost_ins = value;
152 1.1 agc
153 1.1 agc /* Set delete cost. */
154 1.1 agc value = pa[TRE_PARAM_COST_DEL];
155 1.1 agc if (value == TRE_PARAM_DEFAULT)
156 1.1 agc reach->params.cost_del = default_params.cost_del;
157 1.1 agc else if (value != TRE_PARAM_UNSET)
158 1.1 agc reach->params.cost_del = value;
159 1.1 agc
160 1.1 agc /* Set substitute cost. */
161 1.1 agc value = pa[TRE_PARAM_COST_SUBST];
162 1.1 agc if (value == TRE_PARAM_DEFAULT)
163 1.1 agc reach->params.cost_subst = default_params.cost_subst;
164 1.1 agc else
165 1.1 agc reach->params.cost_subst = value;
166 1.1 agc
167 1.1 agc /* Set maximum cost. */
168 1.1 agc value = pa[TRE_PARAM_COST_MAX];
169 1.1 agc if (value == TRE_PARAM_DEFAULT)
170 1.1 agc reach->params.max_cost = default_params.max_cost;
171 1.1 agc else if (value != TRE_PARAM_UNSET)
172 1.1 agc reach->params.max_cost = value;
173 1.1 agc
174 1.1 agc /* Set maximum inserts. */
175 1.1 agc value = pa[TRE_PARAM_MAX_INS];
176 1.1 agc if (value == TRE_PARAM_DEFAULT)
177 1.1 agc reach->params.max_ins = default_params.max_ins;
178 1.1 agc else if (value != TRE_PARAM_UNSET)
179 1.1 agc reach->params.max_ins = value;
180 1.1 agc
181 1.1 agc /* Set maximum deletes. */
182 1.1 agc value = pa[TRE_PARAM_MAX_DEL];
183 1.1 agc if (value == TRE_PARAM_DEFAULT)
184 1.1 agc reach->params.max_del = default_params.max_del;
185 1.1 agc else if (value != TRE_PARAM_UNSET)
186 1.1 agc reach->params.max_del = value;
187 1.1 agc
188 1.1 agc /* Set maximum substitutes. */
189 1.1 agc value = pa[TRE_PARAM_MAX_SUBST];
190 1.1 agc if (value == TRE_PARAM_DEFAULT)
191 1.1 agc reach->params.max_subst = default_params.max_subst;
192 1.1 agc else if (value != TRE_PARAM_UNSET)
193 1.1 agc reach->params.max_subst = value;
194 1.1 agc
195 1.1 agc /* Set maximum number of errors. */
196 1.1 agc value = pa[TRE_PARAM_MAX_ERR];
197 1.1 agc if (value == TRE_PARAM_DEFAULT)
198 1.1 agc reach->params.max_err = default_params.max_err;
199 1.1 agc else if (value != TRE_PARAM_UNSET)
200 1.1 agc reach->params.max_err = value;
201 1.1 agc }
202 1.1 agc
203 1.1 agc reg_errcode_t
204 1.1 agc tre_tnfa_run_approx(const tre_tnfa_t *tnfa, const void *string, int len,
205 1.1 agc tre_str_type_t type, int *match_tags,
206 1.1 agc regamatch_t *match, regaparams_t default_params,
207 1.1 agc int eflags, int *match_end_ofs)
208 1.1 agc {
209 1.1 agc /* State variables required by GET_NEXT_WCHAR. */
210 1.1 agc tre_char_t prev_c = 0, next_c = 0;
211 1.1 agc const char *str_byte = string;
212 1.1 agc int pos = -1;
213 1.1 agc unsigned int pos_add_next = 1;
214 1.1 agc #ifdef TRE_WCHAR
215 1.1 agc const wchar_t *str_wide = string;
216 1.1 agc #ifdef TRE_MBSTATE
217 1.1 agc mbstate_t mbstate;
218 1.1 agc #endif /* !TRE_WCHAR */
219 1.1 agc #endif /* TRE_WCHAR */
220 1.1 agc int reg_notbol = eflags & REG_NOTBOL;
221 1.1 agc int reg_noteol = eflags & REG_NOTEOL;
222 1.1 agc int reg_newline = tnfa->cflags & REG_NEWLINE;
223 1.1 agc int str_user_end = 0;
224 1.1 agc
225 1.1 agc int prev_pos;
226 1.1 agc
227 1.1 agc /* Number of tags. */
228 1.1 agc int num_tags;
229 1.1 agc /* The reach tables. */
230 1.1 agc tre_tnfa_approx_reach_t *reach, *reach_next;
231 1.1 agc /* Tag array for temporary use. */
232 1.1 agc int *tmp_tags;
233 1.1 agc
234 1.1 agc /* End offset of best match so far, or -1 if no match found yet. */
235 1.1 agc int match_eo = -1;
236 1.1 agc /* Costs of the match. */
237 1.1 agc int match_costs[TRE_M_LAST];
238 1.1 agc
239 1.1 agc /* Space for temporary data required for matching. */
240 1.1 agc unsigned char *buf;
241 1.1 agc
242 1.1 agc int i, id;
243 1.1 agc
244 1.1 agc if (!match_tags)
245 1.1 agc num_tags = 0;
246 1.1 agc else
247 1.1 agc num_tags = tnfa->num_tags;
248 1.1 agc
249 1.1 agc #ifdef TRE_MBSTATE
250 1.1 agc memset(&mbstate, '\0', sizeof(mbstate));
251 1.1 agc #endif /* TRE_MBSTATE */
252 1.1 agc
253 1.1 agc DPRINT(("tre_tnfa_run_approx, input type %d, len %d, eflags %d, "
254 1.1 agc "match_tags %p\n",
255 1.1 agc type, len, eflags,
256 1.1 agc match_tags));
257 1.1 agc DPRINT(("max cost %d, ins %d, del %d, subst %d\n",
258 1.1 agc default_params.max_cost,
259 1.1 agc default_params.cost_ins,
260 1.1 agc default_params.cost_del,
261 1.1 agc default_params.cost_subst));
262 1.1 agc
263 1.1 agc /* Allocate memory for temporary data required for matching. This needs to
264 1.1 agc be done for every matching operation to be thread safe. This allocates
265 1.1 agc everything in a single large block from the stack frame using alloca()
266 1.1 agc or with malloc() if alloca is unavailable. */
267 1.1 agc {
268 1.1 agc unsigned char *buf_cursor;
269 1.1 agc /* Space needed for one array of tags. */
270 1.1 agc int tag_bytes = sizeof(*tmp_tags) * num_tags;
271 1.1 agc /* Space needed for one reach table. */
272 1.1 agc int reach_bytes = sizeof(*reach_next) * tnfa->num_states;
273 1.1 agc /* Total space needed. */
274 1.1 agc int total_bytes = reach_bytes * 2 + (tnfa->num_states * 2 + 1 ) * tag_bytes;
275 1.1 agc /* Add some extra to make sure we can align the pointers. The multiplier
276 1.1 agc used here must be equal to the number of ALIGN calls below. */
277 1.1 agc total_bytes += (sizeof(long) - 1) * 3;
278 1.1 agc
279 1.1 agc /* Allocate the memory. */
280 1.1 agc #ifdef TRE_USE_ALLOCA
281 1.1 agc buf = alloca(total_bytes);
282 1.1 agc #else /* !TRE_USE_ALLOCA */
283 1.1 agc buf = xmalloc((unsigned)total_bytes);
284 1.1 agc #endif /* !TRE_USE_ALLOCA */
285 1.1 agc if (!buf)
286 1.1 agc return REG_ESPACE;
287 1.1 agc memset(buf, 0, (size_t)total_bytes);
288 1.1 agc
289 1.1 agc /* Allocate `tmp_tags' from `buf'. */
290 1.1 agc tmp_tags = (void *)buf;
291 1.1 agc buf_cursor = buf + tag_bytes;
292 1.1 agc buf_cursor += ALIGN(buf_cursor, long);
293 1.1 agc
294 1.1 agc /* Allocate `reach' from `buf'. */
295 1.1 agc reach = (void *)buf_cursor;
296 1.1 agc buf_cursor += reach_bytes;
297 1.1 agc buf_cursor += ALIGN(buf_cursor, long);
298 1.1 agc
299 1.1 agc /* Allocate `reach_next' from `buf'. */
300 1.1 agc reach_next = (void *)buf_cursor;
301 1.1 agc buf_cursor += reach_bytes;
302 1.1 agc buf_cursor += ALIGN(buf_cursor, long);
303 1.1 agc
304 1.1 agc /* Allocate tag arrays for `reach' and `reach_next' from `buf'. */
305 1.1 agc for (i = 0; i < tnfa->num_states; i++)
306 1.1 agc {
307 1.1 agc reach[i].tags = (void *)buf_cursor;
308 1.1 agc buf_cursor += tag_bytes;
309 1.1 agc reach_next[i].tags = (void *)buf_cursor;
310 1.1 agc buf_cursor += tag_bytes;
311 1.1 agc }
312 1.1 agc assert(buf_cursor <= buf + total_bytes);
313 1.1 agc }
314 1.1 agc
315 1.1 agc for (i = 0; i < TRE_M_LAST; i++)
316 1.1 agc match_costs[i] = INT_MAX;
317 1.1 agc
318 1.1 agc /* Mark the reach arrays empty. */
319 1.1 agc for (i = 0; i < tnfa->num_states; i++)
320 1.1 agc reach[i].pos = reach_next[i].pos = -2;
321 1.1 agc
322 1.1 agc prev_pos = pos;
323 1.1 agc GET_NEXT_WCHAR();
324 1.1 agc pos = 0;
325 1.1 agc
326 1.1 agc while (/*CONSTCOND*/1)
327 1.1 agc {
328 1.1 agc DPRINT(("%03d:%2lc/%05d\n", pos, (tre_cint_t)next_c, (int)next_c));
329 1.1 agc
330 1.1 agc /* Add initial states to `reach_next' if an exact match has not yet
331 1.1 agc been found. */
332 1.1 agc if (match_costs[TRE_M_COST] > 0)
333 1.1 agc {
334 1.1 agc tre_tnfa_transition_t *trans;
335 1.1 agc DPRINT((" init"));
336 1.1 agc for (trans = tnfa->initial; trans->state; trans++)
337 1.1 agc {
338 1.1 agc int stateid = trans->state_id;
339 1.1 agc
340 1.1 agc /* If this state is not currently in `reach_next', add it
341 1.1 agc there. */
342 1.1 agc if (reach_next[stateid].pos < pos)
343 1.1 agc {
344 1.1 agc if (trans->assertions && CHECK_ASSERTIONS(trans->assertions))
345 1.1 agc {
346 1.1 agc /* Assertions failed, don't add this state. */
347 1.1 agc DPRINT((" !%d (assert)", stateid));
348 1.1 agc continue;
349 1.1 agc }
350 1.1 agc DPRINT((" %d", stateid));
351 1.1 agc reach_next[stateid].state = trans->state;
352 1.1 agc reach_next[stateid].pos = pos;
353 1.1 agc
354 1.1 agc /* Compute tag values after this transition. */
355 1.1 agc for (i = 0; i < num_tags; i++)
356 1.1 agc reach_next[stateid].tags[i] = -1;
357 1.1 agc
358 1.1 agc if (trans->tags)
359 1.1 agc for (i = 0; trans->tags[i] >= 0; i++)
360 1.1 agc if (trans->tags[i] < num_tags)
361 1.1 agc reach_next[stateid].tags[trans->tags[i]] = pos;
362 1.1 agc
363 1.1 agc /* Set the parameters, depth, and costs. */
364 1.1 agc reach_next[stateid].params = default_params;
365 1.1 agc reach_next[stateid].depth = 0;
366 1.1 agc for (i = 0; i < TRE_M_LAST; i++)
367 1.1 agc reach_next[stateid].costs[0][i] = 0;
368 1.1 agc if (trans->params)
369 1.1 agc tre_set_params(&reach_next[stateid], trans->params,
370 1.1 agc default_params);
371 1.1 agc
372 1.1 agc /* If this is the final state, mark the exact match. */
373 1.1 agc if (trans->state == tnfa->final)
374 1.1 agc {
375 1.1 agc match_eo = pos;
376 1.1 agc for (i = 0; i < num_tags; i++)
377 1.1 agc match_tags[i] = reach_next[stateid].tags[i];
378 1.1 agc for (i = 0; i < TRE_M_LAST; i++)
379 1.1 agc match_costs[i] = 0;
380 1.1 agc }
381 1.1 agc }
382 1.1 agc }
383 1.1 agc DPRINT(("\n"));
384 1.1 agc }
385 1.1 agc
386 1.1 agc
387 1.1 agc /* Handle inserts. This is done by pretending there's an epsilon
388 1.1 agc transition from each state in `reach' back to the same state.
389 1.1 agc We don't need to worry about the final state here; this will never
390 1.1 agc give a better match than what we already have. */
391 1.1 agc for (id = 0; id < tnfa->num_states; id++)
392 1.1 agc {
393 1.1 agc int depth;
394 1.1 agc int cost, cost0;
395 1.1 agc
396 1.1 agc if (reach[id].pos != prev_pos)
397 1.1 agc {
398 1.1 agc DPRINT((" insert: %d not reached\n", id));
399 1.1 agc continue; /* Not reached. */
400 1.1 agc }
401 1.1 agc
402 1.1 agc depth = reach[id].depth;
403 1.1 agc
404 1.1 agc /* Compute and check cost at current depth. */
405 1.1 agc cost = reach[id].costs[depth][TRE_M_COST];
406 1.1 agc if (reach[id].params.cost_ins != TRE_PARAM_UNSET)
407 1.1 agc cost += reach[id].params.cost_ins;
408 1.1 agc if (cost > reach[id].params.max_cost)
409 1.1 agc continue; /* Cost too large. */
410 1.1 agc
411 1.1 agc /* Check number of inserts at current depth. */
412 1.1 agc if (reach[id].costs[depth][TRE_M_NUM_INS] + 1
413 1.1 agc > reach[id].params.max_ins)
414 1.1 agc continue; /* Too many inserts. */
415 1.1 agc
416 1.1 agc /* Check total number of errors at current depth. */
417 1.1 agc if (reach[id].costs[depth][TRE_M_NUM_ERR] + 1
418 1.1 agc > reach[id].params.max_err)
419 1.1 agc continue; /* Too many errors. */
420 1.1 agc
421 1.1 agc /* Compute overall cost. */
422 1.1 agc cost0 = cost;
423 1.1 agc if (depth > 0)
424 1.1 agc {
425 1.1 agc cost0 = reach[id].costs[0][TRE_M_COST];
426 1.1 agc if (reach[id].params.cost_ins != TRE_PARAM_UNSET)
427 1.1 agc cost0 += reach[id].params.cost_ins;
428 1.1 agc else
429 1.1 agc cost0 += default_params.cost_ins;
430 1.1 agc }
431 1.1 agc
432 1.1 agc DPRINT((" insert: from %d to %d, cost %d: ", id, id,
433 1.1 agc reach[id].costs[depth][TRE_M_COST]));
434 1.1 agc if (reach_next[id].pos == pos
435 1.1 agc && (cost0 >= reach_next[id].costs[0][TRE_M_COST]))
436 1.1 agc {
437 1.1 agc DPRINT(("lose\n"));
438 1.1 agc continue;
439 1.1 agc }
440 1.1 agc DPRINT(("win\n"));
441 1.1 agc
442 1.1 agc /* Copy state, position, tags, parameters, and depth. */
443 1.1 agc reach_next[id].state = reach[id].state;
444 1.1 agc reach_next[id].pos = pos;
445 1.1 agc for (i = 0; i < num_tags; i++)
446 1.1 agc reach_next[id].tags[i] = reach[id].tags[i];
447 1.1 agc reach_next[id].params = reach[id].params;
448 1.1 agc reach_next[id].depth = reach[id].depth;
449 1.1 agc
450 1.1 agc /* Set the costs after this transition. */
451 1.1 agc memcpy(reach_next[id].costs, reach[id].costs,
452 1.1 agc sizeof(reach_next[id].costs[0][0])
453 1.1 agc * TRE_M_LAST * (depth + 1));
454 1.1 agc reach_next[id].costs[depth][TRE_M_COST] = cost;
455 1.1 agc reach_next[id].costs[depth][TRE_M_NUM_INS]++;
456 1.1 agc reach_next[id].costs[depth][TRE_M_NUM_ERR]++;
457 1.1 agc if (depth > 0)
458 1.1 agc {
459 1.1 agc reach_next[id].costs[0][TRE_M_COST] = cost0;
460 1.1 agc reach_next[id].costs[0][TRE_M_NUM_INS]++;
461 1.1 agc reach_next[id].costs[0][TRE_M_NUM_ERR]++;
462 1.1 agc }
463 1.1 agc
464 1.1 agc }
465 1.1 agc
466 1.1 agc
467 1.1 agc /* Handle deletes. This is done by traversing through the whole TNFA
468 1.1 agc pretending that all transitions are epsilon transitions, until
469 1.1 agc no more states can be reached with better costs. */
470 1.1 agc {
471 1.1 agc /* XXX - dynamic ringbuffer size */
472 1.1 agc tre_tnfa_approx_reach_t *ringbuffer[512];
473 1.1 agc tre_tnfa_approx_reach_t **deque_start, **deque_end;
474 1.1 agc
475 1.1 agc deque_start = deque_end = ringbuffer;
476 1.1 agc
477 1.1 agc /* Add all states in `reach_next' to the deque. */
478 1.1 agc for (id = 0; id < tnfa->num_states; id++)
479 1.1 agc {
480 1.1 agc if (reach_next[id].pos != pos)
481 1.1 agc continue;
482 1.1 agc *deque_end = &reach_next[id];
483 1.1 agc deque_end++;
484 1.1 agc assert(deque_end != deque_start);
485 1.1 agc }
486 1.1 agc
487 1.1 agc /* Repeat until the deque is empty. */
488 1.1 agc while (deque_end != deque_start)
489 1.1 agc {
490 1.1 agc tre_tnfa_approx_reach_t *reach_p;
491 1.1 agc int depth;
492 1.1 agc int cost, cost0;
493 1.1 agc tre_tnfa_transition_t *trans;
494 1.1 agc
495 1.1 agc /* Pop the first item off the deque. */
496 1.1 agc reach_p = *deque_start;
497 1.1 agc id = reach_p - reach_next;
498 1.1 agc depth = reach_p->depth;
499 1.1 agc
500 1.1 agc /* Compute cost at current depth. */
501 1.1 agc cost = reach_p->costs[depth][TRE_M_COST];
502 1.1 agc if (reach_p->params.cost_del != TRE_PARAM_UNSET)
503 1.1 agc cost += reach_p->params.cost_del;
504 1.1 agc
505 1.1 agc /* Check cost, number of deletes, and total number of errors
506 1.1 agc at current depth. */
507 1.1 agc if (cost > reach_p->params.max_cost
508 1.1 agc || (reach_p->costs[depth][TRE_M_NUM_DEL] + 1
509 1.1 agc > reach_p->params.max_del)
510 1.1 agc || (reach_p->costs[depth][TRE_M_NUM_ERR] + 1
511 1.1 agc > reach_p->params.max_err))
512 1.1 agc {
513 1.1 agc /* Too many errors or cost too large. */
514 1.1 agc DPRINT((" delete: from %03d: cost too large\n", id));
515 1.1 agc deque_start++;
516 1.1 agc if (deque_start >= (ringbuffer + 512))
517 1.1 agc deque_start = ringbuffer;
518 1.1 agc continue;
519 1.1 agc }
520 1.1 agc
521 1.1 agc /* Compute overall cost. */
522 1.1 agc cost0 = cost;
523 1.1 agc if (depth > 0)
524 1.1 agc {
525 1.1 agc cost0 = reach_p->costs[0][TRE_M_COST];
526 1.1 agc if (reach_p->params.cost_del != TRE_PARAM_UNSET)
527 1.1 agc cost0 += reach_p->params.cost_del;
528 1.1 agc else
529 1.1 agc cost0 += default_params.cost_del;
530 1.1 agc }
531 1.1 agc
532 1.1 agc for (trans = reach_p->state; trans->state; trans++)
533 1.1 agc {
534 1.1 agc int dest_id = trans->state_id;
535 1.1 agc DPRINT((" delete: from %03d to %03d, cost %d (%d): ",
536 1.1 agc id, dest_id, cost0, reach_p->params.max_cost));
537 1.1 agc
538 1.1 agc if (trans->assertions && CHECK_ASSERTIONS(trans->assertions))
539 1.1 agc {
540 1.1 agc DPRINT(("assertion failed\n"));
541 1.1 agc continue;
542 1.1 agc }
543 1.1 agc
544 1.1 agc /* Compute tag values after this transition. */
545 1.1 agc for (i = 0; i < num_tags; i++)
546 1.1 agc tmp_tags[i] = reach_p->tags[i];
547 1.1 agc if (trans->tags)
548 1.1 agc for (i = 0; trans->tags[i] >= 0; i++)
549 1.1 agc if (trans->tags[i] < num_tags)
550 1.1 agc tmp_tags[trans->tags[i]] = pos;
551 1.1 agc
552 1.1 agc /* If another path has also reached this state, choose the one
553 1.1 agc with the smallest cost or best tags if costs are equal. */
554 1.1 agc if (reach_next[dest_id].pos == pos
555 1.1 agc && (cost0 > reach_next[dest_id].costs[0][TRE_M_COST]
556 1.1 agc || (cost0 == reach_next[dest_id].costs[0][TRE_M_COST]
557 1.1 agc && (!match_tags
558 1.1 agc || !tre_tag_order(num_tags,
559 1.1 agc tnfa->tag_directions,
560 1.1 agc tmp_tags,
561 1.1 agc reach_next[dest_id].tags)))))
562 1.1 agc {
563 1.1 agc DPRINT(("lose, cost0 %d, have %d\n",
564 1.1 agc cost0, reach_next[dest_id].costs[0][TRE_M_COST]));
565 1.1 agc continue;
566 1.1 agc }
567 1.1 agc DPRINT(("win\n"));
568 1.1 agc
569 1.1 agc /* Set state, position, tags, parameters, depth, and costs. */
570 1.1 agc reach_next[dest_id].state = trans->state;
571 1.1 agc reach_next[dest_id].pos = pos;
572 1.1 agc for (i = 0; i < num_tags; i++)
573 1.1 agc reach_next[dest_id].tags[i] = tmp_tags[i];
574 1.1 agc
575 1.1 agc reach_next[dest_id].params = reach_p->params;
576 1.1 agc if (trans->params)
577 1.1 agc tre_set_params(&reach_next[dest_id], trans->params,
578 1.1 agc default_params);
579 1.1 agc
580 1.1 agc reach_next[dest_id].depth = reach_p->depth;
581 1.1 agc memcpy(&reach_next[dest_id].costs,
582 1.1 agc reach_p->costs,
583 1.1 agc sizeof(reach_p->costs[0][0])
584 1.1 agc * TRE_M_LAST * (depth + 1));
585 1.1 agc reach_next[dest_id].costs[depth][TRE_M_COST] = cost;
586 1.1 agc reach_next[dest_id].costs[depth][TRE_M_NUM_DEL]++;
587 1.1 agc reach_next[dest_id].costs[depth][TRE_M_NUM_ERR]++;
588 1.1 agc if (depth > 0)
589 1.1 agc {
590 1.1 agc reach_next[dest_id].costs[0][TRE_M_COST] = cost0;
591 1.1 agc reach_next[dest_id].costs[0][TRE_M_NUM_DEL]++;
592 1.1 agc reach_next[dest_id].costs[0][TRE_M_NUM_ERR]++;
593 1.1 agc }
594 1.1 agc
595 1.1 agc if (trans->state == tnfa->final
596 1.1 agc && (match_eo < 0
597 1.1 agc || match_costs[TRE_M_COST] > cost0
598 1.1 agc || (match_costs[TRE_M_COST] == cost0
599 1.1 agc && (num_tags > 0
600 1.1 agc && tmp_tags[0] <= match_tags[0]))))
601 1.1 agc {
602 1.1 agc DPRINT((" setting new match at %d, cost %d\n",
603 1.1 agc pos, cost0));
604 1.1 agc match_eo = pos;
605 1.1 agc memcpy(match_costs, reach_next[dest_id].costs[0],
606 1.1 agc sizeof(match_costs[0]) * TRE_M_LAST);
607 1.1 agc for (i = 0; i < num_tags; i++)
608 1.1 agc match_tags[i] = tmp_tags[i];
609 1.1 agc }
610 1.1 agc
611 1.1 agc /* Add to the end of the deque. */
612 1.1 agc *deque_end = &reach_next[dest_id];
613 1.1 agc deque_end++;
614 1.1 agc if (deque_end >= (ringbuffer + 512))
615 1.1 agc deque_end = ringbuffer;
616 1.1 agc assert(deque_end != deque_start);
617 1.1 agc }
618 1.1 agc deque_start++;
619 1.1 agc if (deque_start >= (ringbuffer + 512))
620 1.1 agc deque_start = ringbuffer;
621 1.1 agc }
622 1.1 agc
623 1.1 agc }
624 1.1 agc
625 1.1 agc #ifdef TRE_DEBUG
626 1.1 agc tre_print_reach(tnfa, reach_next, pos, num_tags);
627 1.1 agc #endif /* TRE_DEBUG */
628 1.1 agc
629 1.1 agc /* Check for end of string. */
630 1.1 agc if (len < 0)
631 1.1 agc {
632 1.1 agc if (type == STR_USER)
633 1.1 agc {
634 1.1 agc if (str_user_end)
635 1.1 agc break;
636 1.1 agc }
637 1.1 agc else if (next_c == L'\0')
638 1.1 agc break;
639 1.1 agc }
640 1.1 agc else
641 1.1 agc {
642 1.1 agc if (pos >= len)
643 1.1 agc break;
644 1.1 agc }
645 1.1 agc
646 1.1 agc prev_pos = pos;
647 1.1 agc GET_NEXT_WCHAR();
648 1.1 agc
649 1.1 agc /* Swap `reach' and `reach_next'. */
650 1.1 agc {
651 1.1 agc tre_tnfa_approx_reach_t *tmp;
652 1.1 agc tmp = reach;
653 1.1 agc reach = reach_next;
654 1.1 agc reach_next = tmp;
655 1.1 agc }
656 1.1 agc
657 1.1 agc /* Handle exact matches and substitutions. */
658 1.1 agc for (id = 0; id < tnfa->num_states; id++)
659 1.1 agc {
660 1.1 agc tre_tnfa_transition_t *trans;
661 1.1 agc
662 1.1 agc if (reach[id].pos < prev_pos)
663 1.1 agc continue; /* Not reached. */
664 1.1 agc for (trans = reach[id].state; trans->state; trans++)
665 1.1 agc {
666 1.1 agc int dest_id;
667 1.1 agc int depth;
668 1.1 agc int cost, cost0, err;
669 1.1 agc
670 1.1 agc if (trans->assertions
671 1.1 agc && (CHECK_ASSERTIONS(trans->assertions)
672 1.1 agc || CHECK_CHAR_CLASSES(trans, tnfa, eflags)))
673 1.1 agc {
674 1.1 agc DPRINT((" exact, from %d: assert failed\n", id));
675 1.1 agc continue;
676 1.1 agc }
677 1.1 agc
678 1.1 agc depth = reach[id].depth;
679 1.1 agc dest_id = trans->state_id;
680 1.1 agc
681 1.1 agc cost = reach[id].costs[depth][TRE_M_COST];
682 1.1 agc cost0 = reach[id].costs[0][TRE_M_COST];
683 1.1 agc err = 0;
684 1.1 agc
685 1.1 agc if (trans->code_min > (tre_cint_t)prev_c
686 1.1 agc || trans->code_max < (tre_cint_t)prev_c)
687 1.1 agc {
688 1.1 agc /* Handle substitutes. The required character was not in
689 1.1 agc the string, so match it in place of whatever was supposed
690 1.1 agc to be there and increase costs accordingly. */
691 1.1 agc err = 1;
692 1.1 agc
693 1.1 agc /* Compute and check cost at current depth. */
694 1.1 agc cost = reach[id].costs[depth][TRE_M_COST];
695 1.1 agc if (reach[id].params.cost_subst != TRE_PARAM_UNSET)
696 1.1 agc cost += reach[id].params.cost_subst;
697 1.1 agc if (cost > reach[id].params.max_cost)
698 1.1 agc continue; /* Cost too large. */
699 1.1 agc
700 1.1 agc /* Check number of substitutes at current depth. */
701 1.1 agc if (reach[id].costs[depth][TRE_M_NUM_SUBST] + 1
702 1.1 agc > reach[id].params.max_subst)
703 1.1 agc continue; /* Too many substitutes. */
704 1.1 agc
705 1.1 agc /* Check total number of errors at current depth. */
706 1.1 agc if (reach[id].costs[depth][TRE_M_NUM_ERR] + 1
707 1.1 agc > reach[id].params.max_err)
708 1.1 agc continue; /* Too many errors. */
709 1.1 agc
710 1.1 agc /* Compute overall cost. */
711 1.1 agc cost0 = cost;
712 1.1 agc if (depth > 0)
713 1.1 agc {
714 1.1 agc cost0 = reach[id].costs[0][TRE_M_COST];
715 1.1 agc if (reach[id].params.cost_subst != TRE_PARAM_UNSET)
716 1.1 agc cost0 += reach[id].params.cost_subst;
717 1.1 agc else
718 1.1 agc cost0 += default_params.cost_subst;
719 1.1 agc }
720 1.1 agc DPRINT((" subst, from %03d to %03d, cost %d: ",
721 1.1 agc id, dest_id, cost0));
722 1.1 agc }
723 1.1 agc else
724 1.1 agc DPRINT((" exact, from %03d to %03d, cost %d: ",
725 1.1 agc id, dest_id, cost0));
726 1.1 agc
727 1.1 agc /* Compute tag values after this transition. */
728 1.1 agc for (i = 0; i < num_tags; i++)
729 1.1 agc tmp_tags[i] = reach[id].tags[i];
730 1.1 agc if (trans->tags)
731 1.1 agc for (i = 0; trans->tags[i] >= 0; i++)
732 1.1 agc if (trans->tags[i] < num_tags)
733 1.1 agc tmp_tags[trans->tags[i]] = pos;
734 1.1 agc
735 1.1 agc /* If another path has also reached this state, choose the
736 1.1 agc one with the smallest cost or best tags if costs are equal. */
737 1.1 agc if (reach_next[dest_id].pos == pos
738 1.1 agc && (cost0 > reach_next[dest_id].costs[0][TRE_M_COST]
739 1.1 agc || (cost0 == reach_next[dest_id].costs[0][TRE_M_COST]
740 1.1 agc && !tre_tag_order(num_tags, tnfa->tag_directions,
741 1.1 agc tmp_tags,
742 1.1 agc reach_next[dest_id].tags))))
743 1.1 agc {
744 1.1 agc DPRINT(("lose\n"));
745 1.1 agc continue;
746 1.1 agc }
747 1.1 agc DPRINT(("win %d %d\n",
748 1.1 agc reach_next[dest_id].pos,
749 1.1 agc reach_next[dest_id].costs[0][TRE_M_COST]));
750 1.1 agc
751 1.1 agc /* Set state, position, tags, and depth. */
752 1.1 agc reach_next[dest_id].state = trans->state;
753 1.1 agc reach_next[dest_id].pos = pos;
754 1.1 agc for (i = 0; i < num_tags; i++)
755 1.1 agc reach_next[dest_id].tags[i] = tmp_tags[i];
756 1.1 agc reach_next[dest_id].depth = reach[id].depth;
757 1.1 agc
758 1.1 agc /* Set parameters. */
759 1.1 agc reach_next[dest_id].params = reach[id].params;
760 1.1 agc if (trans->params)
761 1.1 agc tre_set_params(&reach_next[dest_id], trans->params,
762 1.1 agc default_params);
763 1.1 agc
764 1.1 agc /* Set the costs after this transition. */
765 1.1 agc memcpy(&reach_next[dest_id].costs,
766 1.1 agc reach[id].costs,
767 1.1 agc sizeof(reach[id].costs[0][0])
768 1.1 agc * TRE_M_LAST * (depth + 1));
769 1.1 agc reach_next[dest_id].costs[depth][TRE_M_COST] = cost;
770 1.1 agc reach_next[dest_id].costs[depth][TRE_M_NUM_SUBST] += err;
771 1.1 agc reach_next[dest_id].costs[depth][TRE_M_NUM_ERR] += err;
772 1.1 agc if (depth > 0)
773 1.1 agc {
774 1.1 agc reach_next[dest_id].costs[0][TRE_M_COST] = cost0;
775 1.1 agc reach_next[dest_id].costs[0][TRE_M_NUM_SUBST] += err;
776 1.1 agc reach_next[dest_id].costs[0][TRE_M_NUM_ERR] += err;
777 1.1 agc }
778 1.1 agc
779 1.1 agc if (trans->state == tnfa->final
780 1.1 agc && (match_eo < 0
781 1.1 agc || cost0 < match_costs[TRE_M_COST]
782 1.1 agc || (cost0 == match_costs[TRE_M_COST]
783 1.1 agc && num_tags > 0 && tmp_tags[0] <= match_tags[0])))
784 1.1 agc {
785 1.1 agc DPRINT((" setting new match at %d, cost %d\n",
786 1.1 agc pos, cost0));
787 1.1 agc match_eo = pos;
788 1.1 agc for (i = 0; i < TRE_M_LAST; i++)
789 1.1 agc match_costs[i] = reach_next[dest_id].costs[0][i];
790 1.1 agc for (i = 0; i < num_tags; i++)
791 1.1 agc match_tags[i] = tmp_tags[i];
792 1.1 agc }
793 1.1 agc }
794 1.1 agc }
795 1.1 agc }
796 1.1 agc
797 1.1 agc DPRINT(("match end offset = %d, match cost = %d\n", match_eo,
798 1.1 agc match_costs[TRE_M_COST]));
799 1.1 agc
800 1.1 agc #ifndef TRE_USE_ALLOCA
801 1.1 agc if (buf)
802 1.1 agc xfree(buf);
803 1.1 agc #endif /* !TRE_USE_ALLOCA */
804 1.1 agc
805 1.1 agc match->cost = match_costs[TRE_M_COST];
806 1.1 agc match->num_ins = match_costs[TRE_M_NUM_INS];
807 1.1 agc match->num_del = match_costs[TRE_M_NUM_DEL];
808 1.1 agc match->num_subst = match_costs[TRE_M_NUM_SUBST];
809 1.1 agc *match_end_ofs = match_eo;
810 1.1 agc
811 1.1 agc return match_eo >= 0 ? REG_OK : REG_NOMATCH;
812 1.1 agc }
813