1 1.1 agc /* 2 1.1 agc tre-match-approx.c - TRE approximate regex matching engine 3 1.1 agc 4 1.1 agc This software is released under a BSD-style license. 5 1.1 agc See the file LICENSE for details and copyright. 6 1.1 agc 7 1.1 agc */ 8 1.1 agc #ifdef HAVE_CONFIG_H 9 1.1 agc #include <config.h> 10 1.1 agc #endif /* HAVE_CONFIG_H */ 11 1.1 agc 12 1.2 christos #include "tre-alloca.h" 13 1.1 agc 14 1.1 agc #define __USE_STRING_INLINES 15 1.1 agc #undef __NO_INLINE__ 16 1.1 agc 17 1.1 agc #include <assert.h> 18 1.1 agc #include <stdlib.h> 19 1.1 agc #include <string.h> 20 1.1 agc #include <limits.h> 21 1.1 agc #ifdef HAVE_WCHAR_H 22 1.1 agc #include <wchar.h> 23 1.1 agc #endif /* HAVE_WCHAR_H */ 24 1.1 agc #ifdef HAVE_WCTYPE_H 25 1.1 agc #include <wctype.h> 26 1.1 agc #endif /* HAVE_WCTYPE_H */ 27 1.1 agc #ifndef TRE_WCHAR 28 1.1 agc #include <ctype.h> 29 1.1 agc #endif /* !TRE_WCHAR */ 30 1.1 agc #ifdef HAVE_MALLOC_H 31 1.1 agc #include <malloc.h> 32 1.1 agc #endif /* HAVE_MALLOC_H */ 33 1.6 rin #include <stdint.h> 34 1.1 agc 35 1.1 agc #include "tre-internal.h" 36 1.1 agc #include "tre-match-utils.h" 37 1.1 agc #include "tre.h" 38 1.1 agc #include "xmalloc.h" 39 1.1 agc 40 1.1 agc #define TRE_M_COST 0 41 1.1 agc #define TRE_M_NUM_INS 1 42 1.1 agc #define TRE_M_NUM_DEL 2 43 1.1 agc #define TRE_M_NUM_SUBST 3 44 1.1 agc #define TRE_M_NUM_ERR 4 45 1.1 agc #define TRE_M_LAST 5 46 1.1 agc 47 1.1 agc #define TRE_M_MAX_DEPTH 3 48 1.1 agc 49 1.1 agc typedef struct { 50 1.1 agc /* State in the TNFA transition table. */ 51 1.1 agc tre_tnfa_transition_t *state; 52 1.1 agc /* Position in input string. */ 53 1.1 agc int pos; 54 1.1 agc /* Tag values. */ 55 1.1 agc int *tags; 56 1.1 agc /* Matching parameters. */ 57 1.1 agc regaparams_t params; 58 1.1 agc /* Nesting depth of parameters. This is used as an index in 59 1.1 agc the `costs' array. */ 60 1.1 agc int depth; 61 1.1 agc /* Costs and counter values for different parameter nesting depths. */ 62 1.1 agc int costs[TRE_M_MAX_DEPTH + 1][TRE_M_LAST]; 63 1.1 agc } tre_tnfa_approx_reach_t; 64 1.1 agc 65 1.1 agc 66 1.1 agc #ifdef TRE_DEBUG 67 1.1 agc /* Prints the `reach' array in a readable fashion with DPRINT. */ 68 1.1 agc static void 69 1.1 agc tre_print_reach(const tre_tnfa_t *tnfa, tre_tnfa_approx_reach_t *reach, 70 1.2 christos int pos, size_t num_tags) 71 1.1 agc { 72 1.1 agc int id; 73 1.1 agc 74 1.1 agc /* Print each state on one line. */ 75 1.1 agc DPRINT((" reach:\n")); 76 1.1 agc for (id = 0; id < tnfa->num_states; id++) 77 1.1 agc { 78 1.1 agc int i, j; 79 1.1 agc if (reach[id].pos < pos) 80 1.1 agc continue; /* Not reached. */ 81 1.1 agc DPRINT((" %03d, costs ", id)); 82 1.1 agc for (i = 0; i <= reach[id].depth; i++) 83 1.1 agc { 84 1.1 agc DPRINT(("[")); 85 1.1 agc for (j = 0; j < TRE_M_LAST; j++) 86 1.1 agc { 87 1.1 agc DPRINT(("%2d", reach[id].costs[i][j])); 88 1.1 agc if (j + 1 < TRE_M_LAST) 89 1.1 agc DPRINT((",")); 90 1.1 agc } 91 1.1 agc DPRINT(("]")); 92 1.1 agc if (i + 1 <= reach[id].depth) 93 1.1 agc DPRINT((", ")); 94 1.1 agc } 95 1.1 agc DPRINT(("\n tags ")); 96 1.1 agc for (i = 0; i < num_tags; i++) 97 1.1 agc { 98 1.1 agc DPRINT(("%02d", reach[id].tags[i])); 99 1.1 agc if (i + 1 < num_tags) 100 1.1 agc DPRINT((",")); 101 1.1 agc } 102 1.1 agc DPRINT(("\n")); 103 1.1 agc } 104 1.1 agc DPRINT(("\n")); 105 1.1 agc } 106 1.1 agc #endif /* TRE_DEBUG */ 107 1.1 agc 108 1.1 agc 109 1.1 agc /* Sets the matching parameters in `reach' to the ones defined in the `pa' 110 1.1 agc array. If `pa' specifies default values, they are taken from 111 1.1 agc `default_params'. */ 112 1.1 agc inline static void 113 1.1 agc tre_set_params(tre_tnfa_approx_reach_t *reach, 114 1.1 agc int *pa, regaparams_t default_params) 115 1.1 agc { 116 1.1 agc int value; 117 1.1 agc 118 1.1 agc /* If depth is increased reset costs and counters to zero for the 119 1.1 agc new levels. */ 120 1.1 agc value = pa[TRE_PARAM_DEPTH]; 121 1.1 agc assert(value <= TRE_M_MAX_DEPTH); 122 1.1 agc if (value > reach->depth) 123 1.1 agc { 124 1.1 agc int i, j; 125 1.1 agc for (i = reach->depth + 1; i <= value; i++) 126 1.1 agc for (j = 0; j < TRE_M_LAST; j++) 127 1.1 agc reach->costs[i][j] = 0; 128 1.1 agc } 129 1.1 agc reach->depth = value; 130 1.1 agc 131 1.1 agc /* Set insert cost. */ 132 1.1 agc value = pa[TRE_PARAM_COST_INS]; 133 1.1 agc if (value == TRE_PARAM_DEFAULT) 134 1.1 agc reach->params.cost_ins = default_params.cost_ins; 135 1.1 agc else if (value != TRE_PARAM_UNSET) 136 1.1 agc reach->params.cost_ins = value; 137 1.1 agc 138 1.1 agc /* Set delete cost. */ 139 1.1 agc value = pa[TRE_PARAM_COST_DEL]; 140 1.1 agc if (value == TRE_PARAM_DEFAULT) 141 1.1 agc reach->params.cost_del = default_params.cost_del; 142 1.1 agc else if (value != TRE_PARAM_UNSET) 143 1.1 agc reach->params.cost_del = value; 144 1.1 agc 145 1.1 agc /* Set substitute cost. */ 146 1.1 agc value = pa[TRE_PARAM_COST_SUBST]; 147 1.1 agc if (value == TRE_PARAM_DEFAULT) 148 1.1 agc reach->params.cost_subst = default_params.cost_subst; 149 1.1 agc else 150 1.1 agc reach->params.cost_subst = value; 151 1.1 agc 152 1.1 agc /* Set maximum cost. */ 153 1.1 agc value = pa[TRE_PARAM_COST_MAX]; 154 1.1 agc if (value == TRE_PARAM_DEFAULT) 155 1.1 agc reach->params.max_cost = default_params.max_cost; 156 1.1 agc else if (value != TRE_PARAM_UNSET) 157 1.1 agc reach->params.max_cost = value; 158 1.1 agc 159 1.1 agc /* Set maximum inserts. */ 160 1.1 agc value = pa[TRE_PARAM_MAX_INS]; 161 1.1 agc if (value == TRE_PARAM_DEFAULT) 162 1.1 agc reach->params.max_ins = default_params.max_ins; 163 1.1 agc else if (value != TRE_PARAM_UNSET) 164 1.1 agc reach->params.max_ins = value; 165 1.1 agc 166 1.1 agc /* Set maximum deletes. */ 167 1.1 agc value = pa[TRE_PARAM_MAX_DEL]; 168 1.1 agc if (value == TRE_PARAM_DEFAULT) 169 1.1 agc reach->params.max_del = default_params.max_del; 170 1.1 agc else if (value != TRE_PARAM_UNSET) 171 1.1 agc reach->params.max_del = value; 172 1.1 agc 173 1.1 agc /* Set maximum substitutes. */ 174 1.1 agc value = pa[TRE_PARAM_MAX_SUBST]; 175 1.1 agc if (value == TRE_PARAM_DEFAULT) 176 1.1 agc reach->params.max_subst = default_params.max_subst; 177 1.1 agc else if (value != TRE_PARAM_UNSET) 178 1.1 agc reach->params.max_subst = value; 179 1.1 agc 180 1.1 agc /* Set maximum number of errors. */ 181 1.1 agc value = pa[TRE_PARAM_MAX_ERR]; 182 1.1 agc if (value == TRE_PARAM_DEFAULT) 183 1.1 agc reach->params.max_err = default_params.max_err; 184 1.1 agc else if (value != TRE_PARAM_UNSET) 185 1.1 agc reach->params.max_err = value; 186 1.1 agc } 187 1.1 agc 188 1.1 agc reg_errcode_t 189 1.1 agc tre_tnfa_run_approx(const tre_tnfa_t *tnfa, const void *string, int len, 190 1.1 agc tre_str_type_t type, int *match_tags, 191 1.1 agc regamatch_t *match, regaparams_t default_params, 192 1.1 agc int eflags, int *match_end_ofs) 193 1.1 agc { 194 1.1 agc /* State variables required by GET_NEXT_WCHAR. */ 195 1.1 agc tre_char_t prev_c = 0, next_c = 0; 196 1.1 agc const char *str_byte = string; 197 1.1 agc int pos = -1; 198 1.1 agc unsigned int pos_add_next = 1; 199 1.1 agc #ifdef TRE_WCHAR 200 1.1 agc const wchar_t *str_wide = string; 201 1.1 agc #ifdef TRE_MBSTATE 202 1.1 agc mbstate_t mbstate; 203 1.1 agc #endif /* !TRE_WCHAR */ 204 1.1 agc #endif /* TRE_WCHAR */ 205 1.1 agc int reg_notbol = eflags & REG_NOTBOL; 206 1.1 agc int reg_noteol = eflags & REG_NOTEOL; 207 1.1 agc int reg_newline = tnfa->cflags & REG_NEWLINE; 208 1.2 christos size_t str_user_end = 0; 209 1.1 agc 210 1.1 agc int prev_pos; 211 1.1 agc 212 1.1 agc /* Number of tags. */ 213 1.2 christos size_t num_tags; 214 1.1 agc /* The reach tables. */ 215 1.1 agc tre_tnfa_approx_reach_t *reach, *reach_next; 216 1.1 agc /* Tag array for temporary use. */ 217 1.1 agc int *tmp_tags; 218 1.1 agc 219 1.1 agc /* End offset of best match so far, or -1 if no match found yet. */ 220 1.1 agc int match_eo = -1; 221 1.1 agc /* Costs of the match. */ 222 1.1 agc int match_costs[TRE_M_LAST]; 223 1.1 agc 224 1.1 agc /* Space for temporary data required for matching. */ 225 1.1 agc unsigned char *buf; 226 1.1 agc 227 1.2 christos size_t i, id; 228 1.1 agc 229 1.4 rin reg_errcode_t ret; 230 1.4 rin 231 1.1 agc if (!match_tags) 232 1.1 agc num_tags = 0; 233 1.1 agc else 234 1.1 agc num_tags = tnfa->num_tags; 235 1.1 agc 236 1.1 agc #ifdef TRE_MBSTATE 237 1.1 agc memset(&mbstate, '\0', sizeof(mbstate)); 238 1.1 agc #endif /* TRE_MBSTATE */ 239 1.1 agc 240 1.1 agc DPRINT(("tre_tnfa_run_approx, input type %d, len %d, eflags %d, " 241 1.1 agc "match_tags %p\n", 242 1.1 agc type, len, eflags, 243 1.1 agc match_tags)); 244 1.1 agc DPRINT(("max cost %d, ins %d, del %d, subst %d\n", 245 1.1 agc default_params.max_cost, 246 1.1 agc default_params.cost_ins, 247 1.1 agc default_params.cost_del, 248 1.1 agc default_params.cost_subst)); 249 1.1 agc 250 1.1 agc /* Allocate memory for temporary data required for matching. This needs to 251 1.1 agc be done for every matching operation to be thread safe. This allocates 252 1.1 agc everything in a single large block from the stack frame using alloca() 253 1.1 agc or with malloc() if alloca is unavailable. */ 254 1.1 agc { 255 1.1 agc unsigned char *buf_cursor; 256 1.5 rin 257 1.5 rin /* Ensure that tag_bytes*num_states cannot overflow, and that it don't 258 1.5 rin * contribute more than 1/8 of SIZE_MAX to total_bytes. */ 259 1.5 rin if (num_tags > SIZE_MAX/(8 * sizeof(*tmp_tags) * tnfa->num_states)) 260 1.5 rin return REG_ESPACE; 261 1.5 rin 262 1.5 rin /* Likewise check reach_bytes. */ 263 1.5 rin if (tnfa->num_states > SIZE_MAX/(8 * sizeof(*reach_next))) 264 1.5 rin return REG_ESPACE; 265 1.5 rin 266 1.1 agc /* Space needed for one array of tags. */ 267 1.2 christos size_t tag_bytes = sizeof(*tmp_tags) * num_tags; 268 1.1 agc /* Space needed for one reach table. */ 269 1.2 christos size_t reach_bytes = sizeof(*reach_next) * tnfa->num_states; 270 1.1 agc /* Total space needed. */ 271 1.2 christos size_t total_bytes = reach_bytes * 2 + (tnfa->num_states * 2 + 1 ) * tag_bytes; 272 1.1 agc /* Add some extra to make sure we can align the pointers. The multiplier 273 1.1 agc used here must be equal to the number of ALIGN calls below. */ 274 1.1 agc total_bytes += (sizeof(long) - 1) * 3; 275 1.1 agc 276 1.1 agc /* Allocate the memory. */ 277 1.1 agc #ifdef TRE_USE_ALLOCA 278 1.1 agc buf = alloca(total_bytes); 279 1.1 agc #else /* !TRE_USE_ALLOCA */ 280 1.1 agc buf = xmalloc((unsigned)total_bytes); 281 1.1 agc #endif /* !TRE_USE_ALLOCA */ 282 1.1 agc if (!buf) 283 1.1 agc return REG_ESPACE; 284 1.1 agc memset(buf, 0, (size_t)total_bytes); 285 1.1 agc 286 1.1 agc /* Allocate `tmp_tags' from `buf'. */ 287 1.1 agc tmp_tags = (void *)buf; 288 1.1 agc buf_cursor = buf + tag_bytes; 289 1.1 agc buf_cursor += ALIGN(buf_cursor, long); 290 1.1 agc 291 1.1 agc /* Allocate `reach' from `buf'. */ 292 1.1 agc reach = (void *)buf_cursor; 293 1.1 agc buf_cursor += reach_bytes; 294 1.1 agc buf_cursor += ALIGN(buf_cursor, long); 295 1.1 agc 296 1.1 agc /* Allocate `reach_next' from `buf'. */ 297 1.1 agc reach_next = (void *)buf_cursor; 298 1.1 agc buf_cursor += reach_bytes; 299 1.1 agc buf_cursor += ALIGN(buf_cursor, long); 300 1.1 agc 301 1.1 agc /* Allocate tag arrays for `reach' and `reach_next' from `buf'. */ 302 1.1 agc for (i = 0; i < tnfa->num_states; i++) 303 1.1 agc { 304 1.1 agc reach[i].tags = (void *)buf_cursor; 305 1.1 agc buf_cursor += tag_bytes; 306 1.1 agc reach_next[i].tags = (void *)buf_cursor; 307 1.1 agc buf_cursor += tag_bytes; 308 1.1 agc } 309 1.1 agc assert(buf_cursor <= buf + total_bytes); 310 1.1 agc } 311 1.1 agc 312 1.1 agc for (i = 0; i < TRE_M_LAST; i++) 313 1.1 agc match_costs[i] = INT_MAX; 314 1.1 agc 315 1.1 agc /* Mark the reach arrays empty. */ 316 1.1 agc for (i = 0; i < tnfa->num_states; i++) 317 1.1 agc reach[i].pos = reach_next[i].pos = -2; 318 1.1 agc 319 1.1 agc prev_pos = pos; 320 1.1 agc GET_NEXT_WCHAR(); 321 1.1 agc pos = 0; 322 1.1 agc 323 1.3 rin while (/*CONSTCOND*/(void)1,1) 324 1.1 agc { 325 1.1 agc DPRINT(("%03d:%2lc/%05d\n", pos, (tre_cint_t)next_c, (int)next_c)); 326 1.1 agc 327 1.1 agc /* Add initial states to `reach_next' if an exact match has not yet 328 1.1 agc been found. */ 329 1.1 agc if (match_costs[TRE_M_COST] > 0) 330 1.1 agc { 331 1.1 agc tre_tnfa_transition_t *trans; 332 1.1 agc DPRINT((" init")); 333 1.1 agc for (trans = tnfa->initial; trans->state; trans++) 334 1.1 agc { 335 1.1 agc int stateid = trans->state_id; 336 1.1 agc 337 1.1 agc /* If this state is not currently in `reach_next', add it 338 1.1 agc there. */ 339 1.1 agc if (reach_next[stateid].pos < pos) 340 1.1 agc { 341 1.1 agc if (trans->assertions && CHECK_ASSERTIONS(trans->assertions)) 342 1.1 agc { 343 1.1 agc /* Assertions failed, don't add this state. */ 344 1.1 agc DPRINT((" !%d (assert)", stateid)); 345 1.1 agc continue; 346 1.1 agc } 347 1.1 agc DPRINT((" %d", stateid)); 348 1.1 agc reach_next[stateid].state = trans->state; 349 1.1 agc reach_next[stateid].pos = pos; 350 1.1 agc 351 1.1 agc /* Compute tag values after this transition. */ 352 1.1 agc for (i = 0; i < num_tags; i++) 353 1.1 agc reach_next[stateid].tags[i] = -1; 354 1.1 agc 355 1.1 agc if (trans->tags) 356 1.1 agc for (i = 0; trans->tags[i] >= 0; i++) 357 1.2 christos if ((size_t)trans->tags[i] < num_tags) 358 1.1 agc reach_next[stateid].tags[trans->tags[i]] = pos; 359 1.1 agc 360 1.1 agc /* Set the parameters, depth, and costs. */ 361 1.1 agc reach_next[stateid].params = default_params; 362 1.1 agc reach_next[stateid].depth = 0; 363 1.1 agc for (i = 0; i < TRE_M_LAST; i++) 364 1.1 agc reach_next[stateid].costs[0][i] = 0; 365 1.1 agc if (trans->params) 366 1.1 agc tre_set_params(&reach_next[stateid], trans->params, 367 1.1 agc default_params); 368 1.1 agc 369 1.1 agc /* If this is the final state, mark the exact match. */ 370 1.1 agc if (trans->state == tnfa->final) 371 1.1 agc { 372 1.1 agc match_eo = pos; 373 1.1 agc for (i = 0; i < num_tags; i++) 374 1.1 agc match_tags[i] = reach_next[stateid].tags[i]; 375 1.1 agc for (i = 0; i < TRE_M_LAST; i++) 376 1.1 agc match_costs[i] = 0; 377 1.1 agc } 378 1.1 agc } 379 1.1 agc } 380 1.1 agc DPRINT(("\n")); 381 1.1 agc } 382 1.1 agc 383 1.1 agc 384 1.1 agc /* Handle inserts. This is done by pretending there's an epsilon 385 1.1 agc transition from each state in `reach' back to the same state. 386 1.1 agc We don't need to worry about the final state here; this will never 387 1.1 agc give a better match than what we already have. */ 388 1.1 agc for (id = 0; id < tnfa->num_states; id++) 389 1.1 agc { 390 1.1 agc int depth; 391 1.1 agc int cost, cost0; 392 1.1 agc 393 1.1 agc if (reach[id].pos != prev_pos) 394 1.1 agc { 395 1.1 agc DPRINT((" insert: %d not reached\n", id)); 396 1.1 agc continue; /* Not reached. */ 397 1.1 agc } 398 1.1 agc 399 1.1 agc depth = reach[id].depth; 400 1.1 agc 401 1.1 agc /* Compute and check cost at current depth. */ 402 1.1 agc cost = reach[id].costs[depth][TRE_M_COST]; 403 1.1 agc if (reach[id].params.cost_ins != TRE_PARAM_UNSET) 404 1.1 agc cost += reach[id].params.cost_ins; 405 1.1 agc if (cost > reach[id].params.max_cost) 406 1.1 agc continue; /* Cost too large. */ 407 1.1 agc 408 1.1 agc /* Check number of inserts at current depth. */ 409 1.1 agc if (reach[id].costs[depth][TRE_M_NUM_INS] + 1 410 1.1 agc > reach[id].params.max_ins) 411 1.1 agc continue; /* Too many inserts. */ 412 1.1 agc 413 1.1 agc /* Check total number of errors at current depth. */ 414 1.1 agc if (reach[id].costs[depth][TRE_M_NUM_ERR] + 1 415 1.1 agc > reach[id].params.max_err) 416 1.1 agc continue; /* Too many errors. */ 417 1.1 agc 418 1.1 agc /* Compute overall cost. */ 419 1.1 agc cost0 = cost; 420 1.1 agc if (depth > 0) 421 1.1 agc { 422 1.1 agc cost0 = reach[id].costs[0][TRE_M_COST]; 423 1.1 agc if (reach[id].params.cost_ins != TRE_PARAM_UNSET) 424 1.1 agc cost0 += reach[id].params.cost_ins; 425 1.1 agc else 426 1.1 agc cost0 += default_params.cost_ins; 427 1.1 agc } 428 1.1 agc 429 1.1 agc DPRINT((" insert: from %d to %d, cost %d: ", id, id, 430 1.1 agc reach[id].costs[depth][TRE_M_COST])); 431 1.1 agc if (reach_next[id].pos == pos 432 1.1 agc && (cost0 >= reach_next[id].costs[0][TRE_M_COST])) 433 1.1 agc { 434 1.1 agc DPRINT(("lose\n")); 435 1.1 agc continue; 436 1.1 agc } 437 1.1 agc DPRINT(("win\n")); 438 1.1 agc 439 1.1 agc /* Copy state, position, tags, parameters, and depth. */ 440 1.1 agc reach_next[id].state = reach[id].state; 441 1.1 agc reach_next[id].pos = pos; 442 1.1 agc for (i = 0; i < num_tags; i++) 443 1.1 agc reach_next[id].tags[i] = reach[id].tags[i]; 444 1.1 agc reach_next[id].params = reach[id].params; 445 1.1 agc reach_next[id].depth = reach[id].depth; 446 1.1 agc 447 1.1 agc /* Set the costs after this transition. */ 448 1.1 agc memcpy(reach_next[id].costs, reach[id].costs, 449 1.1 agc sizeof(reach_next[id].costs[0][0]) 450 1.1 agc * TRE_M_LAST * (depth + 1)); 451 1.1 agc reach_next[id].costs[depth][TRE_M_COST] = cost; 452 1.1 agc reach_next[id].costs[depth][TRE_M_NUM_INS]++; 453 1.1 agc reach_next[id].costs[depth][TRE_M_NUM_ERR]++; 454 1.1 agc if (depth > 0) 455 1.1 agc { 456 1.1 agc reach_next[id].costs[0][TRE_M_COST] = cost0; 457 1.1 agc reach_next[id].costs[0][TRE_M_NUM_INS]++; 458 1.1 agc reach_next[id].costs[0][TRE_M_NUM_ERR]++; 459 1.1 agc } 460 1.1 agc 461 1.1 agc } 462 1.1 agc 463 1.1 agc 464 1.1 agc /* Handle deletes. This is done by traversing through the whole TNFA 465 1.1 agc pretending that all transitions are epsilon transitions, until 466 1.1 agc no more states can be reached with better costs. */ 467 1.1 agc { 468 1.1 agc /* XXX - dynamic ringbuffer size */ 469 1.1 agc tre_tnfa_approx_reach_t *ringbuffer[512]; 470 1.1 agc tre_tnfa_approx_reach_t **deque_start, **deque_end; 471 1.1 agc 472 1.1 agc deque_start = deque_end = ringbuffer; 473 1.1 agc 474 1.1 agc /* Add all states in `reach_next' to the deque. */ 475 1.1 agc for (id = 0; id < tnfa->num_states; id++) 476 1.1 agc { 477 1.1 agc if (reach_next[id].pos != pos) 478 1.1 agc continue; 479 1.1 agc *deque_end = &reach_next[id]; 480 1.1 agc deque_end++; 481 1.1 agc assert(deque_end != deque_start); 482 1.1 agc } 483 1.1 agc 484 1.1 agc /* Repeat until the deque is empty. */ 485 1.1 agc while (deque_end != deque_start) 486 1.1 agc { 487 1.1 agc tre_tnfa_approx_reach_t *reach_p; 488 1.1 agc int depth; 489 1.1 agc int cost, cost0; 490 1.1 agc tre_tnfa_transition_t *trans; 491 1.1 agc 492 1.1 agc /* Pop the first item off the deque. */ 493 1.1 agc reach_p = *deque_start; 494 1.1 agc id = reach_p - reach_next; 495 1.1 agc depth = reach_p->depth; 496 1.1 agc 497 1.1 agc /* Compute cost at current depth. */ 498 1.1 agc cost = reach_p->costs[depth][TRE_M_COST]; 499 1.1 agc if (reach_p->params.cost_del != TRE_PARAM_UNSET) 500 1.1 agc cost += reach_p->params.cost_del; 501 1.1 agc 502 1.1 agc /* Check cost, number of deletes, and total number of errors 503 1.1 agc at current depth. */ 504 1.1 agc if (cost > reach_p->params.max_cost 505 1.1 agc || (reach_p->costs[depth][TRE_M_NUM_DEL] + 1 506 1.1 agc > reach_p->params.max_del) 507 1.1 agc || (reach_p->costs[depth][TRE_M_NUM_ERR] + 1 508 1.1 agc > reach_p->params.max_err)) 509 1.1 agc { 510 1.1 agc /* Too many errors or cost too large. */ 511 1.1 agc DPRINT((" delete: from %03d: cost too large\n", id)); 512 1.1 agc deque_start++; 513 1.1 agc if (deque_start >= (ringbuffer + 512)) 514 1.1 agc deque_start = ringbuffer; 515 1.1 agc continue; 516 1.1 agc } 517 1.1 agc 518 1.1 agc /* Compute overall cost. */ 519 1.1 agc cost0 = cost; 520 1.1 agc if (depth > 0) 521 1.1 agc { 522 1.1 agc cost0 = reach_p->costs[0][TRE_M_COST]; 523 1.1 agc if (reach_p->params.cost_del != TRE_PARAM_UNSET) 524 1.1 agc cost0 += reach_p->params.cost_del; 525 1.1 agc else 526 1.1 agc cost0 += default_params.cost_del; 527 1.1 agc } 528 1.1 agc 529 1.1 agc for (trans = reach_p->state; trans->state; trans++) 530 1.1 agc { 531 1.1 agc int dest_id = trans->state_id; 532 1.1 agc DPRINT((" delete: from %03d to %03d, cost %d (%d): ", 533 1.1 agc id, dest_id, cost0, reach_p->params.max_cost)); 534 1.1 agc 535 1.1 agc if (trans->assertions && CHECK_ASSERTIONS(trans->assertions)) 536 1.1 agc { 537 1.1 agc DPRINT(("assertion failed\n")); 538 1.1 agc continue; 539 1.1 agc } 540 1.1 agc 541 1.1 agc /* Compute tag values after this transition. */ 542 1.1 agc for (i = 0; i < num_tags; i++) 543 1.1 agc tmp_tags[i] = reach_p->tags[i]; 544 1.1 agc if (trans->tags) 545 1.1 agc for (i = 0; trans->tags[i] >= 0; i++) 546 1.2 christos if ((size_t)trans->tags[i] < num_tags) 547 1.1 agc tmp_tags[trans->tags[i]] = pos; 548 1.1 agc 549 1.1 agc /* If another path has also reached this state, choose the one 550 1.1 agc with the smallest cost or best tags if costs are equal. */ 551 1.1 agc if (reach_next[dest_id].pos == pos 552 1.1 agc && (cost0 > reach_next[dest_id].costs[0][TRE_M_COST] 553 1.1 agc || (cost0 == reach_next[dest_id].costs[0][TRE_M_COST] 554 1.1 agc && (!match_tags 555 1.1 agc || !tre_tag_order(num_tags, 556 1.1 agc tnfa->tag_directions, 557 1.1 agc tmp_tags, 558 1.1 agc reach_next[dest_id].tags))))) 559 1.1 agc { 560 1.1 agc DPRINT(("lose, cost0 %d, have %d\n", 561 1.1 agc cost0, reach_next[dest_id].costs[0][TRE_M_COST])); 562 1.1 agc continue; 563 1.1 agc } 564 1.1 agc DPRINT(("win\n")); 565 1.1 agc 566 1.1 agc /* Set state, position, tags, parameters, depth, and costs. */ 567 1.1 agc reach_next[dest_id].state = trans->state; 568 1.1 agc reach_next[dest_id].pos = pos; 569 1.1 agc for (i = 0; i < num_tags; i++) 570 1.1 agc reach_next[dest_id].tags[i] = tmp_tags[i]; 571 1.1 agc 572 1.1 agc reach_next[dest_id].params = reach_p->params; 573 1.1 agc if (trans->params) 574 1.1 agc tre_set_params(&reach_next[dest_id], trans->params, 575 1.1 agc default_params); 576 1.1 agc 577 1.1 agc reach_next[dest_id].depth = reach_p->depth; 578 1.1 agc memcpy(&reach_next[dest_id].costs, 579 1.1 agc reach_p->costs, 580 1.1 agc sizeof(reach_p->costs[0][0]) 581 1.1 agc * TRE_M_LAST * (depth + 1)); 582 1.1 agc reach_next[dest_id].costs[depth][TRE_M_COST] = cost; 583 1.1 agc reach_next[dest_id].costs[depth][TRE_M_NUM_DEL]++; 584 1.1 agc reach_next[dest_id].costs[depth][TRE_M_NUM_ERR]++; 585 1.1 agc if (depth > 0) 586 1.1 agc { 587 1.1 agc reach_next[dest_id].costs[0][TRE_M_COST] = cost0; 588 1.1 agc reach_next[dest_id].costs[0][TRE_M_NUM_DEL]++; 589 1.1 agc reach_next[dest_id].costs[0][TRE_M_NUM_ERR]++; 590 1.1 agc } 591 1.1 agc 592 1.1 agc if (trans->state == tnfa->final 593 1.1 agc && (match_eo < 0 594 1.1 agc || match_costs[TRE_M_COST] > cost0 595 1.1 agc || (match_costs[TRE_M_COST] == cost0 596 1.1 agc && (num_tags > 0 597 1.1 agc && tmp_tags[0] <= match_tags[0])))) 598 1.1 agc { 599 1.1 agc DPRINT((" setting new match at %d, cost %d\n", 600 1.1 agc pos, cost0)); 601 1.1 agc match_eo = pos; 602 1.1 agc memcpy(match_costs, reach_next[dest_id].costs[0], 603 1.1 agc sizeof(match_costs[0]) * TRE_M_LAST); 604 1.1 agc for (i = 0; i < num_tags; i++) 605 1.1 agc match_tags[i] = tmp_tags[i]; 606 1.1 agc } 607 1.1 agc 608 1.1 agc /* Add to the end of the deque. */ 609 1.1 agc *deque_end = &reach_next[dest_id]; 610 1.1 agc deque_end++; 611 1.1 agc if (deque_end >= (ringbuffer + 512)) 612 1.1 agc deque_end = ringbuffer; 613 1.1 agc assert(deque_end != deque_start); 614 1.1 agc } 615 1.1 agc deque_start++; 616 1.1 agc if (deque_start >= (ringbuffer + 512)) 617 1.1 agc deque_start = ringbuffer; 618 1.1 agc } 619 1.1 agc 620 1.1 agc } 621 1.1 agc 622 1.1 agc #ifdef TRE_DEBUG 623 1.1 agc tre_print_reach(tnfa, reach_next, pos, num_tags); 624 1.1 agc #endif /* TRE_DEBUG */ 625 1.1 agc 626 1.1 agc /* Check for end of string. */ 627 1.1 agc if (len < 0) 628 1.1 agc { 629 1.1 agc if (type == STR_USER) 630 1.1 agc { 631 1.1 agc if (str_user_end) 632 1.1 agc break; 633 1.1 agc } 634 1.1 agc else if (next_c == L'\0') 635 1.1 agc break; 636 1.1 agc } 637 1.1 agc else 638 1.1 agc { 639 1.1 agc if (pos >= len) 640 1.1 agc break; 641 1.1 agc } 642 1.1 agc 643 1.1 agc prev_pos = pos; 644 1.1 agc GET_NEXT_WCHAR(); 645 1.1 agc 646 1.1 agc /* Swap `reach' and `reach_next'. */ 647 1.1 agc { 648 1.1 agc tre_tnfa_approx_reach_t *tmp; 649 1.1 agc tmp = reach; 650 1.1 agc reach = reach_next; 651 1.1 agc reach_next = tmp; 652 1.1 agc } 653 1.1 agc 654 1.1 agc /* Handle exact matches and substitutions. */ 655 1.1 agc for (id = 0; id < tnfa->num_states; id++) 656 1.1 agc { 657 1.1 agc tre_tnfa_transition_t *trans; 658 1.1 agc 659 1.1 agc if (reach[id].pos < prev_pos) 660 1.1 agc continue; /* Not reached. */ 661 1.1 agc for (trans = reach[id].state; trans->state; trans++) 662 1.1 agc { 663 1.1 agc int dest_id; 664 1.1 agc int depth; 665 1.1 agc int cost, cost0, err; 666 1.1 agc 667 1.1 agc if (trans->assertions 668 1.1 agc && (CHECK_ASSERTIONS(trans->assertions) 669 1.1 agc || CHECK_CHAR_CLASSES(trans, tnfa, eflags))) 670 1.1 agc { 671 1.1 agc DPRINT((" exact, from %d: assert failed\n", id)); 672 1.1 agc continue; 673 1.1 agc } 674 1.1 agc 675 1.1 agc depth = reach[id].depth; 676 1.1 agc dest_id = trans->state_id; 677 1.1 agc 678 1.1 agc cost = reach[id].costs[depth][TRE_M_COST]; 679 1.1 agc cost0 = reach[id].costs[0][TRE_M_COST]; 680 1.1 agc err = 0; 681 1.1 agc 682 1.1 agc if (trans->code_min > (tre_cint_t)prev_c 683 1.1 agc || trans->code_max < (tre_cint_t)prev_c) 684 1.1 agc { 685 1.1 agc /* Handle substitutes. The required character was not in 686 1.1 agc the string, so match it in place of whatever was supposed 687 1.1 agc to be there and increase costs accordingly. */ 688 1.1 agc err = 1; 689 1.1 agc 690 1.1 agc /* Compute and check cost at current depth. */ 691 1.1 agc cost = reach[id].costs[depth][TRE_M_COST]; 692 1.1 agc if (reach[id].params.cost_subst != TRE_PARAM_UNSET) 693 1.1 agc cost += reach[id].params.cost_subst; 694 1.1 agc if (cost > reach[id].params.max_cost) 695 1.1 agc continue; /* Cost too large. */ 696 1.1 agc 697 1.1 agc /* Check number of substitutes at current depth. */ 698 1.1 agc if (reach[id].costs[depth][TRE_M_NUM_SUBST] + 1 699 1.1 agc > reach[id].params.max_subst) 700 1.1 agc continue; /* Too many substitutes. */ 701 1.1 agc 702 1.1 agc /* Check total number of errors at current depth. */ 703 1.1 agc if (reach[id].costs[depth][TRE_M_NUM_ERR] + 1 704 1.1 agc > reach[id].params.max_err) 705 1.1 agc continue; /* Too many errors. */ 706 1.1 agc 707 1.1 agc /* Compute overall cost. */ 708 1.1 agc cost0 = cost; 709 1.1 agc if (depth > 0) 710 1.1 agc { 711 1.1 agc cost0 = reach[id].costs[0][TRE_M_COST]; 712 1.1 agc if (reach[id].params.cost_subst != TRE_PARAM_UNSET) 713 1.1 agc cost0 += reach[id].params.cost_subst; 714 1.1 agc else 715 1.1 agc cost0 += default_params.cost_subst; 716 1.1 agc } 717 1.1 agc DPRINT((" subst, from %03d to %03d, cost %d: ", 718 1.1 agc id, dest_id, cost0)); 719 1.1 agc } 720 1.1 agc else 721 1.1 agc DPRINT((" exact, from %03d to %03d, cost %d: ", 722 1.1 agc id, dest_id, cost0)); 723 1.1 agc 724 1.1 agc /* Compute tag values after this transition. */ 725 1.1 agc for (i = 0; i < num_tags; i++) 726 1.1 agc tmp_tags[i] = reach[id].tags[i]; 727 1.1 agc if (trans->tags) 728 1.1 agc for (i = 0; trans->tags[i] >= 0; i++) 729 1.2 christos if ((size_t)trans->tags[i] < num_tags) 730 1.1 agc tmp_tags[trans->tags[i]] = pos; 731 1.1 agc 732 1.1 agc /* If another path has also reached this state, choose the 733 1.1 agc one with the smallest cost or best tags if costs are equal. */ 734 1.1 agc if (reach_next[dest_id].pos == pos 735 1.1 agc && (cost0 > reach_next[dest_id].costs[0][TRE_M_COST] 736 1.1 agc || (cost0 == reach_next[dest_id].costs[0][TRE_M_COST] 737 1.1 agc && !tre_tag_order(num_tags, tnfa->tag_directions, 738 1.1 agc tmp_tags, 739 1.1 agc reach_next[dest_id].tags)))) 740 1.1 agc { 741 1.1 agc DPRINT(("lose\n")); 742 1.1 agc continue; 743 1.1 agc } 744 1.1 agc DPRINT(("win %d %d\n", 745 1.1 agc reach_next[dest_id].pos, 746 1.1 agc reach_next[dest_id].costs[0][TRE_M_COST])); 747 1.1 agc 748 1.1 agc /* Set state, position, tags, and depth. */ 749 1.1 agc reach_next[dest_id].state = trans->state; 750 1.1 agc reach_next[dest_id].pos = pos; 751 1.1 agc for (i = 0; i < num_tags; i++) 752 1.1 agc reach_next[dest_id].tags[i] = tmp_tags[i]; 753 1.1 agc reach_next[dest_id].depth = reach[id].depth; 754 1.1 agc 755 1.1 agc /* Set parameters. */ 756 1.1 agc reach_next[dest_id].params = reach[id].params; 757 1.1 agc if (trans->params) 758 1.1 agc tre_set_params(&reach_next[dest_id], trans->params, 759 1.1 agc default_params); 760 1.1 agc 761 1.1 agc /* Set the costs after this transition. */ 762 1.7 joerg memcpy(&reach_next[dest_id].costs, 763 1.7 joerg reach[id].costs, 764 1.7 joerg sizeof(reach[id].costs[0][0]) 765 1.7 joerg * TRE_M_LAST * (depth + 1)); 766 1.1 agc reach_next[dest_id].costs[depth][TRE_M_COST] = cost; 767 1.1 agc reach_next[dest_id].costs[depth][TRE_M_NUM_SUBST] += err; 768 1.1 agc reach_next[dest_id].costs[depth][TRE_M_NUM_ERR] += err; 769 1.1 agc if (depth > 0) 770 1.1 agc { 771 1.1 agc reach_next[dest_id].costs[0][TRE_M_COST] = cost0; 772 1.1 agc reach_next[dest_id].costs[0][TRE_M_NUM_SUBST] += err; 773 1.1 agc reach_next[dest_id].costs[0][TRE_M_NUM_ERR] += err; 774 1.1 agc } 775 1.1 agc 776 1.1 agc if (trans->state == tnfa->final 777 1.1 agc && (match_eo < 0 778 1.1 agc || cost0 < match_costs[TRE_M_COST] 779 1.1 agc || (cost0 == match_costs[TRE_M_COST] 780 1.1 agc && num_tags > 0 && tmp_tags[0] <= match_tags[0]))) 781 1.1 agc { 782 1.1 agc DPRINT((" setting new match at %d, cost %d\n", 783 1.1 agc pos, cost0)); 784 1.1 agc match_eo = pos; 785 1.1 agc for (i = 0; i < TRE_M_LAST; i++) 786 1.1 agc match_costs[i] = reach_next[dest_id].costs[0][i]; 787 1.1 agc for (i = 0; i < num_tags; i++) 788 1.1 agc match_tags[i] = tmp_tags[i]; 789 1.1 agc } 790 1.1 agc } 791 1.1 agc } 792 1.1 agc } 793 1.1 agc 794 1.1 agc DPRINT(("match end offset = %d, match cost = %d\n", match_eo, 795 1.1 agc match_costs[TRE_M_COST])); 796 1.1 agc 797 1.1 agc match->cost = match_costs[TRE_M_COST]; 798 1.1 agc match->num_ins = match_costs[TRE_M_NUM_INS]; 799 1.1 agc match->num_del = match_costs[TRE_M_NUM_DEL]; 800 1.1 agc match->num_subst = match_costs[TRE_M_NUM_SUBST]; 801 1.1 agc *match_end_ofs = match_eo; 802 1.1 agc 803 1.4 rin ret = match_eo >= 0 ? REG_OK : REG_NOMATCH; 804 1.4 rin error_exit: 805 1.4 rin #ifndef TRE_USE_ALLOCA 806 1.4 rin if (buf) 807 1.4 rin xfree(buf); 808 1.4 rin #endif /* !TRE_USE_ALLOCA */ 809 1.4 rin return ret; 810 1.1 agc } 811