Home | History | Annotate | Line # | Download | only in lib
tre-match-approx.c revision 1.4
      1  1.1       agc /*
      2  1.1       agc   tre-match-approx.c - TRE approximate regex matching engine
      3  1.1       agc 
      4  1.1       agc   This software is released under a BSD-style license.
      5  1.1       agc   See the file LICENSE for details and copyright.
      6  1.1       agc 
      7  1.1       agc */
      8  1.1       agc #ifdef HAVE_CONFIG_H
      9  1.1       agc #include <config.h>
     10  1.1       agc #endif /* HAVE_CONFIG_H */
     11  1.1       agc 
     12  1.2  christos #include "tre-alloca.h"
     13  1.1       agc 
     14  1.1       agc #define __USE_STRING_INLINES
     15  1.1       agc #undef __NO_INLINE__
     16  1.1       agc 
     17  1.1       agc #include <assert.h>
     18  1.1       agc #include <stdlib.h>
     19  1.1       agc #include <string.h>
     20  1.1       agc #include <limits.h>
     21  1.1       agc #ifdef HAVE_WCHAR_H
     22  1.1       agc #include <wchar.h>
     23  1.1       agc #endif /* HAVE_WCHAR_H */
     24  1.1       agc #ifdef HAVE_WCTYPE_H
     25  1.1       agc #include <wctype.h>
     26  1.1       agc #endif /* HAVE_WCTYPE_H */
     27  1.1       agc #ifndef TRE_WCHAR
     28  1.1       agc #include <ctype.h>
     29  1.1       agc #endif /* !TRE_WCHAR */
     30  1.1       agc #ifdef HAVE_MALLOC_H
     31  1.1       agc #include <malloc.h>
     32  1.1       agc #endif /* HAVE_MALLOC_H */
     33  1.1       agc 
     34  1.1       agc #include "tre-internal.h"
     35  1.1       agc #include "tre-match-utils.h"
     36  1.1       agc #include "tre.h"
     37  1.1       agc #include "xmalloc.h"
     38  1.1       agc 
     39  1.1       agc #define TRE_M_COST	0
     40  1.1       agc #define TRE_M_NUM_INS	1
     41  1.1       agc #define TRE_M_NUM_DEL	2
     42  1.1       agc #define TRE_M_NUM_SUBST 3
     43  1.1       agc #define TRE_M_NUM_ERR	4
     44  1.1       agc #define TRE_M_LAST	5
     45  1.1       agc 
     46  1.1       agc #define TRE_M_MAX_DEPTH 3
     47  1.1       agc 
     48  1.1       agc typedef struct {
     49  1.1       agc   /* State in the TNFA transition table. */
     50  1.1       agc   tre_tnfa_transition_t *state;
     51  1.1       agc   /* Position in input string. */
     52  1.1       agc   int pos;
     53  1.1       agc   /* Tag values. */
     54  1.1       agc   int *tags;
     55  1.1       agc   /* Matching parameters. */
     56  1.1       agc   regaparams_t params;
     57  1.1       agc   /* Nesting depth of parameters.  This is used as an index in
     58  1.1       agc      the `costs' array. */
     59  1.1       agc   int depth;
     60  1.1       agc   /* Costs and counter values for different parameter nesting depths. */
     61  1.1       agc   int costs[TRE_M_MAX_DEPTH + 1][TRE_M_LAST];
     62  1.1       agc } tre_tnfa_approx_reach_t;
     63  1.1       agc 
     64  1.1       agc 
     65  1.1       agc #ifdef TRE_DEBUG
     66  1.1       agc /* Prints the `reach' array in a readable fashion with DPRINT. */
     67  1.1       agc static void
     68  1.1       agc tre_print_reach(const tre_tnfa_t *tnfa, tre_tnfa_approx_reach_t *reach,
     69  1.2  christos 		int pos, size_t num_tags)
     70  1.1       agc {
     71  1.1       agc   int id;
     72  1.1       agc 
     73  1.1       agc   /* Print each state on one line. */
     74  1.1       agc   DPRINT(("  reach:\n"));
     75  1.1       agc   for (id = 0; id < tnfa->num_states; id++)
     76  1.1       agc     {
     77  1.1       agc       int i, j;
     78  1.1       agc       if (reach[id].pos < pos)
     79  1.1       agc 	continue;  /* Not reached. */
     80  1.1       agc       DPRINT(("	 %03d, costs ", id));
     81  1.1       agc       for (i = 0; i <= reach[id].depth; i++)
     82  1.1       agc 	{
     83  1.1       agc 	  DPRINT(("["));
     84  1.1       agc 	  for (j = 0; j < TRE_M_LAST; j++)
     85  1.1       agc 	    {
     86  1.1       agc 	      DPRINT(("%2d", reach[id].costs[i][j]));
     87  1.1       agc 	      if (j + 1 < TRE_M_LAST)
     88  1.1       agc 		DPRINT((","));
     89  1.1       agc 	    }
     90  1.1       agc 	  DPRINT(("]"));
     91  1.1       agc 	  if (i + 1 <= reach[id].depth)
     92  1.1       agc 	    DPRINT((", "));
     93  1.1       agc 	}
     94  1.1       agc       DPRINT(("\n	tags "));
     95  1.1       agc       for (i = 0; i < num_tags; i++)
     96  1.1       agc 	{
     97  1.1       agc 	  DPRINT(("%02d", reach[id].tags[i]));
     98  1.1       agc 	  if (i + 1 < num_tags)
     99  1.1       agc 	    DPRINT((","));
    100  1.1       agc 	}
    101  1.1       agc       DPRINT(("\n"));
    102  1.1       agc     }
    103  1.1       agc   DPRINT(("\n"));
    104  1.1       agc }
    105  1.1       agc #endif /* TRE_DEBUG */
    106  1.1       agc 
    107  1.1       agc 
    108  1.1       agc /* Sets the matching parameters in `reach' to the ones defined in the `pa'
    109  1.1       agc    array.  If `pa' specifies default values, they are taken from
    110  1.1       agc    `default_params'. */
    111  1.1       agc inline static void
    112  1.1       agc tre_set_params(tre_tnfa_approx_reach_t *reach,
    113  1.1       agc 	       int *pa, regaparams_t default_params)
    114  1.1       agc {
    115  1.1       agc   int value;
    116  1.1       agc 
    117  1.1       agc   /* If depth is increased reset costs and counters to zero for the
    118  1.1       agc      new levels. */
    119  1.1       agc   value = pa[TRE_PARAM_DEPTH];
    120  1.1       agc   assert(value <= TRE_M_MAX_DEPTH);
    121  1.1       agc   if (value > reach->depth)
    122  1.1       agc     {
    123  1.1       agc       int i, j;
    124  1.1       agc       for (i = reach->depth + 1; i <= value; i++)
    125  1.1       agc 	for (j = 0; j < TRE_M_LAST; j++)
    126  1.1       agc 	  reach->costs[i][j] = 0;
    127  1.1       agc     }
    128  1.1       agc   reach->depth = value;
    129  1.1       agc 
    130  1.1       agc   /* Set insert cost. */
    131  1.1       agc   value = pa[TRE_PARAM_COST_INS];
    132  1.1       agc   if (value == TRE_PARAM_DEFAULT)
    133  1.1       agc     reach->params.cost_ins = default_params.cost_ins;
    134  1.1       agc   else if (value != TRE_PARAM_UNSET)
    135  1.1       agc     reach->params.cost_ins = value;
    136  1.1       agc 
    137  1.1       agc   /* Set delete cost. */
    138  1.1       agc   value = pa[TRE_PARAM_COST_DEL];
    139  1.1       agc   if (value == TRE_PARAM_DEFAULT)
    140  1.1       agc     reach->params.cost_del = default_params.cost_del;
    141  1.1       agc   else if (value != TRE_PARAM_UNSET)
    142  1.1       agc     reach->params.cost_del = value;
    143  1.1       agc 
    144  1.1       agc   /* Set substitute cost. */
    145  1.1       agc   value = pa[TRE_PARAM_COST_SUBST];
    146  1.1       agc   if (value == TRE_PARAM_DEFAULT)
    147  1.1       agc     reach->params.cost_subst = default_params.cost_subst;
    148  1.1       agc   else
    149  1.1       agc     reach->params.cost_subst = value;
    150  1.1       agc 
    151  1.1       agc   /* Set maximum cost. */
    152  1.1       agc   value = pa[TRE_PARAM_COST_MAX];
    153  1.1       agc   if (value == TRE_PARAM_DEFAULT)
    154  1.1       agc     reach->params.max_cost = default_params.max_cost;
    155  1.1       agc   else if (value != TRE_PARAM_UNSET)
    156  1.1       agc     reach->params.max_cost = value;
    157  1.1       agc 
    158  1.1       agc   /* Set maximum inserts. */
    159  1.1       agc   value = pa[TRE_PARAM_MAX_INS];
    160  1.1       agc   if (value == TRE_PARAM_DEFAULT)
    161  1.1       agc     reach->params.max_ins = default_params.max_ins;
    162  1.1       agc   else if (value != TRE_PARAM_UNSET)
    163  1.1       agc     reach->params.max_ins = value;
    164  1.1       agc 
    165  1.1       agc   /* Set maximum deletes. */
    166  1.1       agc   value = pa[TRE_PARAM_MAX_DEL];
    167  1.1       agc   if (value == TRE_PARAM_DEFAULT)
    168  1.1       agc     reach->params.max_del = default_params.max_del;
    169  1.1       agc   else if (value != TRE_PARAM_UNSET)
    170  1.1       agc     reach->params.max_del = value;
    171  1.1       agc 
    172  1.1       agc   /* Set maximum substitutes. */
    173  1.1       agc   value = pa[TRE_PARAM_MAX_SUBST];
    174  1.1       agc   if (value == TRE_PARAM_DEFAULT)
    175  1.1       agc     reach->params.max_subst = default_params.max_subst;
    176  1.1       agc   else if (value != TRE_PARAM_UNSET)
    177  1.1       agc     reach->params.max_subst = value;
    178  1.1       agc 
    179  1.1       agc   /* Set maximum number of errors. */
    180  1.1       agc   value = pa[TRE_PARAM_MAX_ERR];
    181  1.1       agc   if (value == TRE_PARAM_DEFAULT)
    182  1.1       agc     reach->params.max_err = default_params.max_err;
    183  1.1       agc   else if (value != TRE_PARAM_UNSET)
    184  1.1       agc     reach->params.max_err = value;
    185  1.1       agc }
    186  1.1       agc 
    187  1.1       agc reg_errcode_t
    188  1.1       agc tre_tnfa_run_approx(const tre_tnfa_t *tnfa, const void *string, int len,
    189  1.1       agc 		    tre_str_type_t type, int *match_tags,
    190  1.1       agc 		    regamatch_t *match, regaparams_t default_params,
    191  1.1       agc 		    int eflags, int *match_end_ofs)
    192  1.1       agc {
    193  1.1       agc   /* State variables required by GET_NEXT_WCHAR. */
    194  1.1       agc   tre_char_t prev_c = 0, next_c = 0;
    195  1.1       agc   const char *str_byte = string;
    196  1.1       agc   int pos = -1;
    197  1.1       agc   unsigned int pos_add_next = 1;
    198  1.1       agc #ifdef TRE_WCHAR
    199  1.1       agc   const wchar_t *str_wide = string;
    200  1.1       agc #ifdef TRE_MBSTATE
    201  1.1       agc   mbstate_t mbstate;
    202  1.1       agc #endif /* !TRE_WCHAR */
    203  1.1       agc #endif /* TRE_WCHAR */
    204  1.1       agc   int reg_notbol = eflags & REG_NOTBOL;
    205  1.1       agc   int reg_noteol = eflags & REG_NOTEOL;
    206  1.1       agc   int reg_newline = tnfa->cflags & REG_NEWLINE;
    207  1.2  christos   size_t str_user_end = 0;
    208  1.1       agc 
    209  1.1       agc   int prev_pos;
    210  1.1       agc 
    211  1.1       agc   /* Number of tags. */
    212  1.2  christos   size_t num_tags;
    213  1.1       agc   /* The reach tables. */
    214  1.1       agc   tre_tnfa_approx_reach_t *reach, *reach_next;
    215  1.1       agc   /* Tag array for temporary use. */
    216  1.1       agc   int *tmp_tags;
    217  1.1       agc 
    218  1.1       agc   /* End offset of best match so far, or -1 if no match found yet. */
    219  1.1       agc   int match_eo = -1;
    220  1.1       agc   /* Costs of the match. */
    221  1.1       agc   int match_costs[TRE_M_LAST];
    222  1.1       agc 
    223  1.1       agc   /* Space for temporary data required for matching. */
    224  1.1       agc   unsigned char *buf;
    225  1.1       agc 
    226  1.2  christos   size_t i, id;
    227  1.1       agc 
    228  1.4       rin   reg_errcode_t ret;
    229  1.4       rin 
    230  1.1       agc   if (!match_tags)
    231  1.1       agc     num_tags = 0;
    232  1.1       agc   else
    233  1.1       agc     num_tags = tnfa->num_tags;
    234  1.1       agc 
    235  1.1       agc #ifdef TRE_MBSTATE
    236  1.1       agc   memset(&mbstate, '\0', sizeof(mbstate));
    237  1.1       agc #endif /* TRE_MBSTATE */
    238  1.1       agc 
    239  1.1       agc   DPRINT(("tre_tnfa_run_approx, input type %d, len %d, eflags %d, "
    240  1.1       agc 	  "match_tags %p\n",
    241  1.1       agc 	  type, len, eflags,
    242  1.1       agc 	  match_tags));
    243  1.1       agc   DPRINT(("max cost %d, ins %d, del %d, subst %d\n",
    244  1.1       agc 	  default_params.max_cost,
    245  1.1       agc 	  default_params.cost_ins,
    246  1.1       agc 	  default_params.cost_del,
    247  1.1       agc 	  default_params.cost_subst));
    248  1.1       agc 
    249  1.1       agc   /* Allocate memory for temporary data required for matching.	This needs to
    250  1.1       agc      be done for every matching operation to be thread safe.  This allocates
    251  1.1       agc      everything in a single large block from the stack frame using alloca()
    252  1.1       agc      or with malloc() if alloca is unavailable. */
    253  1.1       agc   {
    254  1.1       agc     unsigned char *buf_cursor;
    255  1.1       agc     /* Space needed for one array of tags. */
    256  1.2  christos     size_t tag_bytes = sizeof(*tmp_tags) * num_tags;
    257  1.1       agc     /* Space needed for one reach table. */
    258  1.2  christos     size_t reach_bytes = sizeof(*reach_next) * tnfa->num_states;
    259  1.1       agc     /* Total space needed. */
    260  1.2  christos     size_t total_bytes = reach_bytes * 2 + (tnfa->num_states * 2 + 1 ) * tag_bytes;
    261  1.1       agc     /* Add some extra to make sure we can align the pointers.  The multiplier
    262  1.1       agc        used here must be equal to the number of ALIGN calls below. */
    263  1.1       agc     total_bytes += (sizeof(long) - 1) * 3;
    264  1.1       agc 
    265  1.1       agc     /* Allocate the memory. */
    266  1.1       agc #ifdef TRE_USE_ALLOCA
    267  1.1       agc     buf = alloca(total_bytes);
    268  1.1       agc #else /* !TRE_USE_ALLOCA */
    269  1.1       agc     buf = xmalloc((unsigned)total_bytes);
    270  1.1       agc #endif /* !TRE_USE_ALLOCA */
    271  1.1       agc     if (!buf)
    272  1.1       agc       return REG_ESPACE;
    273  1.1       agc     memset(buf, 0, (size_t)total_bytes);
    274  1.1       agc 
    275  1.1       agc     /* Allocate `tmp_tags' from `buf'. */
    276  1.1       agc     tmp_tags = (void *)buf;
    277  1.1       agc     buf_cursor = buf + tag_bytes;
    278  1.1       agc     buf_cursor += ALIGN(buf_cursor, long);
    279  1.1       agc 
    280  1.1       agc     /* Allocate `reach' from `buf'. */
    281  1.1       agc     reach = (void *)buf_cursor;
    282  1.1       agc     buf_cursor += reach_bytes;
    283  1.1       agc     buf_cursor += ALIGN(buf_cursor, long);
    284  1.1       agc 
    285  1.1       agc     /* Allocate `reach_next' from `buf'. */
    286  1.1       agc     reach_next = (void *)buf_cursor;
    287  1.1       agc     buf_cursor += reach_bytes;
    288  1.1       agc     buf_cursor += ALIGN(buf_cursor, long);
    289  1.1       agc 
    290  1.1       agc     /* Allocate tag arrays for `reach' and `reach_next' from `buf'. */
    291  1.1       agc     for (i = 0; i < tnfa->num_states; i++)
    292  1.1       agc       {
    293  1.1       agc 	reach[i].tags = (void *)buf_cursor;
    294  1.1       agc 	buf_cursor += tag_bytes;
    295  1.1       agc 	reach_next[i].tags = (void *)buf_cursor;
    296  1.1       agc 	buf_cursor += tag_bytes;
    297  1.1       agc       }
    298  1.1       agc     assert(buf_cursor <= buf + total_bytes);
    299  1.1       agc   }
    300  1.1       agc 
    301  1.1       agc   for (i = 0; i < TRE_M_LAST; i++)
    302  1.1       agc     match_costs[i] = INT_MAX;
    303  1.1       agc 
    304  1.1       agc   /* Mark the reach arrays empty. */
    305  1.1       agc   for (i = 0; i < tnfa->num_states; i++)
    306  1.1       agc     reach[i].pos = reach_next[i].pos = -2;
    307  1.1       agc 
    308  1.1       agc   prev_pos = pos;
    309  1.1       agc   GET_NEXT_WCHAR();
    310  1.1       agc   pos = 0;
    311  1.1       agc 
    312  1.3       rin   while (/*CONSTCOND*/(void)1,1)
    313  1.1       agc     {
    314  1.1       agc       DPRINT(("%03d:%2lc/%05d\n", pos, (tre_cint_t)next_c, (int)next_c));
    315  1.1       agc 
    316  1.1       agc       /* Add initial states to `reach_next' if an exact match has not yet
    317  1.1       agc 	 been found. */
    318  1.1       agc       if (match_costs[TRE_M_COST] > 0)
    319  1.1       agc 	{
    320  1.1       agc 	  tre_tnfa_transition_t *trans;
    321  1.1       agc 	  DPRINT(("  init"));
    322  1.1       agc 	  for (trans = tnfa->initial; trans->state; trans++)
    323  1.1       agc 	    {
    324  1.1       agc 	      int stateid = trans->state_id;
    325  1.1       agc 
    326  1.1       agc 	      /* If this state is not currently in `reach_next', add it
    327  1.1       agc 		 there. */
    328  1.1       agc 	      if (reach_next[stateid].pos < pos)
    329  1.1       agc 		{
    330  1.1       agc 		  if (trans->assertions && CHECK_ASSERTIONS(trans->assertions))
    331  1.1       agc 		    {
    332  1.1       agc 		      /* Assertions failed, don't add this state. */
    333  1.1       agc 		      DPRINT((" !%d (assert)", stateid));
    334  1.1       agc 		      continue;
    335  1.1       agc 		    }
    336  1.1       agc 		  DPRINT((" %d", stateid));
    337  1.1       agc 		  reach_next[stateid].state = trans->state;
    338  1.1       agc 		  reach_next[stateid].pos = pos;
    339  1.1       agc 
    340  1.1       agc 		  /* Compute tag values after this transition. */
    341  1.1       agc 		  for (i = 0; i < num_tags; i++)
    342  1.1       agc 		    reach_next[stateid].tags[i] = -1;
    343  1.1       agc 
    344  1.1       agc 		  if (trans->tags)
    345  1.1       agc 		    for (i = 0; trans->tags[i] >= 0; i++)
    346  1.2  christos 		      if ((size_t)trans->tags[i] < num_tags)
    347  1.1       agc 			reach_next[stateid].tags[trans->tags[i]] = pos;
    348  1.1       agc 
    349  1.1       agc 		  /* Set the parameters, depth, and costs. */
    350  1.1       agc 		  reach_next[stateid].params = default_params;
    351  1.1       agc 		  reach_next[stateid].depth = 0;
    352  1.1       agc 		  for (i = 0; i < TRE_M_LAST; i++)
    353  1.1       agc 		    reach_next[stateid].costs[0][i] = 0;
    354  1.1       agc 		  if (trans->params)
    355  1.1       agc 		    tre_set_params(&reach_next[stateid], trans->params,
    356  1.1       agc 				   default_params);
    357  1.1       agc 
    358  1.1       agc 		  /* If this is the final state, mark the exact match. */
    359  1.1       agc 		  if (trans->state == tnfa->final)
    360  1.1       agc 		    {
    361  1.1       agc 		      match_eo = pos;
    362  1.1       agc 		      for (i = 0; i < num_tags; i++)
    363  1.1       agc 			match_tags[i] = reach_next[stateid].tags[i];
    364  1.1       agc 		      for (i = 0; i < TRE_M_LAST; i++)
    365  1.1       agc 			match_costs[i] = 0;
    366  1.1       agc 		    }
    367  1.1       agc 		}
    368  1.1       agc 	    }
    369  1.1       agc 	    DPRINT(("\n"));
    370  1.1       agc 	}
    371  1.1       agc 
    372  1.1       agc 
    373  1.1       agc       /* Handle inserts.  This is done by pretending there's an epsilon
    374  1.1       agc 	 transition from each state in `reach' back to the same state.
    375  1.1       agc 	 We don't need to worry about the final state here; this will never
    376  1.1       agc 	 give a better match than what we already have. */
    377  1.1       agc       for (id = 0; id < tnfa->num_states; id++)
    378  1.1       agc 	{
    379  1.1       agc 	  int depth;
    380  1.1       agc 	  int cost, cost0;
    381  1.1       agc 
    382  1.1       agc 	  if (reach[id].pos != prev_pos)
    383  1.1       agc 	    {
    384  1.1       agc 	      DPRINT(("	 insert: %d not reached\n", id));
    385  1.1       agc 	      continue;	 /* Not reached. */
    386  1.1       agc 	    }
    387  1.1       agc 
    388  1.1       agc 	  depth = reach[id].depth;
    389  1.1       agc 
    390  1.1       agc 	  /* Compute and check cost at current depth. */
    391  1.1       agc 	  cost = reach[id].costs[depth][TRE_M_COST];
    392  1.1       agc 	  if (reach[id].params.cost_ins != TRE_PARAM_UNSET)
    393  1.1       agc 	    cost += reach[id].params.cost_ins;
    394  1.1       agc 	  if (cost > reach[id].params.max_cost)
    395  1.1       agc 	    continue;  /* Cost too large. */
    396  1.1       agc 
    397  1.1       agc 	  /* Check number of inserts at current depth. */
    398  1.1       agc 	  if (reach[id].costs[depth][TRE_M_NUM_INS] + 1
    399  1.1       agc 	      > reach[id].params.max_ins)
    400  1.1       agc 	    continue;  /* Too many inserts. */
    401  1.1       agc 
    402  1.1       agc 	  /* Check total number of errors at current depth. */
    403  1.1       agc 	  if (reach[id].costs[depth][TRE_M_NUM_ERR] + 1
    404  1.1       agc 	      > reach[id].params.max_err)
    405  1.1       agc 	    continue;  /* Too many errors. */
    406  1.1       agc 
    407  1.1       agc 	  /* Compute overall cost. */
    408  1.1       agc 	  cost0 = cost;
    409  1.1       agc 	  if (depth > 0)
    410  1.1       agc 	    {
    411  1.1       agc 	      cost0 = reach[id].costs[0][TRE_M_COST];
    412  1.1       agc 	      if (reach[id].params.cost_ins != TRE_PARAM_UNSET)
    413  1.1       agc 		cost0 += reach[id].params.cost_ins;
    414  1.1       agc 	      else
    415  1.1       agc 		cost0 += default_params.cost_ins;
    416  1.1       agc 	    }
    417  1.1       agc 
    418  1.1       agc 	  DPRINT(("  insert: from %d to %d, cost %d: ", id, id,
    419  1.1       agc 		  reach[id].costs[depth][TRE_M_COST]));
    420  1.1       agc 	  if (reach_next[id].pos == pos
    421  1.1       agc 	      && (cost0 >= reach_next[id].costs[0][TRE_M_COST]))
    422  1.1       agc 	    {
    423  1.1       agc 	      DPRINT(("lose\n"));
    424  1.1       agc 	      continue;
    425  1.1       agc 	    }
    426  1.1       agc 	  DPRINT(("win\n"));
    427  1.1       agc 
    428  1.1       agc 	  /* Copy state, position, tags, parameters, and depth. */
    429  1.1       agc 	  reach_next[id].state = reach[id].state;
    430  1.1       agc 	  reach_next[id].pos = pos;
    431  1.1       agc 	  for (i = 0; i < num_tags; i++)
    432  1.1       agc 	    reach_next[id].tags[i] = reach[id].tags[i];
    433  1.1       agc 	  reach_next[id].params = reach[id].params;
    434  1.1       agc 	  reach_next[id].depth = reach[id].depth;
    435  1.1       agc 
    436  1.1       agc 	  /* Set the costs after this transition. */
    437  1.1       agc 	  memcpy(reach_next[id].costs, reach[id].costs,
    438  1.1       agc 		 sizeof(reach_next[id].costs[0][0])
    439  1.1       agc 		 * TRE_M_LAST * (depth + 1));
    440  1.1       agc 	  reach_next[id].costs[depth][TRE_M_COST] = cost;
    441  1.1       agc 	  reach_next[id].costs[depth][TRE_M_NUM_INS]++;
    442  1.1       agc 	  reach_next[id].costs[depth][TRE_M_NUM_ERR]++;
    443  1.1       agc 	  if (depth > 0)
    444  1.1       agc 	    {
    445  1.1       agc 	      reach_next[id].costs[0][TRE_M_COST] = cost0;
    446  1.1       agc 	      reach_next[id].costs[0][TRE_M_NUM_INS]++;
    447  1.1       agc 	      reach_next[id].costs[0][TRE_M_NUM_ERR]++;
    448  1.1       agc 	    }
    449  1.1       agc 
    450  1.1       agc 	}
    451  1.1       agc 
    452  1.1       agc 
    453  1.1       agc       /* Handle deletes.  This is done by traversing through the whole TNFA
    454  1.1       agc 	 pretending that all transitions are epsilon transitions, until
    455  1.1       agc 	 no more states can be reached with better costs. */
    456  1.1       agc       {
    457  1.1       agc 	/* XXX - dynamic ringbuffer size */
    458  1.1       agc 	tre_tnfa_approx_reach_t *ringbuffer[512];
    459  1.1       agc 	tre_tnfa_approx_reach_t **deque_start, **deque_end;
    460  1.1       agc 
    461  1.1       agc 	deque_start = deque_end = ringbuffer;
    462  1.1       agc 
    463  1.1       agc 	/* Add all states in `reach_next' to the deque. */
    464  1.1       agc 	for (id = 0; id < tnfa->num_states; id++)
    465  1.1       agc 	  {
    466  1.1       agc 	    if (reach_next[id].pos != pos)
    467  1.1       agc 	      continue;
    468  1.1       agc 	    *deque_end = &reach_next[id];
    469  1.1       agc 	    deque_end++;
    470  1.1       agc 	    assert(deque_end != deque_start);
    471  1.1       agc 	  }
    472  1.1       agc 
    473  1.1       agc 	/* Repeat until the deque is empty. */
    474  1.1       agc 	while (deque_end != deque_start)
    475  1.1       agc 	  {
    476  1.1       agc 	    tre_tnfa_approx_reach_t *reach_p;
    477  1.1       agc 	    int depth;
    478  1.1       agc 	    int cost, cost0;
    479  1.1       agc 	    tre_tnfa_transition_t *trans;
    480  1.1       agc 
    481  1.1       agc 	    /* Pop the first item off the deque. */
    482  1.1       agc 	    reach_p = *deque_start;
    483  1.1       agc 	    id = reach_p - reach_next;
    484  1.1       agc 	    depth = reach_p->depth;
    485  1.1       agc 
    486  1.1       agc 	    /* Compute cost at current depth. */
    487  1.1       agc 	    cost = reach_p->costs[depth][TRE_M_COST];
    488  1.1       agc 	    if (reach_p->params.cost_del != TRE_PARAM_UNSET)
    489  1.1       agc 	      cost += reach_p->params.cost_del;
    490  1.1       agc 
    491  1.1       agc 	    /* Check cost, number of deletes, and total number of errors
    492  1.1       agc 	       at current depth. */
    493  1.1       agc 	    if (cost > reach_p->params.max_cost
    494  1.1       agc 		|| (reach_p->costs[depth][TRE_M_NUM_DEL] + 1
    495  1.1       agc 		    > reach_p->params.max_del)
    496  1.1       agc 		|| (reach_p->costs[depth][TRE_M_NUM_ERR] + 1
    497  1.1       agc 		    > reach_p->params.max_err))
    498  1.1       agc 	      {
    499  1.1       agc 		/* Too many errors or cost too large. */
    500  1.1       agc 		DPRINT(("  delete: from %03d: cost too large\n", id));
    501  1.1       agc 		deque_start++;
    502  1.1       agc 		if (deque_start >= (ringbuffer + 512))
    503  1.1       agc 		  deque_start = ringbuffer;
    504  1.1       agc 		continue;
    505  1.1       agc 	      }
    506  1.1       agc 
    507  1.1       agc 	    /* Compute overall cost. */
    508  1.1       agc 	    cost0 = cost;
    509  1.1       agc 	    if (depth > 0)
    510  1.1       agc 	      {
    511  1.1       agc 		cost0 = reach_p->costs[0][TRE_M_COST];
    512  1.1       agc 		if (reach_p->params.cost_del != TRE_PARAM_UNSET)
    513  1.1       agc 		  cost0 += reach_p->params.cost_del;
    514  1.1       agc 		else
    515  1.1       agc 		  cost0 += default_params.cost_del;
    516  1.1       agc 	      }
    517  1.1       agc 
    518  1.1       agc 	    for (trans = reach_p->state; trans->state; trans++)
    519  1.1       agc 	      {
    520  1.1       agc 		int dest_id = trans->state_id;
    521  1.1       agc 		DPRINT(("  delete: from %03d to %03d, cost %d (%d): ",
    522  1.1       agc 			id, dest_id, cost0, reach_p->params.max_cost));
    523  1.1       agc 
    524  1.1       agc 		if (trans->assertions && CHECK_ASSERTIONS(trans->assertions))
    525  1.1       agc 		  {
    526  1.1       agc 		    DPRINT(("assertion failed\n"));
    527  1.1       agc 		    continue;
    528  1.1       agc 		  }
    529  1.1       agc 
    530  1.1       agc 		/* Compute tag values after this transition. */
    531  1.1       agc 		for (i = 0; i < num_tags; i++)
    532  1.1       agc 		  tmp_tags[i] = reach_p->tags[i];
    533  1.1       agc 		if (trans->tags)
    534  1.1       agc 		  for (i = 0; trans->tags[i] >= 0; i++)
    535  1.2  christos 		    if ((size_t)trans->tags[i] < num_tags)
    536  1.1       agc 		      tmp_tags[trans->tags[i]] = pos;
    537  1.1       agc 
    538  1.1       agc 		/* If another path has also reached this state, choose the one
    539  1.1       agc 		   with the smallest cost or best tags if costs are equal. */
    540  1.1       agc 		if (reach_next[dest_id].pos == pos
    541  1.1       agc 		    && (cost0 > reach_next[dest_id].costs[0][TRE_M_COST]
    542  1.1       agc 			|| (cost0 == reach_next[dest_id].costs[0][TRE_M_COST]
    543  1.1       agc 			    && (!match_tags
    544  1.1       agc 				|| !tre_tag_order(num_tags,
    545  1.1       agc 						  tnfa->tag_directions,
    546  1.1       agc 						  tmp_tags,
    547  1.1       agc 						  reach_next[dest_id].tags)))))
    548  1.1       agc 		  {
    549  1.1       agc 		    DPRINT(("lose, cost0 %d, have %d\n",
    550  1.1       agc 			    cost0, reach_next[dest_id].costs[0][TRE_M_COST]));
    551  1.1       agc 		    continue;
    552  1.1       agc 		  }
    553  1.1       agc 		DPRINT(("win\n"));
    554  1.1       agc 
    555  1.1       agc 		/* Set state, position, tags, parameters, depth, and costs. */
    556  1.1       agc 		reach_next[dest_id].state = trans->state;
    557  1.1       agc 		reach_next[dest_id].pos = pos;
    558  1.1       agc 		for (i = 0; i < num_tags; i++)
    559  1.1       agc 		  reach_next[dest_id].tags[i] = tmp_tags[i];
    560  1.1       agc 
    561  1.1       agc 		reach_next[dest_id].params = reach_p->params;
    562  1.1       agc 		if (trans->params)
    563  1.1       agc 		  tre_set_params(&reach_next[dest_id], trans->params,
    564  1.1       agc 				 default_params);
    565  1.1       agc 
    566  1.1       agc 		reach_next[dest_id].depth = reach_p->depth;
    567  1.1       agc 		memcpy(&reach_next[dest_id].costs,
    568  1.1       agc 		       reach_p->costs,
    569  1.1       agc 		       sizeof(reach_p->costs[0][0])
    570  1.1       agc 		       * TRE_M_LAST * (depth + 1));
    571  1.1       agc 		reach_next[dest_id].costs[depth][TRE_M_COST] = cost;
    572  1.1       agc 		reach_next[dest_id].costs[depth][TRE_M_NUM_DEL]++;
    573  1.1       agc 		reach_next[dest_id].costs[depth][TRE_M_NUM_ERR]++;
    574  1.1       agc 		if (depth > 0)
    575  1.1       agc 		  {
    576  1.1       agc 		    reach_next[dest_id].costs[0][TRE_M_COST] = cost0;
    577  1.1       agc 		    reach_next[dest_id].costs[0][TRE_M_NUM_DEL]++;
    578  1.1       agc 		    reach_next[dest_id].costs[0][TRE_M_NUM_ERR]++;
    579  1.1       agc 		  }
    580  1.1       agc 
    581  1.1       agc 		if (trans->state == tnfa->final
    582  1.1       agc 		    && (match_eo < 0
    583  1.1       agc 			|| match_costs[TRE_M_COST] > cost0
    584  1.1       agc 			|| (match_costs[TRE_M_COST] == cost0
    585  1.1       agc 			    && (num_tags > 0
    586  1.1       agc 				&& tmp_tags[0] <= match_tags[0]))))
    587  1.1       agc 		  {
    588  1.1       agc 		    DPRINT(("	 setting new match at %d, cost %d\n",
    589  1.1       agc 			    pos, cost0));
    590  1.1       agc 		    match_eo = pos;
    591  1.1       agc 		    memcpy(match_costs, reach_next[dest_id].costs[0],
    592  1.1       agc 			   sizeof(match_costs[0]) * TRE_M_LAST);
    593  1.1       agc 		    for (i = 0; i < num_tags; i++)
    594  1.1       agc 		      match_tags[i] = tmp_tags[i];
    595  1.1       agc 		  }
    596  1.1       agc 
    597  1.1       agc 		/* Add to the end of the deque. */
    598  1.1       agc 		*deque_end = &reach_next[dest_id];
    599  1.1       agc 		deque_end++;
    600  1.1       agc 		if (deque_end >= (ringbuffer + 512))
    601  1.1       agc 		  deque_end = ringbuffer;
    602  1.1       agc 		assert(deque_end != deque_start);
    603  1.1       agc 	      }
    604  1.1       agc 	    deque_start++;
    605  1.1       agc 	    if (deque_start >= (ringbuffer + 512))
    606  1.1       agc 	      deque_start = ringbuffer;
    607  1.1       agc 	  }
    608  1.1       agc 
    609  1.1       agc       }
    610  1.1       agc 
    611  1.1       agc #ifdef TRE_DEBUG
    612  1.1       agc       tre_print_reach(tnfa, reach_next, pos, num_tags);
    613  1.1       agc #endif /* TRE_DEBUG */
    614  1.1       agc 
    615  1.1       agc       /* Check for end of string. */
    616  1.1       agc       if (len < 0)
    617  1.1       agc 	{
    618  1.1       agc 	  if (type == STR_USER)
    619  1.1       agc 	    {
    620  1.1       agc 	      if (str_user_end)
    621  1.1       agc 		break;
    622  1.1       agc 	    }
    623  1.1       agc 	  else if (next_c == L'\0')
    624  1.1       agc 	    break;
    625  1.1       agc 	}
    626  1.1       agc       else
    627  1.1       agc 	{
    628  1.1       agc 	  if (pos >= len)
    629  1.1       agc 	    break;
    630  1.1       agc 	}
    631  1.1       agc 
    632  1.1       agc       prev_pos = pos;
    633  1.1       agc       GET_NEXT_WCHAR();
    634  1.1       agc 
    635  1.1       agc       /* Swap `reach' and `reach_next'. */
    636  1.1       agc       {
    637  1.1       agc 	tre_tnfa_approx_reach_t *tmp;
    638  1.1       agc 	tmp = reach;
    639  1.1       agc 	reach = reach_next;
    640  1.1       agc 	reach_next = tmp;
    641  1.1       agc       }
    642  1.1       agc 
    643  1.1       agc       /* Handle exact matches and substitutions. */
    644  1.1       agc       for (id = 0; id < tnfa->num_states; id++)
    645  1.1       agc 	{
    646  1.1       agc 	  tre_tnfa_transition_t *trans;
    647  1.1       agc 
    648  1.1       agc 	  if (reach[id].pos < prev_pos)
    649  1.1       agc 	    continue;  /* Not reached. */
    650  1.1       agc 	  for (trans = reach[id].state; trans->state; trans++)
    651  1.1       agc 	    {
    652  1.1       agc 	      int dest_id;
    653  1.1       agc 	      int depth;
    654  1.1       agc 	      int cost, cost0, err;
    655  1.1       agc 
    656  1.1       agc 	      if (trans->assertions
    657  1.1       agc 		  && (CHECK_ASSERTIONS(trans->assertions)
    658  1.1       agc 		      || CHECK_CHAR_CLASSES(trans, tnfa, eflags)))
    659  1.1       agc 		{
    660  1.1       agc 		  DPRINT(("  exact,  from %d: assert failed\n", id));
    661  1.1       agc 		  continue;
    662  1.1       agc 		}
    663  1.1       agc 
    664  1.1       agc 	      depth = reach[id].depth;
    665  1.1       agc 	      dest_id = trans->state_id;
    666  1.1       agc 
    667  1.1       agc 	      cost = reach[id].costs[depth][TRE_M_COST];
    668  1.1       agc 	      cost0 = reach[id].costs[0][TRE_M_COST];
    669  1.1       agc 	      err = 0;
    670  1.1       agc 
    671  1.1       agc 	      if (trans->code_min > (tre_cint_t)prev_c
    672  1.1       agc 		  || trans->code_max < (tre_cint_t)prev_c)
    673  1.1       agc 		{
    674  1.1       agc 		  /* Handle substitutes.  The required character was not in
    675  1.1       agc 		     the string, so match it in place of whatever was supposed
    676  1.1       agc 		     to be there and increase costs accordingly. */
    677  1.1       agc 		  err = 1;
    678  1.1       agc 
    679  1.1       agc 		  /* Compute and check cost at current depth. */
    680  1.1       agc 		  cost = reach[id].costs[depth][TRE_M_COST];
    681  1.1       agc 		  if (reach[id].params.cost_subst != TRE_PARAM_UNSET)
    682  1.1       agc 		    cost += reach[id].params.cost_subst;
    683  1.1       agc 		  if (cost > reach[id].params.max_cost)
    684  1.1       agc 		    continue; /* Cost too large. */
    685  1.1       agc 
    686  1.1       agc 		  /* Check number of substitutes at current depth. */
    687  1.1       agc 		  if (reach[id].costs[depth][TRE_M_NUM_SUBST] + 1
    688  1.1       agc 		      > reach[id].params.max_subst)
    689  1.1       agc 		    continue; /* Too many substitutes. */
    690  1.1       agc 
    691  1.1       agc 		  /* Check total number of errors at current depth. */
    692  1.1       agc 		  if (reach[id].costs[depth][TRE_M_NUM_ERR] + 1
    693  1.1       agc 		      > reach[id].params.max_err)
    694  1.1       agc 		    continue; /* Too many errors. */
    695  1.1       agc 
    696  1.1       agc 		  /* Compute overall cost. */
    697  1.1       agc 		  cost0 = cost;
    698  1.1       agc 		  if (depth > 0)
    699  1.1       agc 		    {
    700  1.1       agc 		      cost0 = reach[id].costs[0][TRE_M_COST];
    701  1.1       agc 		      if (reach[id].params.cost_subst != TRE_PARAM_UNSET)
    702  1.1       agc 			cost0 += reach[id].params.cost_subst;
    703  1.1       agc 		      else
    704  1.1       agc 			cost0 += default_params.cost_subst;
    705  1.1       agc 		    }
    706  1.1       agc 		  DPRINT(("  subst,  from %03d to %03d, cost %d: ",
    707  1.1       agc 			  id, dest_id, cost0));
    708  1.1       agc 		}
    709  1.1       agc 	      else
    710  1.1       agc 		DPRINT(("  exact,  from %03d to %03d, cost %d: ",
    711  1.1       agc 			id, dest_id, cost0));
    712  1.1       agc 
    713  1.1       agc 	      /* Compute tag values after this transition. */
    714  1.1       agc 	      for (i = 0; i < num_tags; i++)
    715  1.1       agc 		tmp_tags[i] = reach[id].tags[i];
    716  1.1       agc 	      if (trans->tags)
    717  1.1       agc 		for (i = 0; trans->tags[i] >= 0; i++)
    718  1.2  christos 		  if ((size_t)trans->tags[i] < num_tags)
    719  1.1       agc 		    tmp_tags[trans->tags[i]] = pos;
    720  1.1       agc 
    721  1.1       agc 	      /* If another path has also reached this state, choose the
    722  1.1       agc 		 one with the smallest cost or best tags if costs are equal. */
    723  1.1       agc 	      if (reach_next[dest_id].pos == pos
    724  1.1       agc 		  && (cost0 > reach_next[dest_id].costs[0][TRE_M_COST]
    725  1.1       agc 		      || (cost0 == reach_next[dest_id].costs[0][TRE_M_COST]
    726  1.1       agc 			  && !tre_tag_order(num_tags, tnfa->tag_directions,
    727  1.1       agc 					    tmp_tags,
    728  1.1       agc 					    reach_next[dest_id].tags))))
    729  1.1       agc 		{
    730  1.1       agc 		  DPRINT(("lose\n"));
    731  1.1       agc 		  continue;
    732  1.1       agc 		}
    733  1.1       agc 	      DPRINT(("win %d %d\n",
    734  1.1       agc 		      reach_next[dest_id].pos,
    735  1.1       agc 		      reach_next[dest_id].costs[0][TRE_M_COST]));
    736  1.1       agc 
    737  1.1       agc 	      /* Set state, position, tags, and depth. */
    738  1.1       agc 	      reach_next[dest_id].state = trans->state;
    739  1.1       agc 	      reach_next[dest_id].pos = pos;
    740  1.1       agc 	      for (i = 0; i < num_tags; i++)
    741  1.1       agc 		reach_next[dest_id].tags[i] = tmp_tags[i];
    742  1.1       agc 	      reach_next[dest_id].depth = reach[id].depth;
    743  1.1       agc 
    744  1.1       agc 	      /* Set parameters. */
    745  1.1       agc 	      reach_next[dest_id].params = reach[id].params;
    746  1.1       agc 	      if (trans->params)
    747  1.1       agc 		tre_set_params(&reach_next[dest_id], trans->params,
    748  1.1       agc 			       default_params);
    749  1.1       agc 
    750  1.1       agc 	      /* Set the costs after this transition. */
    751  1.1       agc 		memcpy(&reach_next[dest_id].costs,
    752  1.1       agc 		       reach[id].costs,
    753  1.1       agc 		       sizeof(reach[id].costs[0][0])
    754  1.1       agc 		       * TRE_M_LAST * (depth + 1));
    755  1.1       agc 	      reach_next[dest_id].costs[depth][TRE_M_COST] = cost;
    756  1.1       agc 	      reach_next[dest_id].costs[depth][TRE_M_NUM_SUBST] += err;
    757  1.1       agc 	      reach_next[dest_id].costs[depth][TRE_M_NUM_ERR] += err;
    758  1.1       agc 	      if (depth > 0)
    759  1.1       agc 		{
    760  1.1       agc 		  reach_next[dest_id].costs[0][TRE_M_COST] = cost0;
    761  1.1       agc 		  reach_next[dest_id].costs[0][TRE_M_NUM_SUBST] += err;
    762  1.1       agc 		  reach_next[dest_id].costs[0][TRE_M_NUM_ERR] += err;
    763  1.1       agc 		}
    764  1.1       agc 
    765  1.1       agc 	      if (trans->state == tnfa->final
    766  1.1       agc 		  && (match_eo < 0
    767  1.1       agc 		      || cost0 < match_costs[TRE_M_COST]
    768  1.1       agc 		      || (cost0 == match_costs[TRE_M_COST]
    769  1.1       agc 			  && num_tags > 0 && tmp_tags[0] <= match_tags[0])))
    770  1.1       agc 		{
    771  1.1       agc 		  DPRINT(("    setting new match at %d, cost %d\n",
    772  1.1       agc 			  pos, cost0));
    773  1.1       agc 		  match_eo = pos;
    774  1.1       agc 		  for (i = 0; i < TRE_M_LAST; i++)
    775  1.1       agc 		    match_costs[i] = reach_next[dest_id].costs[0][i];
    776  1.1       agc 		  for (i = 0; i < num_tags; i++)
    777  1.1       agc 		    match_tags[i] = tmp_tags[i];
    778  1.1       agc 		}
    779  1.1       agc 	    }
    780  1.1       agc 	}
    781  1.1       agc     }
    782  1.1       agc 
    783  1.1       agc   DPRINT(("match end offset = %d, match cost = %d\n", match_eo,
    784  1.1       agc 	  match_costs[TRE_M_COST]));
    785  1.1       agc 
    786  1.1       agc   match->cost = match_costs[TRE_M_COST];
    787  1.1       agc   match->num_ins = match_costs[TRE_M_NUM_INS];
    788  1.1       agc   match->num_del = match_costs[TRE_M_NUM_DEL];
    789  1.1       agc   match->num_subst = match_costs[TRE_M_NUM_SUBST];
    790  1.1       agc   *match_end_ofs = match_eo;
    791  1.1       agc 
    792  1.4       rin   ret = match_eo >= 0 ? REG_OK : REG_NOMATCH;
    793  1.4       rin error_exit:
    794  1.4       rin #ifndef TRE_USE_ALLOCA
    795  1.4       rin   if (buf)
    796  1.4       rin     xfree(buf);
    797  1.4       rin #endif /* !TRE_USE_ALLOCA */
    798  1.4       rin   return ret;
    799  1.1       agc }
    800