Home | History | Annotate | Line # | Download | only in lib
tre-match-approx.c revision 1.3
      1  1.1       agc /*
      2  1.1       agc   tre-match-approx.c - TRE approximate regex matching engine
      3  1.1       agc 
      4  1.1       agc   This software is released under a BSD-style license.
      5  1.1       agc   See the file LICENSE for details and copyright.
      6  1.1       agc 
      7  1.1       agc */
      8  1.1       agc #ifdef HAVE_CONFIG_H
      9  1.1       agc #include <config.h>
     10  1.1       agc #endif /* HAVE_CONFIG_H */
     11  1.1       agc 
     12  1.2  christos #include "tre-alloca.h"
     13  1.1       agc 
     14  1.1       agc #define __USE_STRING_INLINES
     15  1.1       agc #undef __NO_INLINE__
     16  1.1       agc 
     17  1.1       agc #include <assert.h>
     18  1.1       agc #include <stdlib.h>
     19  1.1       agc #include <string.h>
     20  1.1       agc #include <limits.h>
     21  1.1       agc #ifdef HAVE_WCHAR_H
     22  1.1       agc #include <wchar.h>
     23  1.1       agc #endif /* HAVE_WCHAR_H */
     24  1.1       agc #ifdef HAVE_WCTYPE_H
     25  1.1       agc #include <wctype.h>
     26  1.1       agc #endif /* HAVE_WCTYPE_H */
     27  1.1       agc #ifndef TRE_WCHAR
     28  1.1       agc #include <ctype.h>
     29  1.1       agc #endif /* !TRE_WCHAR */
     30  1.1       agc #ifdef HAVE_MALLOC_H
     31  1.1       agc #include <malloc.h>
     32  1.1       agc #endif /* HAVE_MALLOC_H */
     33  1.1       agc 
     34  1.1       agc #include "tre-internal.h"
     35  1.1       agc #include "tre-match-utils.h"
     36  1.1       agc #include "tre.h"
     37  1.1       agc #include "xmalloc.h"
     38  1.1       agc 
     39  1.1       agc #define TRE_M_COST	0
     40  1.1       agc #define TRE_M_NUM_INS	1
     41  1.1       agc #define TRE_M_NUM_DEL	2
     42  1.1       agc #define TRE_M_NUM_SUBST 3
     43  1.1       agc #define TRE_M_NUM_ERR	4
     44  1.1       agc #define TRE_M_LAST	5
     45  1.1       agc 
     46  1.1       agc #define TRE_M_MAX_DEPTH 3
     47  1.1       agc 
     48  1.1       agc typedef struct {
     49  1.1       agc   /* State in the TNFA transition table. */
     50  1.1       agc   tre_tnfa_transition_t *state;
     51  1.1       agc   /* Position in input string. */
     52  1.1       agc   int pos;
     53  1.1       agc   /* Tag values. */
     54  1.1       agc   int *tags;
     55  1.1       agc   /* Matching parameters. */
     56  1.1       agc   regaparams_t params;
     57  1.1       agc   /* Nesting depth of parameters.  This is used as an index in
     58  1.1       agc      the `costs' array. */
     59  1.1       agc   int depth;
     60  1.1       agc   /* Costs and counter values for different parameter nesting depths. */
     61  1.1       agc   int costs[TRE_M_MAX_DEPTH + 1][TRE_M_LAST];
     62  1.1       agc } tre_tnfa_approx_reach_t;
     63  1.1       agc 
     64  1.1       agc 
     65  1.1       agc #ifdef TRE_DEBUG
     66  1.1       agc /* Prints the `reach' array in a readable fashion with DPRINT. */
     67  1.1       agc static void
     68  1.1       agc tre_print_reach(const tre_tnfa_t *tnfa, tre_tnfa_approx_reach_t *reach,
     69  1.2  christos 		int pos, size_t num_tags)
     70  1.1       agc {
     71  1.1       agc   int id;
     72  1.1       agc 
     73  1.1       agc   /* Print each state on one line. */
     74  1.1       agc   DPRINT(("  reach:\n"));
     75  1.1       agc   for (id = 0; id < tnfa->num_states; id++)
     76  1.1       agc     {
     77  1.1       agc       int i, j;
     78  1.1       agc       if (reach[id].pos < pos)
     79  1.1       agc 	continue;  /* Not reached. */
     80  1.1       agc       DPRINT(("	 %03d, costs ", id));
     81  1.1       agc       for (i = 0; i <= reach[id].depth; i++)
     82  1.1       agc 	{
     83  1.1       agc 	  DPRINT(("["));
     84  1.1       agc 	  for (j = 0; j < TRE_M_LAST; j++)
     85  1.1       agc 	    {
     86  1.1       agc 	      DPRINT(("%2d", reach[id].costs[i][j]));
     87  1.1       agc 	      if (j + 1 < TRE_M_LAST)
     88  1.1       agc 		DPRINT((","));
     89  1.1       agc 	    }
     90  1.1       agc 	  DPRINT(("]"));
     91  1.1       agc 	  if (i + 1 <= reach[id].depth)
     92  1.1       agc 	    DPRINT((", "));
     93  1.1       agc 	}
     94  1.1       agc       DPRINT(("\n	tags "));
     95  1.1       agc       for (i = 0; i < num_tags; i++)
     96  1.1       agc 	{
     97  1.1       agc 	  DPRINT(("%02d", reach[id].tags[i]));
     98  1.1       agc 	  if (i + 1 < num_tags)
     99  1.1       agc 	    DPRINT((","));
    100  1.1       agc 	}
    101  1.1       agc       DPRINT(("\n"));
    102  1.1       agc     }
    103  1.1       agc   DPRINT(("\n"));
    104  1.1       agc }
    105  1.1       agc #endif /* TRE_DEBUG */
    106  1.1       agc 
    107  1.1       agc 
    108  1.1       agc /* Sets the matching parameters in `reach' to the ones defined in the `pa'
    109  1.1       agc    array.  If `pa' specifies default values, they are taken from
    110  1.1       agc    `default_params'. */
    111  1.1       agc inline static void
    112  1.1       agc tre_set_params(tre_tnfa_approx_reach_t *reach,
    113  1.1       agc 	       int *pa, regaparams_t default_params)
    114  1.1       agc {
    115  1.1       agc   int value;
    116  1.1       agc 
    117  1.1       agc   /* If depth is increased reset costs and counters to zero for the
    118  1.1       agc      new levels. */
    119  1.1       agc   value = pa[TRE_PARAM_DEPTH];
    120  1.1       agc   assert(value <= TRE_M_MAX_DEPTH);
    121  1.1       agc   if (value > reach->depth)
    122  1.1       agc     {
    123  1.1       agc       int i, j;
    124  1.1       agc       for (i = reach->depth + 1; i <= value; i++)
    125  1.1       agc 	for (j = 0; j < TRE_M_LAST; j++)
    126  1.1       agc 	  reach->costs[i][j] = 0;
    127  1.1       agc     }
    128  1.1       agc   reach->depth = value;
    129  1.1       agc 
    130  1.1       agc   /* Set insert cost. */
    131  1.1       agc   value = pa[TRE_PARAM_COST_INS];
    132  1.1       agc   if (value == TRE_PARAM_DEFAULT)
    133  1.1       agc     reach->params.cost_ins = default_params.cost_ins;
    134  1.1       agc   else if (value != TRE_PARAM_UNSET)
    135  1.1       agc     reach->params.cost_ins = value;
    136  1.1       agc 
    137  1.1       agc   /* Set delete cost. */
    138  1.1       agc   value = pa[TRE_PARAM_COST_DEL];
    139  1.1       agc   if (value == TRE_PARAM_DEFAULT)
    140  1.1       agc     reach->params.cost_del = default_params.cost_del;
    141  1.1       agc   else if (value != TRE_PARAM_UNSET)
    142  1.1       agc     reach->params.cost_del = value;
    143  1.1       agc 
    144  1.1       agc   /* Set substitute cost. */
    145  1.1       agc   value = pa[TRE_PARAM_COST_SUBST];
    146  1.1       agc   if (value == TRE_PARAM_DEFAULT)
    147  1.1       agc     reach->params.cost_subst = default_params.cost_subst;
    148  1.1       agc   else
    149  1.1       agc     reach->params.cost_subst = value;
    150  1.1       agc 
    151  1.1       agc   /* Set maximum cost. */
    152  1.1       agc   value = pa[TRE_PARAM_COST_MAX];
    153  1.1       agc   if (value == TRE_PARAM_DEFAULT)
    154  1.1       agc     reach->params.max_cost = default_params.max_cost;
    155  1.1       agc   else if (value != TRE_PARAM_UNSET)
    156  1.1       agc     reach->params.max_cost = value;
    157  1.1       agc 
    158  1.1       agc   /* Set maximum inserts. */
    159  1.1       agc   value = pa[TRE_PARAM_MAX_INS];
    160  1.1       agc   if (value == TRE_PARAM_DEFAULT)
    161  1.1       agc     reach->params.max_ins = default_params.max_ins;
    162  1.1       agc   else if (value != TRE_PARAM_UNSET)
    163  1.1       agc     reach->params.max_ins = value;
    164  1.1       agc 
    165  1.1       agc   /* Set maximum deletes. */
    166  1.1       agc   value = pa[TRE_PARAM_MAX_DEL];
    167  1.1       agc   if (value == TRE_PARAM_DEFAULT)
    168  1.1       agc     reach->params.max_del = default_params.max_del;
    169  1.1       agc   else if (value != TRE_PARAM_UNSET)
    170  1.1       agc     reach->params.max_del = value;
    171  1.1       agc 
    172  1.1       agc   /* Set maximum substitutes. */
    173  1.1       agc   value = pa[TRE_PARAM_MAX_SUBST];
    174  1.1       agc   if (value == TRE_PARAM_DEFAULT)
    175  1.1       agc     reach->params.max_subst = default_params.max_subst;
    176  1.1       agc   else if (value != TRE_PARAM_UNSET)
    177  1.1       agc     reach->params.max_subst = value;
    178  1.1       agc 
    179  1.1       agc   /* Set maximum number of errors. */
    180  1.1       agc   value = pa[TRE_PARAM_MAX_ERR];
    181  1.1       agc   if (value == TRE_PARAM_DEFAULT)
    182  1.1       agc     reach->params.max_err = default_params.max_err;
    183  1.1       agc   else if (value != TRE_PARAM_UNSET)
    184  1.1       agc     reach->params.max_err = value;
    185  1.1       agc }
    186  1.1       agc 
    187  1.1       agc reg_errcode_t
    188  1.1       agc tre_tnfa_run_approx(const tre_tnfa_t *tnfa, const void *string, int len,
    189  1.1       agc 		    tre_str_type_t type, int *match_tags,
    190  1.1       agc 		    regamatch_t *match, regaparams_t default_params,
    191  1.1       agc 		    int eflags, int *match_end_ofs)
    192  1.1       agc {
    193  1.1       agc   /* State variables required by GET_NEXT_WCHAR. */
    194  1.1       agc   tre_char_t prev_c = 0, next_c = 0;
    195  1.1       agc   const char *str_byte = string;
    196  1.1       agc   int pos = -1;
    197  1.1       agc   unsigned int pos_add_next = 1;
    198  1.1       agc #ifdef TRE_WCHAR
    199  1.1       agc   const wchar_t *str_wide = string;
    200  1.1       agc #ifdef TRE_MBSTATE
    201  1.1       agc   mbstate_t mbstate;
    202  1.1       agc #endif /* !TRE_WCHAR */
    203  1.1       agc #endif /* TRE_WCHAR */
    204  1.1       agc   int reg_notbol = eflags & REG_NOTBOL;
    205  1.1       agc   int reg_noteol = eflags & REG_NOTEOL;
    206  1.1       agc   int reg_newline = tnfa->cflags & REG_NEWLINE;
    207  1.2  christos   size_t str_user_end = 0;
    208  1.1       agc 
    209  1.1       agc   int prev_pos;
    210  1.1       agc 
    211  1.1       agc   /* Number of tags. */
    212  1.2  christos   size_t num_tags;
    213  1.1       agc   /* The reach tables. */
    214  1.1       agc   tre_tnfa_approx_reach_t *reach, *reach_next;
    215  1.1       agc   /* Tag array for temporary use. */
    216  1.1       agc   int *tmp_tags;
    217  1.1       agc 
    218  1.1       agc   /* End offset of best match so far, or -1 if no match found yet. */
    219  1.1       agc   int match_eo = -1;
    220  1.1       agc   /* Costs of the match. */
    221  1.1       agc   int match_costs[TRE_M_LAST];
    222  1.1       agc 
    223  1.1       agc   /* Space for temporary data required for matching. */
    224  1.1       agc   unsigned char *buf;
    225  1.1       agc 
    226  1.2  christos   size_t i, id;
    227  1.1       agc 
    228  1.1       agc   if (!match_tags)
    229  1.1       agc     num_tags = 0;
    230  1.1       agc   else
    231  1.1       agc     num_tags = tnfa->num_tags;
    232  1.1       agc 
    233  1.1       agc #ifdef TRE_MBSTATE
    234  1.1       agc   memset(&mbstate, '\0', sizeof(mbstate));
    235  1.1       agc #endif /* TRE_MBSTATE */
    236  1.1       agc 
    237  1.1       agc   DPRINT(("tre_tnfa_run_approx, input type %d, len %d, eflags %d, "
    238  1.1       agc 	  "match_tags %p\n",
    239  1.1       agc 	  type, len, eflags,
    240  1.1       agc 	  match_tags));
    241  1.1       agc   DPRINT(("max cost %d, ins %d, del %d, subst %d\n",
    242  1.1       agc 	  default_params.max_cost,
    243  1.1       agc 	  default_params.cost_ins,
    244  1.1       agc 	  default_params.cost_del,
    245  1.1       agc 	  default_params.cost_subst));
    246  1.1       agc 
    247  1.1       agc   /* Allocate memory for temporary data required for matching.	This needs to
    248  1.1       agc      be done for every matching operation to be thread safe.  This allocates
    249  1.1       agc      everything in a single large block from the stack frame using alloca()
    250  1.1       agc      or with malloc() if alloca is unavailable. */
    251  1.1       agc   {
    252  1.1       agc     unsigned char *buf_cursor;
    253  1.1       agc     /* Space needed for one array of tags. */
    254  1.2  christos     size_t tag_bytes = sizeof(*tmp_tags) * num_tags;
    255  1.1       agc     /* Space needed for one reach table. */
    256  1.2  christos     size_t reach_bytes = sizeof(*reach_next) * tnfa->num_states;
    257  1.1       agc     /* Total space needed. */
    258  1.2  christos     size_t total_bytes = reach_bytes * 2 + (tnfa->num_states * 2 + 1 ) * tag_bytes;
    259  1.1       agc     /* Add some extra to make sure we can align the pointers.  The multiplier
    260  1.1       agc        used here must be equal to the number of ALIGN calls below. */
    261  1.1       agc     total_bytes += (sizeof(long) - 1) * 3;
    262  1.1       agc 
    263  1.1       agc     /* Allocate the memory. */
    264  1.1       agc #ifdef TRE_USE_ALLOCA
    265  1.1       agc     buf = alloca(total_bytes);
    266  1.1       agc #else /* !TRE_USE_ALLOCA */
    267  1.1       agc     buf = xmalloc((unsigned)total_bytes);
    268  1.1       agc #endif /* !TRE_USE_ALLOCA */
    269  1.1       agc     if (!buf)
    270  1.1       agc       return REG_ESPACE;
    271  1.1       agc     memset(buf, 0, (size_t)total_bytes);
    272  1.1       agc 
    273  1.1       agc     /* Allocate `tmp_tags' from `buf'. */
    274  1.1       agc     tmp_tags = (void *)buf;
    275  1.1       agc     buf_cursor = buf + tag_bytes;
    276  1.1       agc     buf_cursor += ALIGN(buf_cursor, long);
    277  1.1       agc 
    278  1.1       agc     /* Allocate `reach' from `buf'. */
    279  1.1       agc     reach = (void *)buf_cursor;
    280  1.1       agc     buf_cursor += reach_bytes;
    281  1.1       agc     buf_cursor += ALIGN(buf_cursor, long);
    282  1.1       agc 
    283  1.1       agc     /* Allocate `reach_next' from `buf'. */
    284  1.1       agc     reach_next = (void *)buf_cursor;
    285  1.1       agc     buf_cursor += reach_bytes;
    286  1.1       agc     buf_cursor += ALIGN(buf_cursor, long);
    287  1.1       agc 
    288  1.1       agc     /* Allocate tag arrays for `reach' and `reach_next' from `buf'. */
    289  1.1       agc     for (i = 0; i < tnfa->num_states; i++)
    290  1.1       agc       {
    291  1.1       agc 	reach[i].tags = (void *)buf_cursor;
    292  1.1       agc 	buf_cursor += tag_bytes;
    293  1.1       agc 	reach_next[i].tags = (void *)buf_cursor;
    294  1.1       agc 	buf_cursor += tag_bytes;
    295  1.1       agc       }
    296  1.1       agc     assert(buf_cursor <= buf + total_bytes);
    297  1.1       agc   }
    298  1.1       agc 
    299  1.1       agc   for (i = 0; i < TRE_M_LAST; i++)
    300  1.1       agc     match_costs[i] = INT_MAX;
    301  1.1       agc 
    302  1.1       agc   /* Mark the reach arrays empty. */
    303  1.1       agc   for (i = 0; i < tnfa->num_states; i++)
    304  1.1       agc     reach[i].pos = reach_next[i].pos = -2;
    305  1.1       agc 
    306  1.1       agc   prev_pos = pos;
    307  1.1       agc   GET_NEXT_WCHAR();
    308  1.1       agc   pos = 0;
    309  1.1       agc 
    310  1.3       rin   while (/*CONSTCOND*/(void)1,1)
    311  1.1       agc     {
    312  1.1       agc       DPRINT(("%03d:%2lc/%05d\n", pos, (tre_cint_t)next_c, (int)next_c));
    313  1.1       agc 
    314  1.1       agc       /* Add initial states to `reach_next' if an exact match has not yet
    315  1.1       agc 	 been found. */
    316  1.1       agc       if (match_costs[TRE_M_COST] > 0)
    317  1.1       agc 	{
    318  1.1       agc 	  tre_tnfa_transition_t *trans;
    319  1.1       agc 	  DPRINT(("  init"));
    320  1.1       agc 	  for (trans = tnfa->initial; trans->state; trans++)
    321  1.1       agc 	    {
    322  1.1       agc 	      int stateid = trans->state_id;
    323  1.1       agc 
    324  1.1       agc 	      /* If this state is not currently in `reach_next', add it
    325  1.1       agc 		 there. */
    326  1.1       agc 	      if (reach_next[stateid].pos < pos)
    327  1.1       agc 		{
    328  1.1       agc 		  if (trans->assertions && CHECK_ASSERTIONS(trans->assertions))
    329  1.1       agc 		    {
    330  1.1       agc 		      /* Assertions failed, don't add this state. */
    331  1.1       agc 		      DPRINT((" !%d (assert)", stateid));
    332  1.1       agc 		      continue;
    333  1.1       agc 		    }
    334  1.1       agc 		  DPRINT((" %d", stateid));
    335  1.1       agc 		  reach_next[stateid].state = trans->state;
    336  1.1       agc 		  reach_next[stateid].pos = pos;
    337  1.1       agc 
    338  1.1       agc 		  /* Compute tag values after this transition. */
    339  1.1       agc 		  for (i = 0; i < num_tags; i++)
    340  1.1       agc 		    reach_next[stateid].tags[i] = -1;
    341  1.1       agc 
    342  1.1       agc 		  if (trans->tags)
    343  1.1       agc 		    for (i = 0; trans->tags[i] >= 0; i++)
    344  1.2  christos 		      if ((size_t)trans->tags[i] < num_tags)
    345  1.1       agc 			reach_next[stateid].tags[trans->tags[i]] = pos;
    346  1.1       agc 
    347  1.1       agc 		  /* Set the parameters, depth, and costs. */
    348  1.1       agc 		  reach_next[stateid].params = default_params;
    349  1.1       agc 		  reach_next[stateid].depth = 0;
    350  1.1       agc 		  for (i = 0; i < TRE_M_LAST; i++)
    351  1.1       agc 		    reach_next[stateid].costs[0][i] = 0;
    352  1.1       agc 		  if (trans->params)
    353  1.1       agc 		    tre_set_params(&reach_next[stateid], trans->params,
    354  1.1       agc 				   default_params);
    355  1.1       agc 
    356  1.1       agc 		  /* If this is the final state, mark the exact match. */
    357  1.1       agc 		  if (trans->state == tnfa->final)
    358  1.1       agc 		    {
    359  1.1       agc 		      match_eo = pos;
    360  1.1       agc 		      for (i = 0; i < num_tags; i++)
    361  1.1       agc 			match_tags[i] = reach_next[stateid].tags[i];
    362  1.1       agc 		      for (i = 0; i < TRE_M_LAST; i++)
    363  1.1       agc 			match_costs[i] = 0;
    364  1.1       agc 		    }
    365  1.1       agc 		}
    366  1.1       agc 	    }
    367  1.1       agc 	    DPRINT(("\n"));
    368  1.1       agc 	}
    369  1.1       agc 
    370  1.1       agc 
    371  1.1       agc       /* Handle inserts.  This is done by pretending there's an epsilon
    372  1.1       agc 	 transition from each state in `reach' back to the same state.
    373  1.1       agc 	 We don't need to worry about the final state here; this will never
    374  1.1       agc 	 give a better match than what we already have. */
    375  1.1       agc       for (id = 0; id < tnfa->num_states; id++)
    376  1.1       agc 	{
    377  1.1       agc 	  int depth;
    378  1.1       agc 	  int cost, cost0;
    379  1.1       agc 
    380  1.1       agc 	  if (reach[id].pos != prev_pos)
    381  1.1       agc 	    {
    382  1.1       agc 	      DPRINT(("	 insert: %d not reached\n", id));
    383  1.1       agc 	      continue;	 /* Not reached. */
    384  1.1       agc 	    }
    385  1.1       agc 
    386  1.1       agc 	  depth = reach[id].depth;
    387  1.1       agc 
    388  1.1       agc 	  /* Compute and check cost at current depth. */
    389  1.1       agc 	  cost = reach[id].costs[depth][TRE_M_COST];
    390  1.1       agc 	  if (reach[id].params.cost_ins != TRE_PARAM_UNSET)
    391  1.1       agc 	    cost += reach[id].params.cost_ins;
    392  1.1       agc 	  if (cost > reach[id].params.max_cost)
    393  1.1       agc 	    continue;  /* Cost too large. */
    394  1.1       agc 
    395  1.1       agc 	  /* Check number of inserts at current depth. */
    396  1.1       agc 	  if (reach[id].costs[depth][TRE_M_NUM_INS] + 1
    397  1.1       agc 	      > reach[id].params.max_ins)
    398  1.1       agc 	    continue;  /* Too many inserts. */
    399  1.1       agc 
    400  1.1       agc 	  /* Check total number of errors at current depth. */
    401  1.1       agc 	  if (reach[id].costs[depth][TRE_M_NUM_ERR] + 1
    402  1.1       agc 	      > reach[id].params.max_err)
    403  1.1       agc 	    continue;  /* Too many errors. */
    404  1.1       agc 
    405  1.1       agc 	  /* Compute overall cost. */
    406  1.1       agc 	  cost0 = cost;
    407  1.1       agc 	  if (depth > 0)
    408  1.1       agc 	    {
    409  1.1       agc 	      cost0 = reach[id].costs[0][TRE_M_COST];
    410  1.1       agc 	      if (reach[id].params.cost_ins != TRE_PARAM_UNSET)
    411  1.1       agc 		cost0 += reach[id].params.cost_ins;
    412  1.1       agc 	      else
    413  1.1       agc 		cost0 += default_params.cost_ins;
    414  1.1       agc 	    }
    415  1.1       agc 
    416  1.1       agc 	  DPRINT(("  insert: from %d to %d, cost %d: ", id, id,
    417  1.1       agc 		  reach[id].costs[depth][TRE_M_COST]));
    418  1.1       agc 	  if (reach_next[id].pos == pos
    419  1.1       agc 	      && (cost0 >= reach_next[id].costs[0][TRE_M_COST]))
    420  1.1       agc 	    {
    421  1.1       agc 	      DPRINT(("lose\n"));
    422  1.1       agc 	      continue;
    423  1.1       agc 	    }
    424  1.1       agc 	  DPRINT(("win\n"));
    425  1.1       agc 
    426  1.1       agc 	  /* Copy state, position, tags, parameters, and depth. */
    427  1.1       agc 	  reach_next[id].state = reach[id].state;
    428  1.1       agc 	  reach_next[id].pos = pos;
    429  1.1       agc 	  for (i = 0; i < num_tags; i++)
    430  1.1       agc 	    reach_next[id].tags[i] = reach[id].tags[i];
    431  1.1       agc 	  reach_next[id].params = reach[id].params;
    432  1.1       agc 	  reach_next[id].depth = reach[id].depth;
    433  1.1       agc 
    434  1.1       agc 	  /* Set the costs after this transition. */
    435  1.1       agc 	  memcpy(reach_next[id].costs, reach[id].costs,
    436  1.1       agc 		 sizeof(reach_next[id].costs[0][0])
    437  1.1       agc 		 * TRE_M_LAST * (depth + 1));
    438  1.1       agc 	  reach_next[id].costs[depth][TRE_M_COST] = cost;
    439  1.1       agc 	  reach_next[id].costs[depth][TRE_M_NUM_INS]++;
    440  1.1       agc 	  reach_next[id].costs[depth][TRE_M_NUM_ERR]++;
    441  1.1       agc 	  if (depth > 0)
    442  1.1       agc 	    {
    443  1.1       agc 	      reach_next[id].costs[0][TRE_M_COST] = cost0;
    444  1.1       agc 	      reach_next[id].costs[0][TRE_M_NUM_INS]++;
    445  1.1       agc 	      reach_next[id].costs[0][TRE_M_NUM_ERR]++;
    446  1.1       agc 	    }
    447  1.1       agc 
    448  1.1       agc 	}
    449  1.1       agc 
    450  1.1       agc 
    451  1.1       agc       /* Handle deletes.  This is done by traversing through the whole TNFA
    452  1.1       agc 	 pretending that all transitions are epsilon transitions, until
    453  1.1       agc 	 no more states can be reached with better costs. */
    454  1.1       agc       {
    455  1.1       agc 	/* XXX - dynamic ringbuffer size */
    456  1.1       agc 	tre_tnfa_approx_reach_t *ringbuffer[512];
    457  1.1       agc 	tre_tnfa_approx_reach_t **deque_start, **deque_end;
    458  1.1       agc 
    459  1.1       agc 	deque_start = deque_end = ringbuffer;
    460  1.1       agc 
    461  1.1       agc 	/* Add all states in `reach_next' to the deque. */
    462  1.1       agc 	for (id = 0; id < tnfa->num_states; id++)
    463  1.1       agc 	  {
    464  1.1       agc 	    if (reach_next[id].pos != pos)
    465  1.1       agc 	      continue;
    466  1.1       agc 	    *deque_end = &reach_next[id];
    467  1.1       agc 	    deque_end++;
    468  1.1       agc 	    assert(deque_end != deque_start);
    469  1.1       agc 	  }
    470  1.1       agc 
    471  1.1       agc 	/* Repeat until the deque is empty. */
    472  1.1       agc 	while (deque_end != deque_start)
    473  1.1       agc 	  {
    474  1.1       agc 	    tre_tnfa_approx_reach_t *reach_p;
    475  1.1       agc 	    int depth;
    476  1.1       agc 	    int cost, cost0;
    477  1.1       agc 	    tre_tnfa_transition_t *trans;
    478  1.1       agc 
    479  1.1       agc 	    /* Pop the first item off the deque. */
    480  1.1       agc 	    reach_p = *deque_start;
    481  1.1       agc 	    id = reach_p - reach_next;
    482  1.1       agc 	    depth = reach_p->depth;
    483  1.1       agc 
    484  1.1       agc 	    /* Compute cost at current depth. */
    485  1.1       agc 	    cost = reach_p->costs[depth][TRE_M_COST];
    486  1.1       agc 	    if (reach_p->params.cost_del != TRE_PARAM_UNSET)
    487  1.1       agc 	      cost += reach_p->params.cost_del;
    488  1.1       agc 
    489  1.1       agc 	    /* Check cost, number of deletes, and total number of errors
    490  1.1       agc 	       at current depth. */
    491  1.1       agc 	    if (cost > reach_p->params.max_cost
    492  1.1       agc 		|| (reach_p->costs[depth][TRE_M_NUM_DEL] + 1
    493  1.1       agc 		    > reach_p->params.max_del)
    494  1.1       agc 		|| (reach_p->costs[depth][TRE_M_NUM_ERR] + 1
    495  1.1       agc 		    > reach_p->params.max_err))
    496  1.1       agc 	      {
    497  1.1       agc 		/* Too many errors or cost too large. */
    498  1.1       agc 		DPRINT(("  delete: from %03d: cost too large\n", id));
    499  1.1       agc 		deque_start++;
    500  1.1       agc 		if (deque_start >= (ringbuffer + 512))
    501  1.1       agc 		  deque_start = ringbuffer;
    502  1.1       agc 		continue;
    503  1.1       agc 	      }
    504  1.1       agc 
    505  1.1       agc 	    /* Compute overall cost. */
    506  1.1       agc 	    cost0 = cost;
    507  1.1       agc 	    if (depth > 0)
    508  1.1       agc 	      {
    509  1.1       agc 		cost0 = reach_p->costs[0][TRE_M_COST];
    510  1.1       agc 		if (reach_p->params.cost_del != TRE_PARAM_UNSET)
    511  1.1       agc 		  cost0 += reach_p->params.cost_del;
    512  1.1       agc 		else
    513  1.1       agc 		  cost0 += default_params.cost_del;
    514  1.1       agc 	      }
    515  1.1       agc 
    516  1.1       agc 	    for (trans = reach_p->state; trans->state; trans++)
    517  1.1       agc 	      {
    518  1.1       agc 		int dest_id = trans->state_id;
    519  1.1       agc 		DPRINT(("  delete: from %03d to %03d, cost %d (%d): ",
    520  1.1       agc 			id, dest_id, cost0, reach_p->params.max_cost));
    521  1.1       agc 
    522  1.1       agc 		if (trans->assertions && CHECK_ASSERTIONS(trans->assertions))
    523  1.1       agc 		  {
    524  1.1       agc 		    DPRINT(("assertion failed\n"));
    525  1.1       agc 		    continue;
    526  1.1       agc 		  }
    527  1.1       agc 
    528  1.1       agc 		/* Compute tag values after this transition. */
    529  1.1       agc 		for (i = 0; i < num_tags; i++)
    530  1.1       agc 		  tmp_tags[i] = reach_p->tags[i];
    531  1.1       agc 		if (trans->tags)
    532  1.1       agc 		  for (i = 0; trans->tags[i] >= 0; i++)
    533  1.2  christos 		    if ((size_t)trans->tags[i] < num_tags)
    534  1.1       agc 		      tmp_tags[trans->tags[i]] = pos;
    535  1.1       agc 
    536  1.1       agc 		/* If another path has also reached this state, choose the one
    537  1.1       agc 		   with the smallest cost or best tags if costs are equal. */
    538  1.1       agc 		if (reach_next[dest_id].pos == pos
    539  1.1       agc 		    && (cost0 > reach_next[dest_id].costs[0][TRE_M_COST]
    540  1.1       agc 			|| (cost0 == reach_next[dest_id].costs[0][TRE_M_COST]
    541  1.1       agc 			    && (!match_tags
    542  1.1       agc 				|| !tre_tag_order(num_tags,
    543  1.1       agc 						  tnfa->tag_directions,
    544  1.1       agc 						  tmp_tags,
    545  1.1       agc 						  reach_next[dest_id].tags)))))
    546  1.1       agc 		  {
    547  1.1       agc 		    DPRINT(("lose, cost0 %d, have %d\n",
    548  1.1       agc 			    cost0, reach_next[dest_id].costs[0][TRE_M_COST]));
    549  1.1       agc 		    continue;
    550  1.1       agc 		  }
    551  1.1       agc 		DPRINT(("win\n"));
    552  1.1       agc 
    553  1.1       agc 		/* Set state, position, tags, parameters, depth, and costs. */
    554  1.1       agc 		reach_next[dest_id].state = trans->state;
    555  1.1       agc 		reach_next[dest_id].pos = pos;
    556  1.1       agc 		for (i = 0; i < num_tags; i++)
    557  1.1       agc 		  reach_next[dest_id].tags[i] = tmp_tags[i];
    558  1.1       agc 
    559  1.1       agc 		reach_next[dest_id].params = reach_p->params;
    560  1.1       agc 		if (trans->params)
    561  1.1       agc 		  tre_set_params(&reach_next[dest_id], trans->params,
    562  1.1       agc 				 default_params);
    563  1.1       agc 
    564  1.1       agc 		reach_next[dest_id].depth = reach_p->depth;
    565  1.1       agc 		memcpy(&reach_next[dest_id].costs,
    566  1.1       agc 		       reach_p->costs,
    567  1.1       agc 		       sizeof(reach_p->costs[0][0])
    568  1.1       agc 		       * TRE_M_LAST * (depth + 1));
    569  1.1       agc 		reach_next[dest_id].costs[depth][TRE_M_COST] = cost;
    570  1.1       agc 		reach_next[dest_id].costs[depth][TRE_M_NUM_DEL]++;
    571  1.1       agc 		reach_next[dest_id].costs[depth][TRE_M_NUM_ERR]++;
    572  1.1       agc 		if (depth > 0)
    573  1.1       agc 		  {
    574  1.1       agc 		    reach_next[dest_id].costs[0][TRE_M_COST] = cost0;
    575  1.1       agc 		    reach_next[dest_id].costs[0][TRE_M_NUM_DEL]++;
    576  1.1       agc 		    reach_next[dest_id].costs[0][TRE_M_NUM_ERR]++;
    577  1.1       agc 		  }
    578  1.1       agc 
    579  1.1       agc 		if (trans->state == tnfa->final
    580  1.1       agc 		    && (match_eo < 0
    581  1.1       agc 			|| match_costs[TRE_M_COST] > cost0
    582  1.1       agc 			|| (match_costs[TRE_M_COST] == cost0
    583  1.1       agc 			    && (num_tags > 0
    584  1.1       agc 				&& tmp_tags[0] <= match_tags[0]))))
    585  1.1       agc 		  {
    586  1.1       agc 		    DPRINT(("	 setting new match at %d, cost %d\n",
    587  1.1       agc 			    pos, cost0));
    588  1.1       agc 		    match_eo = pos;
    589  1.1       agc 		    memcpy(match_costs, reach_next[dest_id].costs[0],
    590  1.1       agc 			   sizeof(match_costs[0]) * TRE_M_LAST);
    591  1.1       agc 		    for (i = 0; i < num_tags; i++)
    592  1.1       agc 		      match_tags[i] = tmp_tags[i];
    593  1.1       agc 		  }
    594  1.1       agc 
    595  1.1       agc 		/* Add to the end of the deque. */
    596  1.1       agc 		*deque_end = &reach_next[dest_id];
    597  1.1       agc 		deque_end++;
    598  1.1       agc 		if (deque_end >= (ringbuffer + 512))
    599  1.1       agc 		  deque_end = ringbuffer;
    600  1.1       agc 		assert(deque_end != deque_start);
    601  1.1       agc 	      }
    602  1.1       agc 	    deque_start++;
    603  1.1       agc 	    if (deque_start >= (ringbuffer + 512))
    604  1.1       agc 	      deque_start = ringbuffer;
    605  1.1       agc 	  }
    606  1.1       agc 
    607  1.1       agc       }
    608  1.1       agc 
    609  1.1       agc #ifdef TRE_DEBUG
    610  1.1       agc       tre_print_reach(tnfa, reach_next, pos, num_tags);
    611  1.1       agc #endif /* TRE_DEBUG */
    612  1.1       agc 
    613  1.1       agc       /* Check for end of string. */
    614  1.1       agc       if (len < 0)
    615  1.1       agc 	{
    616  1.1       agc 	  if (type == STR_USER)
    617  1.1       agc 	    {
    618  1.1       agc 	      if (str_user_end)
    619  1.1       agc 		break;
    620  1.1       agc 	    }
    621  1.1       agc 	  else if (next_c == L'\0')
    622  1.1       agc 	    break;
    623  1.1       agc 	}
    624  1.1       agc       else
    625  1.1       agc 	{
    626  1.1       agc 	  if (pos >= len)
    627  1.1       agc 	    break;
    628  1.1       agc 	}
    629  1.1       agc 
    630  1.1       agc       prev_pos = pos;
    631  1.1       agc       GET_NEXT_WCHAR();
    632  1.1       agc 
    633  1.1       agc       /* Swap `reach' and `reach_next'. */
    634  1.1       agc       {
    635  1.1       agc 	tre_tnfa_approx_reach_t *tmp;
    636  1.1       agc 	tmp = reach;
    637  1.1       agc 	reach = reach_next;
    638  1.1       agc 	reach_next = tmp;
    639  1.1       agc       }
    640  1.1       agc 
    641  1.1       agc       /* Handle exact matches and substitutions. */
    642  1.1       agc       for (id = 0; id < tnfa->num_states; id++)
    643  1.1       agc 	{
    644  1.1       agc 	  tre_tnfa_transition_t *trans;
    645  1.1       agc 
    646  1.1       agc 	  if (reach[id].pos < prev_pos)
    647  1.1       agc 	    continue;  /* Not reached. */
    648  1.1       agc 	  for (trans = reach[id].state; trans->state; trans++)
    649  1.1       agc 	    {
    650  1.1       agc 	      int dest_id;
    651  1.1       agc 	      int depth;
    652  1.1       agc 	      int cost, cost0, err;
    653  1.1       agc 
    654  1.1       agc 	      if (trans->assertions
    655  1.1       agc 		  && (CHECK_ASSERTIONS(trans->assertions)
    656  1.1       agc 		      || CHECK_CHAR_CLASSES(trans, tnfa, eflags)))
    657  1.1       agc 		{
    658  1.1       agc 		  DPRINT(("  exact,  from %d: assert failed\n", id));
    659  1.1       agc 		  continue;
    660  1.1       agc 		}
    661  1.1       agc 
    662  1.1       agc 	      depth = reach[id].depth;
    663  1.1       agc 	      dest_id = trans->state_id;
    664  1.1       agc 
    665  1.1       agc 	      cost = reach[id].costs[depth][TRE_M_COST];
    666  1.1       agc 	      cost0 = reach[id].costs[0][TRE_M_COST];
    667  1.1       agc 	      err = 0;
    668  1.1       agc 
    669  1.1       agc 	      if (trans->code_min > (tre_cint_t)prev_c
    670  1.1       agc 		  || trans->code_max < (tre_cint_t)prev_c)
    671  1.1       agc 		{
    672  1.1       agc 		  /* Handle substitutes.  The required character was not in
    673  1.1       agc 		     the string, so match it in place of whatever was supposed
    674  1.1       agc 		     to be there and increase costs accordingly. */
    675  1.1       agc 		  err = 1;
    676  1.1       agc 
    677  1.1       agc 		  /* Compute and check cost at current depth. */
    678  1.1       agc 		  cost = reach[id].costs[depth][TRE_M_COST];
    679  1.1       agc 		  if (reach[id].params.cost_subst != TRE_PARAM_UNSET)
    680  1.1       agc 		    cost += reach[id].params.cost_subst;
    681  1.1       agc 		  if (cost > reach[id].params.max_cost)
    682  1.1       agc 		    continue; /* Cost too large. */
    683  1.1       agc 
    684  1.1       agc 		  /* Check number of substitutes at current depth. */
    685  1.1       agc 		  if (reach[id].costs[depth][TRE_M_NUM_SUBST] + 1
    686  1.1       agc 		      > reach[id].params.max_subst)
    687  1.1       agc 		    continue; /* Too many substitutes. */
    688  1.1       agc 
    689  1.1       agc 		  /* Check total number of errors at current depth. */
    690  1.1       agc 		  if (reach[id].costs[depth][TRE_M_NUM_ERR] + 1
    691  1.1       agc 		      > reach[id].params.max_err)
    692  1.1       agc 		    continue; /* Too many errors. */
    693  1.1       agc 
    694  1.1       agc 		  /* Compute overall cost. */
    695  1.1       agc 		  cost0 = cost;
    696  1.1       agc 		  if (depth > 0)
    697  1.1       agc 		    {
    698  1.1       agc 		      cost0 = reach[id].costs[0][TRE_M_COST];
    699  1.1       agc 		      if (reach[id].params.cost_subst != TRE_PARAM_UNSET)
    700  1.1       agc 			cost0 += reach[id].params.cost_subst;
    701  1.1       agc 		      else
    702  1.1       agc 			cost0 += default_params.cost_subst;
    703  1.1       agc 		    }
    704  1.1       agc 		  DPRINT(("  subst,  from %03d to %03d, cost %d: ",
    705  1.1       agc 			  id, dest_id, cost0));
    706  1.1       agc 		}
    707  1.1       agc 	      else
    708  1.1       agc 		DPRINT(("  exact,  from %03d to %03d, cost %d: ",
    709  1.1       agc 			id, dest_id, cost0));
    710  1.1       agc 
    711  1.1       agc 	      /* Compute tag values after this transition. */
    712  1.1       agc 	      for (i = 0; i < num_tags; i++)
    713  1.1       agc 		tmp_tags[i] = reach[id].tags[i];
    714  1.1       agc 	      if (trans->tags)
    715  1.1       agc 		for (i = 0; trans->tags[i] >= 0; i++)
    716  1.2  christos 		  if ((size_t)trans->tags[i] < num_tags)
    717  1.1       agc 		    tmp_tags[trans->tags[i]] = pos;
    718  1.1       agc 
    719  1.1       agc 	      /* If another path has also reached this state, choose the
    720  1.1       agc 		 one with the smallest cost or best tags if costs are equal. */
    721  1.1       agc 	      if (reach_next[dest_id].pos == pos
    722  1.1       agc 		  && (cost0 > reach_next[dest_id].costs[0][TRE_M_COST]
    723  1.1       agc 		      || (cost0 == reach_next[dest_id].costs[0][TRE_M_COST]
    724  1.1       agc 			  && !tre_tag_order(num_tags, tnfa->tag_directions,
    725  1.1       agc 					    tmp_tags,
    726  1.1       agc 					    reach_next[dest_id].tags))))
    727  1.1       agc 		{
    728  1.1       agc 		  DPRINT(("lose\n"));
    729  1.1       agc 		  continue;
    730  1.1       agc 		}
    731  1.1       agc 	      DPRINT(("win %d %d\n",
    732  1.1       agc 		      reach_next[dest_id].pos,
    733  1.1       agc 		      reach_next[dest_id].costs[0][TRE_M_COST]));
    734  1.1       agc 
    735  1.1       agc 	      /* Set state, position, tags, and depth. */
    736  1.1       agc 	      reach_next[dest_id].state = trans->state;
    737  1.1       agc 	      reach_next[dest_id].pos = pos;
    738  1.1       agc 	      for (i = 0; i < num_tags; i++)
    739  1.1       agc 		reach_next[dest_id].tags[i] = tmp_tags[i];
    740  1.1       agc 	      reach_next[dest_id].depth = reach[id].depth;
    741  1.1       agc 
    742  1.1       agc 	      /* Set parameters. */
    743  1.1       agc 	      reach_next[dest_id].params = reach[id].params;
    744  1.1       agc 	      if (trans->params)
    745  1.1       agc 		tre_set_params(&reach_next[dest_id], trans->params,
    746  1.1       agc 			       default_params);
    747  1.1       agc 
    748  1.1       agc 	      /* Set the costs after this transition. */
    749  1.1       agc 		memcpy(&reach_next[dest_id].costs,
    750  1.1       agc 		       reach[id].costs,
    751  1.1       agc 		       sizeof(reach[id].costs[0][0])
    752  1.1       agc 		       * TRE_M_LAST * (depth + 1));
    753  1.1       agc 	      reach_next[dest_id].costs[depth][TRE_M_COST] = cost;
    754  1.1       agc 	      reach_next[dest_id].costs[depth][TRE_M_NUM_SUBST] += err;
    755  1.1       agc 	      reach_next[dest_id].costs[depth][TRE_M_NUM_ERR] += err;
    756  1.1       agc 	      if (depth > 0)
    757  1.1       agc 		{
    758  1.1       agc 		  reach_next[dest_id].costs[0][TRE_M_COST] = cost0;
    759  1.1       agc 		  reach_next[dest_id].costs[0][TRE_M_NUM_SUBST] += err;
    760  1.1       agc 		  reach_next[dest_id].costs[0][TRE_M_NUM_ERR] += err;
    761  1.1       agc 		}
    762  1.1       agc 
    763  1.1       agc 	      if (trans->state == tnfa->final
    764  1.1       agc 		  && (match_eo < 0
    765  1.1       agc 		      || cost0 < match_costs[TRE_M_COST]
    766  1.1       agc 		      || (cost0 == match_costs[TRE_M_COST]
    767  1.1       agc 			  && num_tags > 0 && tmp_tags[0] <= match_tags[0])))
    768  1.1       agc 		{
    769  1.1       agc 		  DPRINT(("    setting new match at %d, cost %d\n",
    770  1.1       agc 			  pos, cost0));
    771  1.1       agc 		  match_eo = pos;
    772  1.1       agc 		  for (i = 0; i < TRE_M_LAST; i++)
    773  1.1       agc 		    match_costs[i] = reach_next[dest_id].costs[0][i];
    774  1.1       agc 		  for (i = 0; i < num_tags; i++)
    775  1.1       agc 		    match_tags[i] = tmp_tags[i];
    776  1.1       agc 		}
    777  1.1       agc 	    }
    778  1.1       agc 	}
    779  1.1       agc     }
    780  1.1       agc 
    781  1.1       agc   DPRINT(("match end offset = %d, match cost = %d\n", match_eo,
    782  1.1       agc 	  match_costs[TRE_M_COST]));
    783  1.1       agc 
    784  1.1       agc #ifndef TRE_USE_ALLOCA
    785  1.1       agc   if (buf)
    786  1.1       agc     xfree(buf);
    787  1.1       agc #endif /* !TRE_USE_ALLOCA */
    788  1.1       agc 
    789  1.1       agc   match->cost = match_costs[TRE_M_COST];
    790  1.1       agc   match->num_ins = match_costs[TRE_M_NUM_INS];
    791  1.1       agc   match->num_del = match_costs[TRE_M_NUM_DEL];
    792  1.1       agc   match->num_subst = match_costs[TRE_M_NUM_SUBST];
    793  1.1       agc   *match_end_ofs = match_eo;
    794  1.1       agc 
    795  1.1       agc   return match_eo >= 0 ? REG_OK : REG_NOMATCH;
    796  1.1       agc }
    797