Home | History | Annotate | Line # | Download | only in lib
tre-match-approx.c revision 1.1
      1  1.1  agc /*
      2  1.1  agc   tre-match-approx.c - TRE approximate regex matching engine
      3  1.1  agc 
      4  1.1  agc   This software is released under a BSD-style license.
      5  1.1  agc   See the file LICENSE for details and copyright.
      6  1.1  agc 
      7  1.1  agc */
      8  1.1  agc 
      9  1.1  agc #ifdef HAVE_CONFIG_H
     10  1.1  agc #include <config.h>
     11  1.1  agc #endif /* HAVE_CONFIG_H */
     12  1.1  agc 
     13  1.1  agc /* AIX requires this to be the first thing in the file.	 */
     14  1.1  agc #ifdef TRE_USE_ALLOCA
     15  1.1  agc #ifndef __GNUC__
     16  1.1  agc # if HAVE_ALLOCA_H
     17  1.1  agc #  include <alloca.h>
     18  1.1  agc # else
     19  1.1  agc #  ifdef _AIX
     20  1.1  agc  #pragma alloca
     21  1.1  agc #  else
     22  1.1  agc #   ifndef alloca /* predefined by HP cc +Olibcalls */
     23  1.1  agc char *alloca ();
     24  1.1  agc #   endif
     25  1.1  agc #  endif
     26  1.1  agc # endif
     27  1.1  agc #endif
     28  1.1  agc #endif /* TRE_USE_ALLOCA */
     29  1.1  agc 
     30  1.1  agc #define __USE_STRING_INLINES
     31  1.1  agc #undef __NO_INLINE__
     32  1.1  agc 
     33  1.1  agc #include <assert.h>
     34  1.1  agc #include <stdlib.h>
     35  1.1  agc #include <string.h>
     36  1.1  agc #include <limits.h>
     37  1.1  agc #ifdef HAVE_WCHAR_H
     38  1.1  agc #include <wchar.h>
     39  1.1  agc #endif /* HAVE_WCHAR_H */
     40  1.1  agc #ifdef HAVE_WCTYPE_H
     41  1.1  agc #include <wctype.h>
     42  1.1  agc #endif /* HAVE_WCTYPE_H */
     43  1.1  agc #ifndef TRE_WCHAR
     44  1.1  agc #include <ctype.h>
     45  1.1  agc #endif /* !TRE_WCHAR */
     46  1.1  agc #ifdef HAVE_MALLOC_H
     47  1.1  agc #include <malloc.h>
     48  1.1  agc #endif /* HAVE_MALLOC_H */
     49  1.1  agc 
     50  1.1  agc #include "tre-internal.h"
     51  1.1  agc #include "tre-match-utils.h"
     52  1.1  agc #include "tre.h"
     53  1.1  agc #include "xmalloc.h"
     54  1.1  agc 
     55  1.1  agc #define TRE_M_COST	0
     56  1.1  agc #define TRE_M_NUM_INS	1
     57  1.1  agc #define TRE_M_NUM_DEL	2
     58  1.1  agc #define TRE_M_NUM_SUBST 3
     59  1.1  agc #define TRE_M_NUM_ERR	4
     60  1.1  agc #define TRE_M_LAST	5
     61  1.1  agc 
     62  1.1  agc #define TRE_M_MAX_DEPTH 3
     63  1.1  agc 
     64  1.1  agc typedef struct {
     65  1.1  agc   /* State in the TNFA transition table. */
     66  1.1  agc   tre_tnfa_transition_t *state;
     67  1.1  agc   /* Position in input string. */
     68  1.1  agc   int pos;
     69  1.1  agc   /* Tag values. */
     70  1.1  agc   int *tags;
     71  1.1  agc   /* Matching parameters. */
     72  1.1  agc   regaparams_t params;
     73  1.1  agc   /* Nesting depth of parameters.  This is used as an index in
     74  1.1  agc      the `costs' array. */
     75  1.1  agc   int depth;
     76  1.1  agc   /* Costs and counter values for different parameter nesting depths. */
     77  1.1  agc   int costs[TRE_M_MAX_DEPTH + 1][TRE_M_LAST];
     78  1.1  agc } tre_tnfa_approx_reach_t;
     79  1.1  agc 
     80  1.1  agc 
     81  1.1  agc #ifdef TRE_DEBUG
     82  1.1  agc /* Prints the `reach' array in a readable fashion with DPRINT. */
     83  1.1  agc static void
     84  1.1  agc tre_print_reach(const tre_tnfa_t *tnfa, tre_tnfa_approx_reach_t *reach,
     85  1.1  agc 		int pos, int num_tags)
     86  1.1  agc {
     87  1.1  agc   int id;
     88  1.1  agc 
     89  1.1  agc   /* Print each state on one line. */
     90  1.1  agc   DPRINT(("  reach:\n"));
     91  1.1  agc   for (id = 0; id < tnfa->num_states; id++)
     92  1.1  agc     {
     93  1.1  agc       int i, j;
     94  1.1  agc       if (reach[id].pos < pos)
     95  1.1  agc 	continue;  /* Not reached. */
     96  1.1  agc       DPRINT(("	 %03d, costs ", id));
     97  1.1  agc       for (i = 0; i <= reach[id].depth; i++)
     98  1.1  agc 	{
     99  1.1  agc 	  DPRINT(("["));
    100  1.1  agc 	  for (j = 0; j < TRE_M_LAST; j++)
    101  1.1  agc 	    {
    102  1.1  agc 	      DPRINT(("%2d", reach[id].costs[i][j]));
    103  1.1  agc 	      if (j + 1 < TRE_M_LAST)
    104  1.1  agc 		DPRINT((","));
    105  1.1  agc 	    }
    106  1.1  agc 	  DPRINT(("]"));
    107  1.1  agc 	  if (i + 1 <= reach[id].depth)
    108  1.1  agc 	    DPRINT((", "));
    109  1.1  agc 	}
    110  1.1  agc       DPRINT(("\n	tags "));
    111  1.1  agc       for (i = 0; i < num_tags; i++)
    112  1.1  agc 	{
    113  1.1  agc 	  DPRINT(("%02d", reach[id].tags[i]));
    114  1.1  agc 	  if (i + 1 < num_tags)
    115  1.1  agc 	    DPRINT((","));
    116  1.1  agc 	}
    117  1.1  agc       DPRINT(("\n"));
    118  1.1  agc     }
    119  1.1  agc   DPRINT(("\n"));
    120  1.1  agc }
    121  1.1  agc #endif /* TRE_DEBUG */
    122  1.1  agc 
    123  1.1  agc 
    124  1.1  agc /* Sets the matching parameters in `reach' to the ones defined in the `pa'
    125  1.1  agc    array.  If `pa' specifies default values, they are taken from
    126  1.1  agc    `default_params'. */
    127  1.1  agc inline static void
    128  1.1  agc tre_set_params(tre_tnfa_approx_reach_t *reach,
    129  1.1  agc 	       int *pa, regaparams_t default_params)
    130  1.1  agc {
    131  1.1  agc   int value;
    132  1.1  agc 
    133  1.1  agc   /* If depth is increased reset costs and counters to zero for the
    134  1.1  agc      new levels. */
    135  1.1  agc   value = pa[TRE_PARAM_DEPTH];
    136  1.1  agc   assert(value <= TRE_M_MAX_DEPTH);
    137  1.1  agc   if (value > reach->depth)
    138  1.1  agc     {
    139  1.1  agc       int i, j;
    140  1.1  agc       for (i = reach->depth + 1; i <= value; i++)
    141  1.1  agc 	for (j = 0; j < TRE_M_LAST; j++)
    142  1.1  agc 	  reach->costs[i][j] = 0;
    143  1.1  agc     }
    144  1.1  agc   reach->depth = value;
    145  1.1  agc 
    146  1.1  agc   /* Set insert cost. */
    147  1.1  agc   value = pa[TRE_PARAM_COST_INS];
    148  1.1  agc   if (value == TRE_PARAM_DEFAULT)
    149  1.1  agc     reach->params.cost_ins = default_params.cost_ins;
    150  1.1  agc   else if (value != TRE_PARAM_UNSET)
    151  1.1  agc     reach->params.cost_ins = value;
    152  1.1  agc 
    153  1.1  agc   /* Set delete cost. */
    154  1.1  agc   value = pa[TRE_PARAM_COST_DEL];
    155  1.1  agc   if (value == TRE_PARAM_DEFAULT)
    156  1.1  agc     reach->params.cost_del = default_params.cost_del;
    157  1.1  agc   else if (value != TRE_PARAM_UNSET)
    158  1.1  agc     reach->params.cost_del = value;
    159  1.1  agc 
    160  1.1  agc   /* Set substitute cost. */
    161  1.1  agc   value = pa[TRE_PARAM_COST_SUBST];
    162  1.1  agc   if (value == TRE_PARAM_DEFAULT)
    163  1.1  agc     reach->params.cost_subst = default_params.cost_subst;
    164  1.1  agc   else
    165  1.1  agc     reach->params.cost_subst = value;
    166  1.1  agc 
    167  1.1  agc   /* Set maximum cost. */
    168  1.1  agc   value = pa[TRE_PARAM_COST_MAX];
    169  1.1  agc   if (value == TRE_PARAM_DEFAULT)
    170  1.1  agc     reach->params.max_cost = default_params.max_cost;
    171  1.1  agc   else if (value != TRE_PARAM_UNSET)
    172  1.1  agc     reach->params.max_cost = value;
    173  1.1  agc 
    174  1.1  agc   /* Set maximum inserts. */
    175  1.1  agc   value = pa[TRE_PARAM_MAX_INS];
    176  1.1  agc   if (value == TRE_PARAM_DEFAULT)
    177  1.1  agc     reach->params.max_ins = default_params.max_ins;
    178  1.1  agc   else if (value != TRE_PARAM_UNSET)
    179  1.1  agc     reach->params.max_ins = value;
    180  1.1  agc 
    181  1.1  agc   /* Set maximum deletes. */
    182  1.1  agc   value = pa[TRE_PARAM_MAX_DEL];
    183  1.1  agc   if (value == TRE_PARAM_DEFAULT)
    184  1.1  agc     reach->params.max_del = default_params.max_del;
    185  1.1  agc   else if (value != TRE_PARAM_UNSET)
    186  1.1  agc     reach->params.max_del = value;
    187  1.1  agc 
    188  1.1  agc   /* Set maximum substitutes. */
    189  1.1  agc   value = pa[TRE_PARAM_MAX_SUBST];
    190  1.1  agc   if (value == TRE_PARAM_DEFAULT)
    191  1.1  agc     reach->params.max_subst = default_params.max_subst;
    192  1.1  agc   else if (value != TRE_PARAM_UNSET)
    193  1.1  agc     reach->params.max_subst = value;
    194  1.1  agc 
    195  1.1  agc   /* Set maximum number of errors. */
    196  1.1  agc   value = pa[TRE_PARAM_MAX_ERR];
    197  1.1  agc   if (value == TRE_PARAM_DEFAULT)
    198  1.1  agc     reach->params.max_err = default_params.max_err;
    199  1.1  agc   else if (value != TRE_PARAM_UNSET)
    200  1.1  agc     reach->params.max_err = value;
    201  1.1  agc }
    202  1.1  agc 
    203  1.1  agc reg_errcode_t
    204  1.1  agc tre_tnfa_run_approx(const tre_tnfa_t *tnfa, const void *string, int len,
    205  1.1  agc 		    tre_str_type_t type, int *match_tags,
    206  1.1  agc 		    regamatch_t *match, regaparams_t default_params,
    207  1.1  agc 		    int eflags, int *match_end_ofs)
    208  1.1  agc {
    209  1.1  agc   /* State variables required by GET_NEXT_WCHAR. */
    210  1.1  agc   tre_char_t prev_c = 0, next_c = 0;
    211  1.1  agc   const char *str_byte = string;
    212  1.1  agc   int pos = -1;
    213  1.1  agc   unsigned int pos_add_next = 1;
    214  1.1  agc #ifdef TRE_WCHAR
    215  1.1  agc   const wchar_t *str_wide = string;
    216  1.1  agc #ifdef TRE_MBSTATE
    217  1.1  agc   mbstate_t mbstate;
    218  1.1  agc #endif /* !TRE_WCHAR */
    219  1.1  agc #endif /* TRE_WCHAR */
    220  1.1  agc   int reg_notbol = eflags & REG_NOTBOL;
    221  1.1  agc   int reg_noteol = eflags & REG_NOTEOL;
    222  1.1  agc   int reg_newline = tnfa->cflags & REG_NEWLINE;
    223  1.1  agc   int str_user_end = 0;
    224  1.1  agc 
    225  1.1  agc   int prev_pos;
    226  1.1  agc 
    227  1.1  agc   /* Number of tags. */
    228  1.1  agc   int num_tags;
    229  1.1  agc   /* The reach tables. */
    230  1.1  agc   tre_tnfa_approx_reach_t *reach, *reach_next;
    231  1.1  agc   /* Tag array for temporary use. */
    232  1.1  agc   int *tmp_tags;
    233  1.1  agc 
    234  1.1  agc   /* End offset of best match so far, or -1 if no match found yet. */
    235  1.1  agc   int match_eo = -1;
    236  1.1  agc   /* Costs of the match. */
    237  1.1  agc   int match_costs[TRE_M_LAST];
    238  1.1  agc 
    239  1.1  agc   /* Space for temporary data required for matching. */
    240  1.1  agc   unsigned char *buf;
    241  1.1  agc 
    242  1.1  agc   int i, id;
    243  1.1  agc 
    244  1.1  agc   if (!match_tags)
    245  1.1  agc     num_tags = 0;
    246  1.1  agc   else
    247  1.1  agc     num_tags = tnfa->num_tags;
    248  1.1  agc 
    249  1.1  agc #ifdef TRE_MBSTATE
    250  1.1  agc   memset(&mbstate, '\0', sizeof(mbstate));
    251  1.1  agc #endif /* TRE_MBSTATE */
    252  1.1  agc 
    253  1.1  agc   DPRINT(("tre_tnfa_run_approx, input type %d, len %d, eflags %d, "
    254  1.1  agc 	  "match_tags %p\n",
    255  1.1  agc 	  type, len, eflags,
    256  1.1  agc 	  match_tags));
    257  1.1  agc   DPRINT(("max cost %d, ins %d, del %d, subst %d\n",
    258  1.1  agc 	  default_params.max_cost,
    259  1.1  agc 	  default_params.cost_ins,
    260  1.1  agc 	  default_params.cost_del,
    261  1.1  agc 	  default_params.cost_subst));
    262  1.1  agc 
    263  1.1  agc   /* Allocate memory for temporary data required for matching.	This needs to
    264  1.1  agc      be done for every matching operation to be thread safe.  This allocates
    265  1.1  agc      everything in a single large block from the stack frame using alloca()
    266  1.1  agc      or with malloc() if alloca is unavailable. */
    267  1.1  agc   {
    268  1.1  agc     unsigned char *buf_cursor;
    269  1.1  agc     /* Space needed for one array of tags. */
    270  1.1  agc     int tag_bytes = sizeof(*tmp_tags) * num_tags;
    271  1.1  agc     /* Space needed for one reach table. */
    272  1.1  agc     int reach_bytes = sizeof(*reach_next) * tnfa->num_states;
    273  1.1  agc     /* Total space needed. */
    274  1.1  agc     int total_bytes = reach_bytes * 2 + (tnfa->num_states * 2 + 1 ) * tag_bytes;
    275  1.1  agc     /* Add some extra to make sure we can align the pointers.  The multiplier
    276  1.1  agc        used here must be equal to the number of ALIGN calls below. */
    277  1.1  agc     total_bytes += (sizeof(long) - 1) * 3;
    278  1.1  agc 
    279  1.1  agc     /* Allocate the memory. */
    280  1.1  agc #ifdef TRE_USE_ALLOCA
    281  1.1  agc     buf = alloca(total_bytes);
    282  1.1  agc #else /* !TRE_USE_ALLOCA */
    283  1.1  agc     buf = xmalloc((unsigned)total_bytes);
    284  1.1  agc #endif /* !TRE_USE_ALLOCA */
    285  1.1  agc     if (!buf)
    286  1.1  agc       return REG_ESPACE;
    287  1.1  agc     memset(buf, 0, (size_t)total_bytes);
    288  1.1  agc 
    289  1.1  agc     /* Allocate `tmp_tags' from `buf'. */
    290  1.1  agc     tmp_tags = (void *)buf;
    291  1.1  agc     buf_cursor = buf + tag_bytes;
    292  1.1  agc     buf_cursor += ALIGN(buf_cursor, long);
    293  1.1  agc 
    294  1.1  agc     /* Allocate `reach' from `buf'. */
    295  1.1  agc     reach = (void *)buf_cursor;
    296  1.1  agc     buf_cursor += reach_bytes;
    297  1.1  agc     buf_cursor += ALIGN(buf_cursor, long);
    298  1.1  agc 
    299  1.1  agc     /* Allocate `reach_next' from `buf'. */
    300  1.1  agc     reach_next = (void *)buf_cursor;
    301  1.1  agc     buf_cursor += reach_bytes;
    302  1.1  agc     buf_cursor += ALIGN(buf_cursor, long);
    303  1.1  agc 
    304  1.1  agc     /* Allocate tag arrays for `reach' and `reach_next' from `buf'. */
    305  1.1  agc     for (i = 0; i < tnfa->num_states; i++)
    306  1.1  agc       {
    307  1.1  agc 	reach[i].tags = (void *)buf_cursor;
    308  1.1  agc 	buf_cursor += tag_bytes;
    309  1.1  agc 	reach_next[i].tags = (void *)buf_cursor;
    310  1.1  agc 	buf_cursor += tag_bytes;
    311  1.1  agc       }
    312  1.1  agc     assert(buf_cursor <= buf + total_bytes);
    313  1.1  agc   }
    314  1.1  agc 
    315  1.1  agc   for (i = 0; i < TRE_M_LAST; i++)
    316  1.1  agc     match_costs[i] = INT_MAX;
    317  1.1  agc 
    318  1.1  agc   /* Mark the reach arrays empty. */
    319  1.1  agc   for (i = 0; i < tnfa->num_states; i++)
    320  1.1  agc     reach[i].pos = reach_next[i].pos = -2;
    321  1.1  agc 
    322  1.1  agc   prev_pos = pos;
    323  1.1  agc   GET_NEXT_WCHAR();
    324  1.1  agc   pos = 0;
    325  1.1  agc 
    326  1.1  agc   while (/*CONSTCOND*/1)
    327  1.1  agc     {
    328  1.1  agc       DPRINT(("%03d:%2lc/%05d\n", pos, (tre_cint_t)next_c, (int)next_c));
    329  1.1  agc 
    330  1.1  agc       /* Add initial states to `reach_next' if an exact match has not yet
    331  1.1  agc 	 been found. */
    332  1.1  agc       if (match_costs[TRE_M_COST] > 0)
    333  1.1  agc 	{
    334  1.1  agc 	  tre_tnfa_transition_t *trans;
    335  1.1  agc 	  DPRINT(("  init"));
    336  1.1  agc 	  for (trans = tnfa->initial; trans->state; trans++)
    337  1.1  agc 	    {
    338  1.1  agc 	      int stateid = trans->state_id;
    339  1.1  agc 
    340  1.1  agc 	      /* If this state is not currently in `reach_next', add it
    341  1.1  agc 		 there. */
    342  1.1  agc 	      if (reach_next[stateid].pos < pos)
    343  1.1  agc 		{
    344  1.1  agc 		  if (trans->assertions && CHECK_ASSERTIONS(trans->assertions))
    345  1.1  agc 		    {
    346  1.1  agc 		      /* Assertions failed, don't add this state. */
    347  1.1  agc 		      DPRINT((" !%d (assert)", stateid));
    348  1.1  agc 		      continue;
    349  1.1  agc 		    }
    350  1.1  agc 		  DPRINT((" %d", stateid));
    351  1.1  agc 		  reach_next[stateid].state = trans->state;
    352  1.1  agc 		  reach_next[stateid].pos = pos;
    353  1.1  agc 
    354  1.1  agc 		  /* Compute tag values after this transition. */
    355  1.1  agc 		  for (i = 0; i < num_tags; i++)
    356  1.1  agc 		    reach_next[stateid].tags[i] = -1;
    357  1.1  agc 
    358  1.1  agc 		  if (trans->tags)
    359  1.1  agc 		    for (i = 0; trans->tags[i] >= 0; i++)
    360  1.1  agc 		      if (trans->tags[i] < num_tags)
    361  1.1  agc 			reach_next[stateid].tags[trans->tags[i]] = pos;
    362  1.1  agc 
    363  1.1  agc 		  /* Set the parameters, depth, and costs. */
    364  1.1  agc 		  reach_next[stateid].params = default_params;
    365  1.1  agc 		  reach_next[stateid].depth = 0;
    366  1.1  agc 		  for (i = 0; i < TRE_M_LAST; i++)
    367  1.1  agc 		    reach_next[stateid].costs[0][i] = 0;
    368  1.1  agc 		  if (trans->params)
    369  1.1  agc 		    tre_set_params(&reach_next[stateid], trans->params,
    370  1.1  agc 				   default_params);
    371  1.1  agc 
    372  1.1  agc 		  /* If this is the final state, mark the exact match. */
    373  1.1  agc 		  if (trans->state == tnfa->final)
    374  1.1  agc 		    {
    375  1.1  agc 		      match_eo = pos;
    376  1.1  agc 		      for (i = 0; i < num_tags; i++)
    377  1.1  agc 			match_tags[i] = reach_next[stateid].tags[i];
    378  1.1  agc 		      for (i = 0; i < TRE_M_LAST; i++)
    379  1.1  agc 			match_costs[i] = 0;
    380  1.1  agc 		    }
    381  1.1  agc 		}
    382  1.1  agc 	    }
    383  1.1  agc 	    DPRINT(("\n"));
    384  1.1  agc 	}
    385  1.1  agc 
    386  1.1  agc 
    387  1.1  agc       /* Handle inserts.  This is done by pretending there's an epsilon
    388  1.1  agc 	 transition from each state in `reach' back to the same state.
    389  1.1  agc 	 We don't need to worry about the final state here; this will never
    390  1.1  agc 	 give a better match than what we already have. */
    391  1.1  agc       for (id = 0; id < tnfa->num_states; id++)
    392  1.1  agc 	{
    393  1.1  agc 	  int depth;
    394  1.1  agc 	  int cost, cost0;
    395  1.1  agc 
    396  1.1  agc 	  if (reach[id].pos != prev_pos)
    397  1.1  agc 	    {
    398  1.1  agc 	      DPRINT(("	 insert: %d not reached\n", id));
    399  1.1  agc 	      continue;	 /* Not reached. */
    400  1.1  agc 	    }
    401  1.1  agc 
    402  1.1  agc 	  depth = reach[id].depth;
    403  1.1  agc 
    404  1.1  agc 	  /* Compute and check cost at current depth. */
    405  1.1  agc 	  cost = reach[id].costs[depth][TRE_M_COST];
    406  1.1  agc 	  if (reach[id].params.cost_ins != TRE_PARAM_UNSET)
    407  1.1  agc 	    cost += reach[id].params.cost_ins;
    408  1.1  agc 	  if (cost > reach[id].params.max_cost)
    409  1.1  agc 	    continue;  /* Cost too large. */
    410  1.1  agc 
    411  1.1  agc 	  /* Check number of inserts at current depth. */
    412  1.1  agc 	  if (reach[id].costs[depth][TRE_M_NUM_INS] + 1
    413  1.1  agc 	      > reach[id].params.max_ins)
    414  1.1  agc 	    continue;  /* Too many inserts. */
    415  1.1  agc 
    416  1.1  agc 	  /* Check total number of errors at current depth. */
    417  1.1  agc 	  if (reach[id].costs[depth][TRE_M_NUM_ERR] + 1
    418  1.1  agc 	      > reach[id].params.max_err)
    419  1.1  agc 	    continue;  /* Too many errors. */
    420  1.1  agc 
    421  1.1  agc 	  /* Compute overall cost. */
    422  1.1  agc 	  cost0 = cost;
    423  1.1  agc 	  if (depth > 0)
    424  1.1  agc 	    {
    425  1.1  agc 	      cost0 = reach[id].costs[0][TRE_M_COST];
    426  1.1  agc 	      if (reach[id].params.cost_ins != TRE_PARAM_UNSET)
    427  1.1  agc 		cost0 += reach[id].params.cost_ins;
    428  1.1  agc 	      else
    429  1.1  agc 		cost0 += default_params.cost_ins;
    430  1.1  agc 	    }
    431  1.1  agc 
    432  1.1  agc 	  DPRINT(("  insert: from %d to %d, cost %d: ", id, id,
    433  1.1  agc 		  reach[id].costs[depth][TRE_M_COST]));
    434  1.1  agc 	  if (reach_next[id].pos == pos
    435  1.1  agc 	      && (cost0 >= reach_next[id].costs[0][TRE_M_COST]))
    436  1.1  agc 	    {
    437  1.1  agc 	      DPRINT(("lose\n"));
    438  1.1  agc 	      continue;
    439  1.1  agc 	    }
    440  1.1  agc 	  DPRINT(("win\n"));
    441  1.1  agc 
    442  1.1  agc 	  /* Copy state, position, tags, parameters, and depth. */
    443  1.1  agc 	  reach_next[id].state = reach[id].state;
    444  1.1  agc 	  reach_next[id].pos = pos;
    445  1.1  agc 	  for (i = 0; i < num_tags; i++)
    446  1.1  agc 	    reach_next[id].tags[i] = reach[id].tags[i];
    447  1.1  agc 	  reach_next[id].params = reach[id].params;
    448  1.1  agc 	  reach_next[id].depth = reach[id].depth;
    449  1.1  agc 
    450  1.1  agc 	  /* Set the costs after this transition. */
    451  1.1  agc 	  memcpy(reach_next[id].costs, reach[id].costs,
    452  1.1  agc 		 sizeof(reach_next[id].costs[0][0])
    453  1.1  agc 		 * TRE_M_LAST * (depth + 1));
    454  1.1  agc 	  reach_next[id].costs[depth][TRE_M_COST] = cost;
    455  1.1  agc 	  reach_next[id].costs[depth][TRE_M_NUM_INS]++;
    456  1.1  agc 	  reach_next[id].costs[depth][TRE_M_NUM_ERR]++;
    457  1.1  agc 	  if (depth > 0)
    458  1.1  agc 	    {
    459  1.1  agc 	      reach_next[id].costs[0][TRE_M_COST] = cost0;
    460  1.1  agc 	      reach_next[id].costs[0][TRE_M_NUM_INS]++;
    461  1.1  agc 	      reach_next[id].costs[0][TRE_M_NUM_ERR]++;
    462  1.1  agc 	    }
    463  1.1  agc 
    464  1.1  agc 	}
    465  1.1  agc 
    466  1.1  agc 
    467  1.1  agc       /* Handle deletes.  This is done by traversing through the whole TNFA
    468  1.1  agc 	 pretending that all transitions are epsilon transitions, until
    469  1.1  agc 	 no more states can be reached with better costs. */
    470  1.1  agc       {
    471  1.1  agc 	/* XXX - dynamic ringbuffer size */
    472  1.1  agc 	tre_tnfa_approx_reach_t *ringbuffer[512];
    473  1.1  agc 	tre_tnfa_approx_reach_t **deque_start, **deque_end;
    474  1.1  agc 
    475  1.1  agc 	deque_start = deque_end = ringbuffer;
    476  1.1  agc 
    477  1.1  agc 	/* Add all states in `reach_next' to the deque. */
    478  1.1  agc 	for (id = 0; id < tnfa->num_states; id++)
    479  1.1  agc 	  {
    480  1.1  agc 	    if (reach_next[id].pos != pos)
    481  1.1  agc 	      continue;
    482  1.1  agc 	    *deque_end = &reach_next[id];
    483  1.1  agc 	    deque_end++;
    484  1.1  agc 	    assert(deque_end != deque_start);
    485  1.1  agc 	  }
    486  1.1  agc 
    487  1.1  agc 	/* Repeat until the deque is empty. */
    488  1.1  agc 	while (deque_end != deque_start)
    489  1.1  agc 	  {
    490  1.1  agc 	    tre_tnfa_approx_reach_t *reach_p;
    491  1.1  agc 	    int depth;
    492  1.1  agc 	    int cost, cost0;
    493  1.1  agc 	    tre_tnfa_transition_t *trans;
    494  1.1  agc 
    495  1.1  agc 	    /* Pop the first item off the deque. */
    496  1.1  agc 	    reach_p = *deque_start;
    497  1.1  agc 	    id = reach_p - reach_next;
    498  1.1  agc 	    depth = reach_p->depth;
    499  1.1  agc 
    500  1.1  agc 	    /* Compute cost at current depth. */
    501  1.1  agc 	    cost = reach_p->costs[depth][TRE_M_COST];
    502  1.1  agc 	    if (reach_p->params.cost_del != TRE_PARAM_UNSET)
    503  1.1  agc 	      cost += reach_p->params.cost_del;
    504  1.1  agc 
    505  1.1  agc 	    /* Check cost, number of deletes, and total number of errors
    506  1.1  agc 	       at current depth. */
    507  1.1  agc 	    if (cost > reach_p->params.max_cost
    508  1.1  agc 		|| (reach_p->costs[depth][TRE_M_NUM_DEL] + 1
    509  1.1  agc 		    > reach_p->params.max_del)
    510  1.1  agc 		|| (reach_p->costs[depth][TRE_M_NUM_ERR] + 1
    511  1.1  agc 		    > reach_p->params.max_err))
    512  1.1  agc 	      {
    513  1.1  agc 		/* Too many errors or cost too large. */
    514  1.1  agc 		DPRINT(("  delete: from %03d: cost too large\n", id));
    515  1.1  agc 		deque_start++;
    516  1.1  agc 		if (deque_start >= (ringbuffer + 512))
    517  1.1  agc 		  deque_start = ringbuffer;
    518  1.1  agc 		continue;
    519  1.1  agc 	      }
    520  1.1  agc 
    521  1.1  agc 	    /* Compute overall cost. */
    522  1.1  agc 	    cost0 = cost;
    523  1.1  agc 	    if (depth > 0)
    524  1.1  agc 	      {
    525  1.1  agc 		cost0 = reach_p->costs[0][TRE_M_COST];
    526  1.1  agc 		if (reach_p->params.cost_del != TRE_PARAM_UNSET)
    527  1.1  agc 		  cost0 += reach_p->params.cost_del;
    528  1.1  agc 		else
    529  1.1  agc 		  cost0 += default_params.cost_del;
    530  1.1  agc 	      }
    531  1.1  agc 
    532  1.1  agc 	    for (trans = reach_p->state; trans->state; trans++)
    533  1.1  agc 	      {
    534  1.1  agc 		int dest_id = trans->state_id;
    535  1.1  agc 		DPRINT(("  delete: from %03d to %03d, cost %d (%d): ",
    536  1.1  agc 			id, dest_id, cost0, reach_p->params.max_cost));
    537  1.1  agc 
    538  1.1  agc 		if (trans->assertions && CHECK_ASSERTIONS(trans->assertions))
    539  1.1  agc 		  {
    540  1.1  agc 		    DPRINT(("assertion failed\n"));
    541  1.1  agc 		    continue;
    542  1.1  agc 		  }
    543  1.1  agc 
    544  1.1  agc 		/* Compute tag values after this transition. */
    545  1.1  agc 		for (i = 0; i < num_tags; i++)
    546  1.1  agc 		  tmp_tags[i] = reach_p->tags[i];
    547  1.1  agc 		if (trans->tags)
    548  1.1  agc 		  for (i = 0; trans->tags[i] >= 0; i++)
    549  1.1  agc 		    if (trans->tags[i] < num_tags)
    550  1.1  agc 		      tmp_tags[trans->tags[i]] = pos;
    551  1.1  agc 
    552  1.1  agc 		/* If another path has also reached this state, choose the one
    553  1.1  agc 		   with the smallest cost or best tags if costs are equal. */
    554  1.1  agc 		if (reach_next[dest_id].pos == pos
    555  1.1  agc 		    && (cost0 > reach_next[dest_id].costs[0][TRE_M_COST]
    556  1.1  agc 			|| (cost0 == reach_next[dest_id].costs[0][TRE_M_COST]
    557  1.1  agc 			    && (!match_tags
    558  1.1  agc 				|| !tre_tag_order(num_tags,
    559  1.1  agc 						  tnfa->tag_directions,
    560  1.1  agc 						  tmp_tags,
    561  1.1  agc 						  reach_next[dest_id].tags)))))
    562  1.1  agc 		  {
    563  1.1  agc 		    DPRINT(("lose, cost0 %d, have %d\n",
    564  1.1  agc 			    cost0, reach_next[dest_id].costs[0][TRE_M_COST]));
    565  1.1  agc 		    continue;
    566  1.1  agc 		  }
    567  1.1  agc 		DPRINT(("win\n"));
    568  1.1  agc 
    569  1.1  agc 		/* Set state, position, tags, parameters, depth, and costs. */
    570  1.1  agc 		reach_next[dest_id].state = trans->state;
    571  1.1  agc 		reach_next[dest_id].pos = pos;
    572  1.1  agc 		for (i = 0; i < num_tags; i++)
    573  1.1  agc 		  reach_next[dest_id].tags[i] = tmp_tags[i];
    574  1.1  agc 
    575  1.1  agc 		reach_next[dest_id].params = reach_p->params;
    576  1.1  agc 		if (trans->params)
    577  1.1  agc 		  tre_set_params(&reach_next[dest_id], trans->params,
    578  1.1  agc 				 default_params);
    579  1.1  agc 
    580  1.1  agc 		reach_next[dest_id].depth = reach_p->depth;
    581  1.1  agc 		memcpy(&reach_next[dest_id].costs,
    582  1.1  agc 		       reach_p->costs,
    583  1.1  agc 		       sizeof(reach_p->costs[0][0])
    584  1.1  agc 		       * TRE_M_LAST * (depth + 1));
    585  1.1  agc 		reach_next[dest_id].costs[depth][TRE_M_COST] = cost;
    586  1.1  agc 		reach_next[dest_id].costs[depth][TRE_M_NUM_DEL]++;
    587  1.1  agc 		reach_next[dest_id].costs[depth][TRE_M_NUM_ERR]++;
    588  1.1  agc 		if (depth > 0)
    589  1.1  agc 		  {
    590  1.1  agc 		    reach_next[dest_id].costs[0][TRE_M_COST] = cost0;
    591  1.1  agc 		    reach_next[dest_id].costs[0][TRE_M_NUM_DEL]++;
    592  1.1  agc 		    reach_next[dest_id].costs[0][TRE_M_NUM_ERR]++;
    593  1.1  agc 		  }
    594  1.1  agc 
    595  1.1  agc 		if (trans->state == tnfa->final
    596  1.1  agc 		    && (match_eo < 0
    597  1.1  agc 			|| match_costs[TRE_M_COST] > cost0
    598  1.1  agc 			|| (match_costs[TRE_M_COST] == cost0
    599  1.1  agc 			    && (num_tags > 0
    600  1.1  agc 				&& tmp_tags[0] <= match_tags[0]))))
    601  1.1  agc 		  {
    602  1.1  agc 		    DPRINT(("	 setting new match at %d, cost %d\n",
    603  1.1  agc 			    pos, cost0));
    604  1.1  agc 		    match_eo = pos;
    605  1.1  agc 		    memcpy(match_costs, reach_next[dest_id].costs[0],
    606  1.1  agc 			   sizeof(match_costs[0]) * TRE_M_LAST);
    607  1.1  agc 		    for (i = 0; i < num_tags; i++)
    608  1.1  agc 		      match_tags[i] = tmp_tags[i];
    609  1.1  agc 		  }
    610  1.1  agc 
    611  1.1  agc 		/* Add to the end of the deque. */
    612  1.1  agc 		*deque_end = &reach_next[dest_id];
    613  1.1  agc 		deque_end++;
    614  1.1  agc 		if (deque_end >= (ringbuffer + 512))
    615  1.1  agc 		  deque_end = ringbuffer;
    616  1.1  agc 		assert(deque_end != deque_start);
    617  1.1  agc 	      }
    618  1.1  agc 	    deque_start++;
    619  1.1  agc 	    if (deque_start >= (ringbuffer + 512))
    620  1.1  agc 	      deque_start = ringbuffer;
    621  1.1  agc 	  }
    622  1.1  agc 
    623  1.1  agc       }
    624  1.1  agc 
    625  1.1  agc #ifdef TRE_DEBUG
    626  1.1  agc       tre_print_reach(tnfa, reach_next, pos, num_tags);
    627  1.1  agc #endif /* TRE_DEBUG */
    628  1.1  agc 
    629  1.1  agc       /* Check for end of string. */
    630  1.1  agc       if (len < 0)
    631  1.1  agc 	{
    632  1.1  agc 	  if (type == STR_USER)
    633  1.1  agc 	    {
    634  1.1  agc 	      if (str_user_end)
    635  1.1  agc 		break;
    636  1.1  agc 	    }
    637  1.1  agc 	  else if (next_c == L'\0')
    638  1.1  agc 	    break;
    639  1.1  agc 	}
    640  1.1  agc       else
    641  1.1  agc 	{
    642  1.1  agc 	  if (pos >= len)
    643  1.1  agc 	    break;
    644  1.1  agc 	}
    645  1.1  agc 
    646  1.1  agc       prev_pos = pos;
    647  1.1  agc       GET_NEXT_WCHAR();
    648  1.1  agc 
    649  1.1  agc       /* Swap `reach' and `reach_next'. */
    650  1.1  agc       {
    651  1.1  agc 	tre_tnfa_approx_reach_t *tmp;
    652  1.1  agc 	tmp = reach;
    653  1.1  agc 	reach = reach_next;
    654  1.1  agc 	reach_next = tmp;
    655  1.1  agc       }
    656  1.1  agc 
    657  1.1  agc       /* Handle exact matches and substitutions. */
    658  1.1  agc       for (id = 0; id < tnfa->num_states; id++)
    659  1.1  agc 	{
    660  1.1  agc 	  tre_tnfa_transition_t *trans;
    661  1.1  agc 
    662  1.1  agc 	  if (reach[id].pos < prev_pos)
    663  1.1  agc 	    continue;  /* Not reached. */
    664  1.1  agc 	  for (trans = reach[id].state; trans->state; trans++)
    665  1.1  agc 	    {
    666  1.1  agc 	      int dest_id;
    667  1.1  agc 	      int depth;
    668  1.1  agc 	      int cost, cost0, err;
    669  1.1  agc 
    670  1.1  agc 	      if (trans->assertions
    671  1.1  agc 		  && (CHECK_ASSERTIONS(trans->assertions)
    672  1.1  agc 		      || CHECK_CHAR_CLASSES(trans, tnfa, eflags)))
    673  1.1  agc 		{
    674  1.1  agc 		  DPRINT(("  exact,  from %d: assert failed\n", id));
    675  1.1  agc 		  continue;
    676  1.1  agc 		}
    677  1.1  agc 
    678  1.1  agc 	      depth = reach[id].depth;
    679  1.1  agc 	      dest_id = trans->state_id;
    680  1.1  agc 
    681  1.1  agc 	      cost = reach[id].costs[depth][TRE_M_COST];
    682  1.1  agc 	      cost0 = reach[id].costs[0][TRE_M_COST];
    683  1.1  agc 	      err = 0;
    684  1.1  agc 
    685  1.1  agc 	      if (trans->code_min > (tre_cint_t)prev_c
    686  1.1  agc 		  || trans->code_max < (tre_cint_t)prev_c)
    687  1.1  agc 		{
    688  1.1  agc 		  /* Handle substitutes.  The required character was not in
    689  1.1  agc 		     the string, so match it in place of whatever was supposed
    690  1.1  agc 		     to be there and increase costs accordingly. */
    691  1.1  agc 		  err = 1;
    692  1.1  agc 
    693  1.1  agc 		  /* Compute and check cost at current depth. */
    694  1.1  agc 		  cost = reach[id].costs[depth][TRE_M_COST];
    695  1.1  agc 		  if (reach[id].params.cost_subst != TRE_PARAM_UNSET)
    696  1.1  agc 		    cost += reach[id].params.cost_subst;
    697  1.1  agc 		  if (cost > reach[id].params.max_cost)
    698  1.1  agc 		    continue; /* Cost too large. */
    699  1.1  agc 
    700  1.1  agc 		  /* Check number of substitutes at current depth. */
    701  1.1  agc 		  if (reach[id].costs[depth][TRE_M_NUM_SUBST] + 1
    702  1.1  agc 		      > reach[id].params.max_subst)
    703  1.1  agc 		    continue; /* Too many substitutes. */
    704  1.1  agc 
    705  1.1  agc 		  /* Check total number of errors at current depth. */
    706  1.1  agc 		  if (reach[id].costs[depth][TRE_M_NUM_ERR] + 1
    707  1.1  agc 		      > reach[id].params.max_err)
    708  1.1  agc 		    continue; /* Too many errors. */
    709  1.1  agc 
    710  1.1  agc 		  /* Compute overall cost. */
    711  1.1  agc 		  cost0 = cost;
    712  1.1  agc 		  if (depth > 0)
    713  1.1  agc 		    {
    714  1.1  agc 		      cost0 = reach[id].costs[0][TRE_M_COST];
    715  1.1  agc 		      if (reach[id].params.cost_subst != TRE_PARAM_UNSET)
    716  1.1  agc 			cost0 += reach[id].params.cost_subst;
    717  1.1  agc 		      else
    718  1.1  agc 			cost0 += default_params.cost_subst;
    719  1.1  agc 		    }
    720  1.1  agc 		  DPRINT(("  subst,  from %03d to %03d, cost %d: ",
    721  1.1  agc 			  id, dest_id, cost0));
    722  1.1  agc 		}
    723  1.1  agc 	      else
    724  1.1  agc 		DPRINT(("  exact,  from %03d to %03d, cost %d: ",
    725  1.1  agc 			id, dest_id, cost0));
    726  1.1  agc 
    727  1.1  agc 	      /* Compute tag values after this transition. */
    728  1.1  agc 	      for (i = 0; i < num_tags; i++)
    729  1.1  agc 		tmp_tags[i] = reach[id].tags[i];
    730  1.1  agc 	      if (trans->tags)
    731  1.1  agc 		for (i = 0; trans->tags[i] >= 0; i++)
    732  1.1  agc 		  if (trans->tags[i] < num_tags)
    733  1.1  agc 		    tmp_tags[trans->tags[i]] = pos;
    734  1.1  agc 
    735  1.1  agc 	      /* If another path has also reached this state, choose the
    736  1.1  agc 		 one with the smallest cost or best tags if costs are equal. */
    737  1.1  agc 	      if (reach_next[dest_id].pos == pos
    738  1.1  agc 		  && (cost0 > reach_next[dest_id].costs[0][TRE_M_COST]
    739  1.1  agc 		      || (cost0 == reach_next[dest_id].costs[0][TRE_M_COST]
    740  1.1  agc 			  && !tre_tag_order(num_tags, tnfa->tag_directions,
    741  1.1  agc 					    tmp_tags,
    742  1.1  agc 					    reach_next[dest_id].tags))))
    743  1.1  agc 		{
    744  1.1  agc 		  DPRINT(("lose\n"));
    745  1.1  agc 		  continue;
    746  1.1  agc 		}
    747  1.1  agc 	      DPRINT(("win %d %d\n",
    748  1.1  agc 		      reach_next[dest_id].pos,
    749  1.1  agc 		      reach_next[dest_id].costs[0][TRE_M_COST]));
    750  1.1  agc 
    751  1.1  agc 	      /* Set state, position, tags, and depth. */
    752  1.1  agc 	      reach_next[dest_id].state = trans->state;
    753  1.1  agc 	      reach_next[dest_id].pos = pos;
    754  1.1  agc 	      for (i = 0; i < num_tags; i++)
    755  1.1  agc 		reach_next[dest_id].tags[i] = tmp_tags[i];
    756  1.1  agc 	      reach_next[dest_id].depth = reach[id].depth;
    757  1.1  agc 
    758  1.1  agc 	      /* Set parameters. */
    759  1.1  agc 	      reach_next[dest_id].params = reach[id].params;
    760  1.1  agc 	      if (trans->params)
    761  1.1  agc 		tre_set_params(&reach_next[dest_id], trans->params,
    762  1.1  agc 			       default_params);
    763  1.1  agc 
    764  1.1  agc 	      /* Set the costs after this transition. */
    765  1.1  agc 		memcpy(&reach_next[dest_id].costs,
    766  1.1  agc 		       reach[id].costs,
    767  1.1  agc 		       sizeof(reach[id].costs[0][0])
    768  1.1  agc 		       * TRE_M_LAST * (depth + 1));
    769  1.1  agc 	      reach_next[dest_id].costs[depth][TRE_M_COST] = cost;
    770  1.1  agc 	      reach_next[dest_id].costs[depth][TRE_M_NUM_SUBST] += err;
    771  1.1  agc 	      reach_next[dest_id].costs[depth][TRE_M_NUM_ERR] += err;
    772  1.1  agc 	      if (depth > 0)
    773  1.1  agc 		{
    774  1.1  agc 		  reach_next[dest_id].costs[0][TRE_M_COST] = cost0;
    775  1.1  agc 		  reach_next[dest_id].costs[0][TRE_M_NUM_SUBST] += err;
    776  1.1  agc 		  reach_next[dest_id].costs[0][TRE_M_NUM_ERR] += err;
    777  1.1  agc 		}
    778  1.1  agc 
    779  1.1  agc 	      if (trans->state == tnfa->final
    780  1.1  agc 		  && (match_eo < 0
    781  1.1  agc 		      || cost0 < match_costs[TRE_M_COST]
    782  1.1  agc 		      || (cost0 == match_costs[TRE_M_COST]
    783  1.1  agc 			  && num_tags > 0 && tmp_tags[0] <= match_tags[0])))
    784  1.1  agc 		{
    785  1.1  agc 		  DPRINT(("    setting new match at %d, cost %d\n",
    786  1.1  agc 			  pos, cost0));
    787  1.1  agc 		  match_eo = pos;
    788  1.1  agc 		  for (i = 0; i < TRE_M_LAST; i++)
    789  1.1  agc 		    match_costs[i] = reach_next[dest_id].costs[0][i];
    790  1.1  agc 		  for (i = 0; i < num_tags; i++)
    791  1.1  agc 		    match_tags[i] = tmp_tags[i];
    792  1.1  agc 		}
    793  1.1  agc 	    }
    794  1.1  agc 	}
    795  1.1  agc     }
    796  1.1  agc 
    797  1.1  agc   DPRINT(("match end offset = %d, match cost = %d\n", match_eo,
    798  1.1  agc 	  match_costs[TRE_M_COST]));
    799  1.1  agc 
    800  1.1  agc #ifndef TRE_USE_ALLOCA
    801  1.1  agc   if (buf)
    802  1.1  agc     xfree(buf);
    803  1.1  agc #endif /* !TRE_USE_ALLOCA */
    804  1.1  agc 
    805  1.1  agc   match->cost = match_costs[TRE_M_COST];
    806  1.1  agc   match->num_ins = match_costs[TRE_M_NUM_INS];
    807  1.1  agc   match->num_del = match_costs[TRE_M_NUM_DEL];
    808  1.1  agc   match->num_subst = match_costs[TRE_M_NUM_SUBST];
    809  1.1  agc   *match_end_ofs = match_eo;
    810  1.1  agc 
    811  1.1  agc   return match_eo >= 0 ? REG_OK : REG_NOMATCH;
    812  1.1  agc }
    813