Home | History | Annotate | Line # | Download | only in lib
tre-match-approx.c revision 1.7
      1  1.1       agc /*
      2  1.1       agc   tre-match-approx.c - TRE approximate regex matching engine
      3  1.1       agc 
      4  1.1       agc   This software is released under a BSD-style license.
      5  1.1       agc   See the file LICENSE for details and copyright.
      6  1.1       agc 
      7  1.1       agc */
      8  1.1       agc #ifdef HAVE_CONFIG_H
      9  1.1       agc #include <config.h>
     10  1.1       agc #endif /* HAVE_CONFIG_H */
     11  1.1       agc 
     12  1.2  christos #include "tre-alloca.h"
     13  1.1       agc 
     14  1.1       agc #define __USE_STRING_INLINES
     15  1.1       agc #undef __NO_INLINE__
     16  1.1       agc 
     17  1.1       agc #include <assert.h>
     18  1.1       agc #include <stdlib.h>
     19  1.1       agc #include <string.h>
     20  1.1       agc #include <limits.h>
     21  1.1       agc #ifdef HAVE_WCHAR_H
     22  1.1       agc #include <wchar.h>
     23  1.1       agc #endif /* HAVE_WCHAR_H */
     24  1.1       agc #ifdef HAVE_WCTYPE_H
     25  1.1       agc #include <wctype.h>
     26  1.1       agc #endif /* HAVE_WCTYPE_H */
     27  1.1       agc #ifndef TRE_WCHAR
     28  1.1       agc #include <ctype.h>
     29  1.1       agc #endif /* !TRE_WCHAR */
     30  1.1       agc #ifdef HAVE_MALLOC_H
     31  1.1       agc #include <malloc.h>
     32  1.1       agc #endif /* HAVE_MALLOC_H */
     33  1.6       rin #include <stdint.h>
     34  1.1       agc 
     35  1.1       agc #include "tre-internal.h"
     36  1.1       agc #include "tre-match-utils.h"
     37  1.1       agc #include "tre.h"
     38  1.1       agc #include "xmalloc.h"
     39  1.1       agc 
     40  1.1       agc #define TRE_M_COST	0
     41  1.1       agc #define TRE_M_NUM_INS	1
     42  1.1       agc #define TRE_M_NUM_DEL	2
     43  1.1       agc #define TRE_M_NUM_SUBST 3
     44  1.1       agc #define TRE_M_NUM_ERR	4
     45  1.1       agc #define TRE_M_LAST	5
     46  1.1       agc 
     47  1.1       agc #define TRE_M_MAX_DEPTH 3
     48  1.1       agc 
     49  1.1       agc typedef struct {
     50  1.1       agc   /* State in the TNFA transition table. */
     51  1.1       agc   tre_tnfa_transition_t *state;
     52  1.1       agc   /* Position in input string. */
     53  1.1       agc   int pos;
     54  1.1       agc   /* Tag values. */
     55  1.1       agc   int *tags;
     56  1.1       agc   /* Matching parameters. */
     57  1.1       agc   regaparams_t params;
     58  1.1       agc   /* Nesting depth of parameters.  This is used as an index in
     59  1.1       agc      the `costs' array. */
     60  1.1       agc   int depth;
     61  1.1       agc   /* Costs and counter values for different parameter nesting depths. */
     62  1.1       agc   int costs[TRE_M_MAX_DEPTH + 1][TRE_M_LAST];
     63  1.1       agc } tre_tnfa_approx_reach_t;
     64  1.1       agc 
     65  1.1       agc 
     66  1.1       agc #ifdef TRE_DEBUG
     67  1.1       agc /* Prints the `reach' array in a readable fashion with DPRINT. */
     68  1.1       agc static void
     69  1.1       agc tre_print_reach(const tre_tnfa_t *tnfa, tre_tnfa_approx_reach_t *reach,
     70  1.2  christos 		int pos, size_t num_tags)
     71  1.1       agc {
     72  1.1       agc   int id;
     73  1.1       agc 
     74  1.1       agc   /* Print each state on one line. */
     75  1.1       agc   DPRINT(("  reach:\n"));
     76  1.1       agc   for (id = 0; id < tnfa->num_states; id++)
     77  1.1       agc     {
     78  1.1       agc       int i, j;
     79  1.1       agc       if (reach[id].pos < pos)
     80  1.1       agc 	continue;  /* Not reached. */
     81  1.1       agc       DPRINT(("	 %03d, costs ", id));
     82  1.1       agc       for (i = 0; i <= reach[id].depth; i++)
     83  1.1       agc 	{
     84  1.1       agc 	  DPRINT(("["));
     85  1.1       agc 	  for (j = 0; j < TRE_M_LAST; j++)
     86  1.1       agc 	    {
     87  1.1       agc 	      DPRINT(("%2d", reach[id].costs[i][j]));
     88  1.1       agc 	      if (j + 1 < TRE_M_LAST)
     89  1.1       agc 		DPRINT((","));
     90  1.1       agc 	    }
     91  1.1       agc 	  DPRINT(("]"));
     92  1.1       agc 	  if (i + 1 <= reach[id].depth)
     93  1.1       agc 	    DPRINT((", "));
     94  1.1       agc 	}
     95  1.1       agc       DPRINT(("\n	tags "));
     96  1.1       agc       for (i = 0; i < num_tags; i++)
     97  1.1       agc 	{
     98  1.1       agc 	  DPRINT(("%02d", reach[id].tags[i]));
     99  1.1       agc 	  if (i + 1 < num_tags)
    100  1.1       agc 	    DPRINT((","));
    101  1.1       agc 	}
    102  1.1       agc       DPRINT(("\n"));
    103  1.1       agc     }
    104  1.1       agc   DPRINT(("\n"));
    105  1.1       agc }
    106  1.1       agc #endif /* TRE_DEBUG */
    107  1.1       agc 
    108  1.1       agc 
    109  1.1       agc /* Sets the matching parameters in `reach' to the ones defined in the `pa'
    110  1.1       agc    array.  If `pa' specifies default values, they are taken from
    111  1.1       agc    `default_params'. */
    112  1.1       agc inline static void
    113  1.1       agc tre_set_params(tre_tnfa_approx_reach_t *reach,
    114  1.1       agc 	       int *pa, regaparams_t default_params)
    115  1.1       agc {
    116  1.1       agc   int value;
    117  1.1       agc 
    118  1.1       agc   /* If depth is increased reset costs and counters to zero for the
    119  1.1       agc      new levels. */
    120  1.1       agc   value = pa[TRE_PARAM_DEPTH];
    121  1.1       agc   assert(value <= TRE_M_MAX_DEPTH);
    122  1.1       agc   if (value > reach->depth)
    123  1.1       agc     {
    124  1.1       agc       int i, j;
    125  1.1       agc       for (i = reach->depth + 1; i <= value; i++)
    126  1.1       agc 	for (j = 0; j < TRE_M_LAST; j++)
    127  1.1       agc 	  reach->costs[i][j] = 0;
    128  1.1       agc     }
    129  1.1       agc   reach->depth = value;
    130  1.1       agc 
    131  1.1       agc   /* Set insert cost. */
    132  1.1       agc   value = pa[TRE_PARAM_COST_INS];
    133  1.1       agc   if (value == TRE_PARAM_DEFAULT)
    134  1.1       agc     reach->params.cost_ins = default_params.cost_ins;
    135  1.1       agc   else if (value != TRE_PARAM_UNSET)
    136  1.1       agc     reach->params.cost_ins = value;
    137  1.1       agc 
    138  1.1       agc   /* Set delete cost. */
    139  1.1       agc   value = pa[TRE_PARAM_COST_DEL];
    140  1.1       agc   if (value == TRE_PARAM_DEFAULT)
    141  1.1       agc     reach->params.cost_del = default_params.cost_del;
    142  1.1       agc   else if (value != TRE_PARAM_UNSET)
    143  1.1       agc     reach->params.cost_del = value;
    144  1.1       agc 
    145  1.1       agc   /* Set substitute cost. */
    146  1.1       agc   value = pa[TRE_PARAM_COST_SUBST];
    147  1.1       agc   if (value == TRE_PARAM_DEFAULT)
    148  1.1       agc     reach->params.cost_subst = default_params.cost_subst;
    149  1.1       agc   else
    150  1.1       agc     reach->params.cost_subst = value;
    151  1.1       agc 
    152  1.1       agc   /* Set maximum cost. */
    153  1.1       agc   value = pa[TRE_PARAM_COST_MAX];
    154  1.1       agc   if (value == TRE_PARAM_DEFAULT)
    155  1.1       agc     reach->params.max_cost = default_params.max_cost;
    156  1.1       agc   else if (value != TRE_PARAM_UNSET)
    157  1.1       agc     reach->params.max_cost = value;
    158  1.1       agc 
    159  1.1       agc   /* Set maximum inserts. */
    160  1.1       agc   value = pa[TRE_PARAM_MAX_INS];
    161  1.1       agc   if (value == TRE_PARAM_DEFAULT)
    162  1.1       agc     reach->params.max_ins = default_params.max_ins;
    163  1.1       agc   else if (value != TRE_PARAM_UNSET)
    164  1.1       agc     reach->params.max_ins = value;
    165  1.1       agc 
    166  1.1       agc   /* Set maximum deletes. */
    167  1.1       agc   value = pa[TRE_PARAM_MAX_DEL];
    168  1.1       agc   if (value == TRE_PARAM_DEFAULT)
    169  1.1       agc     reach->params.max_del = default_params.max_del;
    170  1.1       agc   else if (value != TRE_PARAM_UNSET)
    171  1.1       agc     reach->params.max_del = value;
    172  1.1       agc 
    173  1.1       agc   /* Set maximum substitutes. */
    174  1.1       agc   value = pa[TRE_PARAM_MAX_SUBST];
    175  1.1       agc   if (value == TRE_PARAM_DEFAULT)
    176  1.1       agc     reach->params.max_subst = default_params.max_subst;
    177  1.1       agc   else if (value != TRE_PARAM_UNSET)
    178  1.1       agc     reach->params.max_subst = value;
    179  1.1       agc 
    180  1.1       agc   /* Set maximum number of errors. */
    181  1.1       agc   value = pa[TRE_PARAM_MAX_ERR];
    182  1.1       agc   if (value == TRE_PARAM_DEFAULT)
    183  1.1       agc     reach->params.max_err = default_params.max_err;
    184  1.1       agc   else if (value != TRE_PARAM_UNSET)
    185  1.1       agc     reach->params.max_err = value;
    186  1.1       agc }
    187  1.1       agc 
    188  1.1       agc reg_errcode_t
    189  1.1       agc tre_tnfa_run_approx(const tre_tnfa_t *tnfa, const void *string, int len,
    190  1.1       agc 		    tre_str_type_t type, int *match_tags,
    191  1.1       agc 		    regamatch_t *match, regaparams_t default_params,
    192  1.1       agc 		    int eflags, int *match_end_ofs)
    193  1.1       agc {
    194  1.1       agc   /* State variables required by GET_NEXT_WCHAR. */
    195  1.1       agc   tre_char_t prev_c = 0, next_c = 0;
    196  1.1       agc   const char *str_byte = string;
    197  1.1       agc   int pos = -1;
    198  1.1       agc   unsigned int pos_add_next = 1;
    199  1.1       agc #ifdef TRE_WCHAR
    200  1.1       agc   const wchar_t *str_wide = string;
    201  1.1       agc #ifdef TRE_MBSTATE
    202  1.1       agc   mbstate_t mbstate;
    203  1.1       agc #endif /* !TRE_WCHAR */
    204  1.1       agc #endif /* TRE_WCHAR */
    205  1.1       agc   int reg_notbol = eflags & REG_NOTBOL;
    206  1.1       agc   int reg_noteol = eflags & REG_NOTEOL;
    207  1.1       agc   int reg_newline = tnfa->cflags & REG_NEWLINE;
    208  1.2  christos   size_t str_user_end = 0;
    209  1.1       agc 
    210  1.1       agc   int prev_pos;
    211  1.1       agc 
    212  1.1       agc   /* Number of tags. */
    213  1.2  christos   size_t num_tags;
    214  1.1       agc   /* The reach tables. */
    215  1.1       agc   tre_tnfa_approx_reach_t *reach, *reach_next;
    216  1.1       agc   /* Tag array for temporary use. */
    217  1.1       agc   int *tmp_tags;
    218  1.1       agc 
    219  1.1       agc   /* End offset of best match so far, or -1 if no match found yet. */
    220  1.1       agc   int match_eo = -1;
    221  1.1       agc   /* Costs of the match. */
    222  1.1       agc   int match_costs[TRE_M_LAST];
    223  1.1       agc 
    224  1.1       agc   /* Space for temporary data required for matching. */
    225  1.1       agc   unsigned char *buf;
    226  1.1       agc 
    227  1.2  christos   size_t i, id;
    228  1.1       agc 
    229  1.4       rin   reg_errcode_t ret;
    230  1.4       rin 
    231  1.1       agc   if (!match_tags)
    232  1.1       agc     num_tags = 0;
    233  1.1       agc   else
    234  1.1       agc     num_tags = tnfa->num_tags;
    235  1.1       agc 
    236  1.1       agc #ifdef TRE_MBSTATE
    237  1.1       agc   memset(&mbstate, '\0', sizeof(mbstate));
    238  1.1       agc #endif /* TRE_MBSTATE */
    239  1.1       agc 
    240  1.1       agc   DPRINT(("tre_tnfa_run_approx, input type %d, len %d, eflags %d, "
    241  1.1       agc 	  "match_tags %p\n",
    242  1.1       agc 	  type, len, eflags,
    243  1.1       agc 	  match_tags));
    244  1.1       agc   DPRINT(("max cost %d, ins %d, del %d, subst %d\n",
    245  1.1       agc 	  default_params.max_cost,
    246  1.1       agc 	  default_params.cost_ins,
    247  1.1       agc 	  default_params.cost_del,
    248  1.1       agc 	  default_params.cost_subst));
    249  1.1       agc 
    250  1.1       agc   /* Allocate memory for temporary data required for matching.	This needs to
    251  1.1       agc      be done for every matching operation to be thread safe.  This allocates
    252  1.1       agc      everything in a single large block from the stack frame using alloca()
    253  1.1       agc      or with malloc() if alloca is unavailable. */
    254  1.1       agc   {
    255  1.1       agc     unsigned char *buf_cursor;
    256  1.5       rin 
    257  1.5       rin     /* Ensure that tag_bytes*num_states cannot overflow, and that it don't
    258  1.5       rin      * contribute more than 1/8 of SIZE_MAX to total_bytes. */
    259  1.5       rin     if (num_tags > SIZE_MAX/(8 * sizeof(*tmp_tags) * tnfa->num_states))
    260  1.5       rin       return REG_ESPACE;
    261  1.5       rin 
    262  1.5       rin     /* Likewise check reach_bytes. */
    263  1.5       rin     if (tnfa->num_states > SIZE_MAX/(8 * sizeof(*reach_next)))
    264  1.5       rin       return REG_ESPACE;
    265  1.5       rin 
    266  1.1       agc     /* Space needed for one array of tags. */
    267  1.2  christos     size_t tag_bytes = sizeof(*tmp_tags) * num_tags;
    268  1.1       agc     /* Space needed for one reach table. */
    269  1.2  christos     size_t reach_bytes = sizeof(*reach_next) * tnfa->num_states;
    270  1.1       agc     /* Total space needed. */
    271  1.2  christos     size_t total_bytes = reach_bytes * 2 + (tnfa->num_states * 2 + 1 ) * tag_bytes;
    272  1.1       agc     /* Add some extra to make sure we can align the pointers.  The multiplier
    273  1.1       agc        used here must be equal to the number of ALIGN calls below. */
    274  1.1       agc     total_bytes += (sizeof(long) - 1) * 3;
    275  1.1       agc 
    276  1.1       agc     /* Allocate the memory. */
    277  1.1       agc #ifdef TRE_USE_ALLOCA
    278  1.1       agc     buf = alloca(total_bytes);
    279  1.1       agc #else /* !TRE_USE_ALLOCA */
    280  1.1       agc     buf = xmalloc((unsigned)total_bytes);
    281  1.1       agc #endif /* !TRE_USE_ALLOCA */
    282  1.1       agc     if (!buf)
    283  1.1       agc       return REG_ESPACE;
    284  1.1       agc     memset(buf, 0, (size_t)total_bytes);
    285  1.1       agc 
    286  1.1       agc     /* Allocate `tmp_tags' from `buf'. */
    287  1.1       agc     tmp_tags = (void *)buf;
    288  1.1       agc     buf_cursor = buf + tag_bytes;
    289  1.1       agc     buf_cursor += ALIGN(buf_cursor, long);
    290  1.1       agc 
    291  1.1       agc     /* Allocate `reach' from `buf'. */
    292  1.1       agc     reach = (void *)buf_cursor;
    293  1.1       agc     buf_cursor += reach_bytes;
    294  1.1       agc     buf_cursor += ALIGN(buf_cursor, long);
    295  1.1       agc 
    296  1.1       agc     /* Allocate `reach_next' from `buf'. */
    297  1.1       agc     reach_next = (void *)buf_cursor;
    298  1.1       agc     buf_cursor += reach_bytes;
    299  1.1       agc     buf_cursor += ALIGN(buf_cursor, long);
    300  1.1       agc 
    301  1.1       agc     /* Allocate tag arrays for `reach' and `reach_next' from `buf'. */
    302  1.1       agc     for (i = 0; i < tnfa->num_states; i++)
    303  1.1       agc       {
    304  1.1       agc 	reach[i].tags = (void *)buf_cursor;
    305  1.1       agc 	buf_cursor += tag_bytes;
    306  1.1       agc 	reach_next[i].tags = (void *)buf_cursor;
    307  1.1       agc 	buf_cursor += tag_bytes;
    308  1.1       agc       }
    309  1.1       agc     assert(buf_cursor <= buf + total_bytes);
    310  1.1       agc   }
    311  1.1       agc 
    312  1.1       agc   for (i = 0; i < TRE_M_LAST; i++)
    313  1.1       agc     match_costs[i] = INT_MAX;
    314  1.1       agc 
    315  1.1       agc   /* Mark the reach arrays empty. */
    316  1.1       agc   for (i = 0; i < tnfa->num_states; i++)
    317  1.1       agc     reach[i].pos = reach_next[i].pos = -2;
    318  1.1       agc 
    319  1.1       agc   prev_pos = pos;
    320  1.1       agc   GET_NEXT_WCHAR();
    321  1.1       agc   pos = 0;
    322  1.1       agc 
    323  1.3       rin   while (/*CONSTCOND*/(void)1,1)
    324  1.1       agc     {
    325  1.1       agc       DPRINT(("%03d:%2lc/%05d\n", pos, (tre_cint_t)next_c, (int)next_c));
    326  1.1       agc 
    327  1.1       agc       /* Add initial states to `reach_next' if an exact match has not yet
    328  1.1       agc 	 been found. */
    329  1.1       agc       if (match_costs[TRE_M_COST] > 0)
    330  1.1       agc 	{
    331  1.1       agc 	  tre_tnfa_transition_t *trans;
    332  1.1       agc 	  DPRINT(("  init"));
    333  1.1       agc 	  for (trans = tnfa->initial; trans->state; trans++)
    334  1.1       agc 	    {
    335  1.1       agc 	      int stateid = trans->state_id;
    336  1.1       agc 
    337  1.1       agc 	      /* If this state is not currently in `reach_next', add it
    338  1.1       agc 		 there. */
    339  1.1       agc 	      if (reach_next[stateid].pos < pos)
    340  1.1       agc 		{
    341  1.1       agc 		  if (trans->assertions && CHECK_ASSERTIONS(trans->assertions))
    342  1.1       agc 		    {
    343  1.1       agc 		      /* Assertions failed, don't add this state. */
    344  1.1       agc 		      DPRINT((" !%d (assert)", stateid));
    345  1.1       agc 		      continue;
    346  1.1       agc 		    }
    347  1.1       agc 		  DPRINT((" %d", stateid));
    348  1.1       agc 		  reach_next[stateid].state = trans->state;
    349  1.1       agc 		  reach_next[stateid].pos = pos;
    350  1.1       agc 
    351  1.1       agc 		  /* Compute tag values after this transition. */
    352  1.1       agc 		  for (i = 0; i < num_tags; i++)
    353  1.1       agc 		    reach_next[stateid].tags[i] = -1;
    354  1.1       agc 
    355  1.1       agc 		  if (trans->tags)
    356  1.1       agc 		    for (i = 0; trans->tags[i] >= 0; i++)
    357  1.2  christos 		      if ((size_t)trans->tags[i] < num_tags)
    358  1.1       agc 			reach_next[stateid].tags[trans->tags[i]] = pos;
    359  1.1       agc 
    360  1.1       agc 		  /* Set the parameters, depth, and costs. */
    361  1.1       agc 		  reach_next[stateid].params = default_params;
    362  1.1       agc 		  reach_next[stateid].depth = 0;
    363  1.1       agc 		  for (i = 0; i < TRE_M_LAST; i++)
    364  1.1       agc 		    reach_next[stateid].costs[0][i] = 0;
    365  1.1       agc 		  if (trans->params)
    366  1.1       agc 		    tre_set_params(&reach_next[stateid], trans->params,
    367  1.1       agc 				   default_params);
    368  1.1       agc 
    369  1.1       agc 		  /* If this is the final state, mark the exact match. */
    370  1.1       agc 		  if (trans->state == tnfa->final)
    371  1.1       agc 		    {
    372  1.1       agc 		      match_eo = pos;
    373  1.1       agc 		      for (i = 0; i < num_tags; i++)
    374  1.1       agc 			match_tags[i] = reach_next[stateid].tags[i];
    375  1.1       agc 		      for (i = 0; i < TRE_M_LAST; i++)
    376  1.1       agc 			match_costs[i] = 0;
    377  1.1       agc 		    }
    378  1.1       agc 		}
    379  1.1       agc 	    }
    380  1.1       agc 	    DPRINT(("\n"));
    381  1.1       agc 	}
    382  1.1       agc 
    383  1.1       agc 
    384  1.1       agc       /* Handle inserts.  This is done by pretending there's an epsilon
    385  1.1       agc 	 transition from each state in `reach' back to the same state.
    386  1.1       agc 	 We don't need to worry about the final state here; this will never
    387  1.1       agc 	 give a better match than what we already have. */
    388  1.1       agc       for (id = 0; id < tnfa->num_states; id++)
    389  1.1       agc 	{
    390  1.1       agc 	  int depth;
    391  1.1       agc 	  int cost, cost0;
    392  1.1       agc 
    393  1.1       agc 	  if (reach[id].pos != prev_pos)
    394  1.1       agc 	    {
    395  1.1       agc 	      DPRINT(("	 insert: %d not reached\n", id));
    396  1.1       agc 	      continue;	 /* Not reached. */
    397  1.1       agc 	    }
    398  1.1       agc 
    399  1.1       agc 	  depth = reach[id].depth;
    400  1.1       agc 
    401  1.1       agc 	  /* Compute and check cost at current depth. */
    402  1.1       agc 	  cost = reach[id].costs[depth][TRE_M_COST];
    403  1.1       agc 	  if (reach[id].params.cost_ins != TRE_PARAM_UNSET)
    404  1.1       agc 	    cost += reach[id].params.cost_ins;
    405  1.1       agc 	  if (cost > reach[id].params.max_cost)
    406  1.1       agc 	    continue;  /* Cost too large. */
    407  1.1       agc 
    408  1.1       agc 	  /* Check number of inserts at current depth. */
    409  1.1       agc 	  if (reach[id].costs[depth][TRE_M_NUM_INS] + 1
    410  1.1       agc 	      > reach[id].params.max_ins)
    411  1.1       agc 	    continue;  /* Too many inserts. */
    412  1.1       agc 
    413  1.1       agc 	  /* Check total number of errors at current depth. */
    414  1.1       agc 	  if (reach[id].costs[depth][TRE_M_NUM_ERR] + 1
    415  1.1       agc 	      > reach[id].params.max_err)
    416  1.1       agc 	    continue;  /* Too many errors. */
    417  1.1       agc 
    418  1.1       agc 	  /* Compute overall cost. */
    419  1.1       agc 	  cost0 = cost;
    420  1.1       agc 	  if (depth > 0)
    421  1.1       agc 	    {
    422  1.1       agc 	      cost0 = reach[id].costs[0][TRE_M_COST];
    423  1.1       agc 	      if (reach[id].params.cost_ins != TRE_PARAM_UNSET)
    424  1.1       agc 		cost0 += reach[id].params.cost_ins;
    425  1.1       agc 	      else
    426  1.1       agc 		cost0 += default_params.cost_ins;
    427  1.1       agc 	    }
    428  1.1       agc 
    429  1.1       agc 	  DPRINT(("  insert: from %d to %d, cost %d: ", id, id,
    430  1.1       agc 		  reach[id].costs[depth][TRE_M_COST]));
    431  1.1       agc 	  if (reach_next[id].pos == pos
    432  1.1       agc 	      && (cost0 >= reach_next[id].costs[0][TRE_M_COST]))
    433  1.1       agc 	    {
    434  1.1       agc 	      DPRINT(("lose\n"));
    435  1.1       agc 	      continue;
    436  1.1       agc 	    }
    437  1.1       agc 	  DPRINT(("win\n"));
    438  1.1       agc 
    439  1.1       agc 	  /* Copy state, position, tags, parameters, and depth. */
    440  1.1       agc 	  reach_next[id].state = reach[id].state;
    441  1.1       agc 	  reach_next[id].pos = pos;
    442  1.1       agc 	  for (i = 0; i < num_tags; i++)
    443  1.1       agc 	    reach_next[id].tags[i] = reach[id].tags[i];
    444  1.1       agc 	  reach_next[id].params = reach[id].params;
    445  1.1       agc 	  reach_next[id].depth = reach[id].depth;
    446  1.1       agc 
    447  1.1       agc 	  /* Set the costs after this transition. */
    448  1.1       agc 	  memcpy(reach_next[id].costs, reach[id].costs,
    449  1.1       agc 		 sizeof(reach_next[id].costs[0][0])
    450  1.1       agc 		 * TRE_M_LAST * (depth + 1));
    451  1.1       agc 	  reach_next[id].costs[depth][TRE_M_COST] = cost;
    452  1.1       agc 	  reach_next[id].costs[depth][TRE_M_NUM_INS]++;
    453  1.1       agc 	  reach_next[id].costs[depth][TRE_M_NUM_ERR]++;
    454  1.1       agc 	  if (depth > 0)
    455  1.1       agc 	    {
    456  1.1       agc 	      reach_next[id].costs[0][TRE_M_COST] = cost0;
    457  1.1       agc 	      reach_next[id].costs[0][TRE_M_NUM_INS]++;
    458  1.1       agc 	      reach_next[id].costs[0][TRE_M_NUM_ERR]++;
    459  1.1       agc 	    }
    460  1.1       agc 
    461  1.1       agc 	}
    462  1.1       agc 
    463  1.1       agc 
    464  1.1       agc       /* Handle deletes.  This is done by traversing through the whole TNFA
    465  1.1       agc 	 pretending that all transitions are epsilon transitions, until
    466  1.1       agc 	 no more states can be reached with better costs. */
    467  1.1       agc       {
    468  1.1       agc 	/* XXX - dynamic ringbuffer size */
    469  1.1       agc 	tre_tnfa_approx_reach_t *ringbuffer[512];
    470  1.1       agc 	tre_tnfa_approx_reach_t **deque_start, **deque_end;
    471  1.1       agc 
    472  1.1       agc 	deque_start = deque_end = ringbuffer;
    473  1.1       agc 
    474  1.1       agc 	/* Add all states in `reach_next' to the deque. */
    475  1.1       agc 	for (id = 0; id < tnfa->num_states; id++)
    476  1.1       agc 	  {
    477  1.1       agc 	    if (reach_next[id].pos != pos)
    478  1.1       agc 	      continue;
    479  1.1       agc 	    *deque_end = &reach_next[id];
    480  1.1       agc 	    deque_end++;
    481  1.1       agc 	    assert(deque_end != deque_start);
    482  1.1       agc 	  }
    483  1.1       agc 
    484  1.1       agc 	/* Repeat until the deque is empty. */
    485  1.1       agc 	while (deque_end != deque_start)
    486  1.1       agc 	  {
    487  1.1       agc 	    tre_tnfa_approx_reach_t *reach_p;
    488  1.1       agc 	    int depth;
    489  1.1       agc 	    int cost, cost0;
    490  1.1       agc 	    tre_tnfa_transition_t *trans;
    491  1.1       agc 
    492  1.1       agc 	    /* Pop the first item off the deque. */
    493  1.1       agc 	    reach_p = *deque_start;
    494  1.1       agc 	    id = reach_p - reach_next;
    495  1.1       agc 	    depth = reach_p->depth;
    496  1.1       agc 
    497  1.1       agc 	    /* Compute cost at current depth. */
    498  1.1       agc 	    cost = reach_p->costs[depth][TRE_M_COST];
    499  1.1       agc 	    if (reach_p->params.cost_del != TRE_PARAM_UNSET)
    500  1.1       agc 	      cost += reach_p->params.cost_del;
    501  1.1       agc 
    502  1.1       agc 	    /* Check cost, number of deletes, and total number of errors
    503  1.1       agc 	       at current depth. */
    504  1.1       agc 	    if (cost > reach_p->params.max_cost
    505  1.1       agc 		|| (reach_p->costs[depth][TRE_M_NUM_DEL] + 1
    506  1.1       agc 		    > reach_p->params.max_del)
    507  1.1       agc 		|| (reach_p->costs[depth][TRE_M_NUM_ERR] + 1
    508  1.1       agc 		    > reach_p->params.max_err))
    509  1.1       agc 	      {
    510  1.1       agc 		/* Too many errors or cost too large. */
    511  1.1       agc 		DPRINT(("  delete: from %03d: cost too large\n", id));
    512  1.1       agc 		deque_start++;
    513  1.1       agc 		if (deque_start >= (ringbuffer + 512))
    514  1.1       agc 		  deque_start = ringbuffer;
    515  1.1       agc 		continue;
    516  1.1       agc 	      }
    517  1.1       agc 
    518  1.1       agc 	    /* Compute overall cost. */
    519  1.1       agc 	    cost0 = cost;
    520  1.1       agc 	    if (depth > 0)
    521  1.1       agc 	      {
    522  1.1       agc 		cost0 = reach_p->costs[0][TRE_M_COST];
    523  1.1       agc 		if (reach_p->params.cost_del != TRE_PARAM_UNSET)
    524  1.1       agc 		  cost0 += reach_p->params.cost_del;
    525  1.1       agc 		else
    526  1.1       agc 		  cost0 += default_params.cost_del;
    527  1.1       agc 	      }
    528  1.1       agc 
    529  1.1       agc 	    for (trans = reach_p->state; trans->state; trans++)
    530  1.1       agc 	      {
    531  1.1       agc 		int dest_id = trans->state_id;
    532  1.1       agc 		DPRINT(("  delete: from %03d to %03d, cost %d (%d): ",
    533  1.1       agc 			id, dest_id, cost0, reach_p->params.max_cost));
    534  1.1       agc 
    535  1.1       agc 		if (trans->assertions && CHECK_ASSERTIONS(trans->assertions))
    536  1.1       agc 		  {
    537  1.1       agc 		    DPRINT(("assertion failed\n"));
    538  1.1       agc 		    continue;
    539  1.1       agc 		  }
    540  1.1       agc 
    541  1.1       agc 		/* Compute tag values after this transition. */
    542  1.1       agc 		for (i = 0; i < num_tags; i++)
    543  1.1       agc 		  tmp_tags[i] = reach_p->tags[i];
    544  1.1       agc 		if (trans->tags)
    545  1.1       agc 		  for (i = 0; trans->tags[i] >= 0; i++)
    546  1.2  christos 		    if ((size_t)trans->tags[i] < num_tags)
    547  1.1       agc 		      tmp_tags[trans->tags[i]] = pos;
    548  1.1       agc 
    549  1.1       agc 		/* If another path has also reached this state, choose the one
    550  1.1       agc 		   with the smallest cost or best tags if costs are equal. */
    551  1.1       agc 		if (reach_next[dest_id].pos == pos
    552  1.1       agc 		    && (cost0 > reach_next[dest_id].costs[0][TRE_M_COST]
    553  1.1       agc 			|| (cost0 == reach_next[dest_id].costs[0][TRE_M_COST]
    554  1.1       agc 			    && (!match_tags
    555  1.1       agc 				|| !tre_tag_order(num_tags,
    556  1.1       agc 						  tnfa->tag_directions,
    557  1.1       agc 						  tmp_tags,
    558  1.1       agc 						  reach_next[dest_id].tags)))))
    559  1.1       agc 		  {
    560  1.1       agc 		    DPRINT(("lose, cost0 %d, have %d\n",
    561  1.1       agc 			    cost0, reach_next[dest_id].costs[0][TRE_M_COST]));
    562  1.1       agc 		    continue;
    563  1.1       agc 		  }
    564  1.1       agc 		DPRINT(("win\n"));
    565  1.1       agc 
    566  1.1       agc 		/* Set state, position, tags, parameters, depth, and costs. */
    567  1.1       agc 		reach_next[dest_id].state = trans->state;
    568  1.1       agc 		reach_next[dest_id].pos = pos;
    569  1.1       agc 		for (i = 0; i < num_tags; i++)
    570  1.1       agc 		  reach_next[dest_id].tags[i] = tmp_tags[i];
    571  1.1       agc 
    572  1.1       agc 		reach_next[dest_id].params = reach_p->params;
    573  1.1       agc 		if (trans->params)
    574  1.1       agc 		  tre_set_params(&reach_next[dest_id], trans->params,
    575  1.1       agc 				 default_params);
    576  1.1       agc 
    577  1.1       agc 		reach_next[dest_id].depth = reach_p->depth;
    578  1.1       agc 		memcpy(&reach_next[dest_id].costs,
    579  1.1       agc 		       reach_p->costs,
    580  1.1       agc 		       sizeof(reach_p->costs[0][0])
    581  1.1       agc 		       * TRE_M_LAST * (depth + 1));
    582  1.1       agc 		reach_next[dest_id].costs[depth][TRE_M_COST] = cost;
    583  1.1       agc 		reach_next[dest_id].costs[depth][TRE_M_NUM_DEL]++;
    584  1.1       agc 		reach_next[dest_id].costs[depth][TRE_M_NUM_ERR]++;
    585  1.1       agc 		if (depth > 0)
    586  1.1       agc 		  {
    587  1.1       agc 		    reach_next[dest_id].costs[0][TRE_M_COST] = cost0;
    588  1.1       agc 		    reach_next[dest_id].costs[0][TRE_M_NUM_DEL]++;
    589  1.1       agc 		    reach_next[dest_id].costs[0][TRE_M_NUM_ERR]++;
    590  1.1       agc 		  }
    591  1.1       agc 
    592  1.1       agc 		if (trans->state == tnfa->final
    593  1.1       agc 		    && (match_eo < 0
    594  1.1       agc 			|| match_costs[TRE_M_COST] > cost0
    595  1.1       agc 			|| (match_costs[TRE_M_COST] == cost0
    596  1.1       agc 			    && (num_tags > 0
    597  1.1       agc 				&& tmp_tags[0] <= match_tags[0]))))
    598  1.1       agc 		  {
    599  1.1       agc 		    DPRINT(("	 setting new match at %d, cost %d\n",
    600  1.1       agc 			    pos, cost0));
    601  1.1       agc 		    match_eo = pos;
    602  1.1       agc 		    memcpy(match_costs, reach_next[dest_id].costs[0],
    603  1.1       agc 			   sizeof(match_costs[0]) * TRE_M_LAST);
    604  1.1       agc 		    for (i = 0; i < num_tags; i++)
    605  1.1       agc 		      match_tags[i] = tmp_tags[i];
    606  1.1       agc 		  }
    607  1.1       agc 
    608  1.1       agc 		/* Add to the end of the deque. */
    609  1.1       agc 		*deque_end = &reach_next[dest_id];
    610  1.1       agc 		deque_end++;
    611  1.1       agc 		if (deque_end >= (ringbuffer + 512))
    612  1.1       agc 		  deque_end = ringbuffer;
    613  1.1       agc 		assert(deque_end != deque_start);
    614  1.1       agc 	      }
    615  1.1       agc 	    deque_start++;
    616  1.1       agc 	    if (deque_start >= (ringbuffer + 512))
    617  1.1       agc 	      deque_start = ringbuffer;
    618  1.1       agc 	  }
    619  1.1       agc 
    620  1.1       agc       }
    621  1.1       agc 
    622  1.1       agc #ifdef TRE_DEBUG
    623  1.1       agc       tre_print_reach(tnfa, reach_next, pos, num_tags);
    624  1.1       agc #endif /* TRE_DEBUG */
    625  1.1       agc 
    626  1.1       agc       /* Check for end of string. */
    627  1.1       agc       if (len < 0)
    628  1.1       agc 	{
    629  1.1       agc 	  if (type == STR_USER)
    630  1.1       agc 	    {
    631  1.1       agc 	      if (str_user_end)
    632  1.1       agc 		break;
    633  1.1       agc 	    }
    634  1.1       agc 	  else if (next_c == L'\0')
    635  1.1       agc 	    break;
    636  1.1       agc 	}
    637  1.1       agc       else
    638  1.1       agc 	{
    639  1.1       agc 	  if (pos >= len)
    640  1.1       agc 	    break;
    641  1.1       agc 	}
    642  1.1       agc 
    643  1.1       agc       prev_pos = pos;
    644  1.1       agc       GET_NEXT_WCHAR();
    645  1.1       agc 
    646  1.1       agc       /* Swap `reach' and `reach_next'. */
    647  1.1       agc       {
    648  1.1       agc 	tre_tnfa_approx_reach_t *tmp;
    649  1.1       agc 	tmp = reach;
    650  1.1       agc 	reach = reach_next;
    651  1.1       agc 	reach_next = tmp;
    652  1.1       agc       }
    653  1.1       agc 
    654  1.1       agc       /* Handle exact matches and substitutions. */
    655  1.1       agc       for (id = 0; id < tnfa->num_states; id++)
    656  1.1       agc 	{
    657  1.1       agc 	  tre_tnfa_transition_t *trans;
    658  1.1       agc 
    659  1.1       agc 	  if (reach[id].pos < prev_pos)
    660  1.1       agc 	    continue;  /* Not reached. */
    661  1.1       agc 	  for (trans = reach[id].state; trans->state; trans++)
    662  1.1       agc 	    {
    663  1.1       agc 	      int dest_id;
    664  1.1       agc 	      int depth;
    665  1.1       agc 	      int cost, cost0, err;
    666  1.1       agc 
    667  1.1       agc 	      if (trans->assertions
    668  1.1       agc 		  && (CHECK_ASSERTIONS(trans->assertions)
    669  1.1       agc 		      || CHECK_CHAR_CLASSES(trans, tnfa, eflags)))
    670  1.1       agc 		{
    671  1.1       agc 		  DPRINT(("  exact,  from %d: assert failed\n", id));
    672  1.1       agc 		  continue;
    673  1.1       agc 		}
    674  1.1       agc 
    675  1.1       agc 	      depth = reach[id].depth;
    676  1.1       agc 	      dest_id = trans->state_id;
    677  1.1       agc 
    678  1.1       agc 	      cost = reach[id].costs[depth][TRE_M_COST];
    679  1.1       agc 	      cost0 = reach[id].costs[0][TRE_M_COST];
    680  1.1       agc 	      err = 0;
    681  1.1       agc 
    682  1.1       agc 	      if (trans->code_min > (tre_cint_t)prev_c
    683  1.1       agc 		  || trans->code_max < (tre_cint_t)prev_c)
    684  1.1       agc 		{
    685  1.1       agc 		  /* Handle substitutes.  The required character was not in
    686  1.1       agc 		     the string, so match it in place of whatever was supposed
    687  1.1       agc 		     to be there and increase costs accordingly. */
    688  1.1       agc 		  err = 1;
    689  1.1       agc 
    690  1.1       agc 		  /* Compute and check cost at current depth. */
    691  1.1       agc 		  cost = reach[id].costs[depth][TRE_M_COST];
    692  1.1       agc 		  if (reach[id].params.cost_subst != TRE_PARAM_UNSET)
    693  1.1       agc 		    cost += reach[id].params.cost_subst;
    694  1.1       agc 		  if (cost > reach[id].params.max_cost)
    695  1.1       agc 		    continue; /* Cost too large. */
    696  1.1       agc 
    697  1.1       agc 		  /* Check number of substitutes at current depth. */
    698  1.1       agc 		  if (reach[id].costs[depth][TRE_M_NUM_SUBST] + 1
    699  1.1       agc 		      > reach[id].params.max_subst)
    700  1.1       agc 		    continue; /* Too many substitutes. */
    701  1.1       agc 
    702  1.1       agc 		  /* Check total number of errors at current depth. */
    703  1.1       agc 		  if (reach[id].costs[depth][TRE_M_NUM_ERR] + 1
    704  1.1       agc 		      > reach[id].params.max_err)
    705  1.1       agc 		    continue; /* Too many errors. */
    706  1.1       agc 
    707  1.1       agc 		  /* Compute overall cost. */
    708  1.1       agc 		  cost0 = cost;
    709  1.1       agc 		  if (depth > 0)
    710  1.1       agc 		    {
    711  1.1       agc 		      cost0 = reach[id].costs[0][TRE_M_COST];
    712  1.1       agc 		      if (reach[id].params.cost_subst != TRE_PARAM_UNSET)
    713  1.1       agc 			cost0 += reach[id].params.cost_subst;
    714  1.1       agc 		      else
    715  1.1       agc 			cost0 += default_params.cost_subst;
    716  1.1       agc 		    }
    717  1.1       agc 		  DPRINT(("  subst,  from %03d to %03d, cost %d: ",
    718  1.1       agc 			  id, dest_id, cost0));
    719  1.1       agc 		}
    720  1.1       agc 	      else
    721  1.1       agc 		DPRINT(("  exact,  from %03d to %03d, cost %d: ",
    722  1.1       agc 			id, dest_id, cost0));
    723  1.1       agc 
    724  1.1       agc 	      /* Compute tag values after this transition. */
    725  1.1       agc 	      for (i = 0; i < num_tags; i++)
    726  1.1       agc 		tmp_tags[i] = reach[id].tags[i];
    727  1.1       agc 	      if (trans->tags)
    728  1.1       agc 		for (i = 0; trans->tags[i] >= 0; i++)
    729  1.2  christos 		  if ((size_t)trans->tags[i] < num_tags)
    730  1.1       agc 		    tmp_tags[trans->tags[i]] = pos;
    731  1.1       agc 
    732  1.1       agc 	      /* If another path has also reached this state, choose the
    733  1.1       agc 		 one with the smallest cost or best tags if costs are equal. */
    734  1.1       agc 	      if (reach_next[dest_id].pos == pos
    735  1.1       agc 		  && (cost0 > reach_next[dest_id].costs[0][TRE_M_COST]
    736  1.1       agc 		      || (cost0 == reach_next[dest_id].costs[0][TRE_M_COST]
    737  1.1       agc 			  && !tre_tag_order(num_tags, tnfa->tag_directions,
    738  1.1       agc 					    tmp_tags,
    739  1.1       agc 					    reach_next[dest_id].tags))))
    740  1.1       agc 		{
    741  1.1       agc 		  DPRINT(("lose\n"));
    742  1.1       agc 		  continue;
    743  1.1       agc 		}
    744  1.1       agc 	      DPRINT(("win %d %d\n",
    745  1.1       agc 		      reach_next[dest_id].pos,
    746  1.1       agc 		      reach_next[dest_id].costs[0][TRE_M_COST]));
    747  1.1       agc 
    748  1.1       agc 	      /* Set state, position, tags, and depth. */
    749  1.1       agc 	      reach_next[dest_id].state = trans->state;
    750  1.1       agc 	      reach_next[dest_id].pos = pos;
    751  1.1       agc 	      for (i = 0; i < num_tags; i++)
    752  1.1       agc 		reach_next[dest_id].tags[i] = tmp_tags[i];
    753  1.1       agc 	      reach_next[dest_id].depth = reach[id].depth;
    754  1.1       agc 
    755  1.1       agc 	      /* Set parameters. */
    756  1.1       agc 	      reach_next[dest_id].params = reach[id].params;
    757  1.1       agc 	      if (trans->params)
    758  1.1       agc 		tre_set_params(&reach_next[dest_id], trans->params,
    759  1.1       agc 			       default_params);
    760  1.1       agc 
    761  1.1       agc 	      /* Set the costs after this transition. */
    762  1.7     joerg 	      memcpy(&reach_next[dest_id].costs,
    763  1.7     joerg 		     reach[id].costs,
    764  1.7     joerg 		     sizeof(reach[id].costs[0][0])
    765  1.7     joerg 		     * TRE_M_LAST * (depth + 1));
    766  1.1       agc 	      reach_next[dest_id].costs[depth][TRE_M_COST] = cost;
    767  1.1       agc 	      reach_next[dest_id].costs[depth][TRE_M_NUM_SUBST] += err;
    768  1.1       agc 	      reach_next[dest_id].costs[depth][TRE_M_NUM_ERR] += err;
    769  1.1       agc 	      if (depth > 0)
    770  1.1       agc 		{
    771  1.1       agc 		  reach_next[dest_id].costs[0][TRE_M_COST] = cost0;
    772  1.1       agc 		  reach_next[dest_id].costs[0][TRE_M_NUM_SUBST] += err;
    773  1.1       agc 		  reach_next[dest_id].costs[0][TRE_M_NUM_ERR] += err;
    774  1.1       agc 		}
    775  1.1       agc 
    776  1.1       agc 	      if (trans->state == tnfa->final
    777  1.1       agc 		  && (match_eo < 0
    778  1.1       agc 		      || cost0 < match_costs[TRE_M_COST]
    779  1.1       agc 		      || (cost0 == match_costs[TRE_M_COST]
    780  1.1       agc 			  && num_tags > 0 && tmp_tags[0] <= match_tags[0])))
    781  1.1       agc 		{
    782  1.1       agc 		  DPRINT(("    setting new match at %d, cost %d\n",
    783  1.1       agc 			  pos, cost0));
    784  1.1       agc 		  match_eo = pos;
    785  1.1       agc 		  for (i = 0; i < TRE_M_LAST; i++)
    786  1.1       agc 		    match_costs[i] = reach_next[dest_id].costs[0][i];
    787  1.1       agc 		  for (i = 0; i < num_tags; i++)
    788  1.1       agc 		    match_tags[i] = tmp_tags[i];
    789  1.1       agc 		}
    790  1.1       agc 	    }
    791  1.1       agc 	}
    792  1.1       agc     }
    793  1.1       agc 
    794  1.1       agc   DPRINT(("match end offset = %d, match cost = %d\n", match_eo,
    795  1.1       agc 	  match_costs[TRE_M_COST]));
    796  1.1       agc 
    797  1.1       agc   match->cost = match_costs[TRE_M_COST];
    798  1.1       agc   match->num_ins = match_costs[TRE_M_NUM_INS];
    799  1.1       agc   match->num_del = match_costs[TRE_M_NUM_DEL];
    800  1.1       agc   match->num_subst = match_costs[TRE_M_NUM_SUBST];
    801  1.1       agc   *match_end_ofs = match_eo;
    802  1.1       agc 
    803  1.4       rin   ret = match_eo >= 0 ? REG_OK : REG_NOMATCH;
    804  1.4       rin error_exit:
    805  1.4       rin #ifndef TRE_USE_ALLOCA
    806  1.4       rin   if (buf)
    807  1.4       rin     xfree(buf);
    808  1.4       rin #endif /* !TRE_USE_ALLOCA */
    809  1.4       rin   return ret;
    810  1.1       agc }
    811