Home | History | Annotate | Line # | Download | only in libgrep
      1  1.1  christos /* kwset.c - search for any of a set of keywords.
      2  1.1  christos    Copyright 1989, 1998, 2000, 2005 Free Software Foundation, Inc.
      3  1.1  christos 
      4  1.1  christos    This program is free software; you can redistribute it and/or modify
      5  1.1  christos    it under the terms of the GNU General Public License as published by
      6  1.1  christos    the Free Software Foundation; either version 2, or (at your option)
      7  1.1  christos    any later version.
      8  1.1  christos 
      9  1.1  christos    This program is distributed in the hope that it will be useful,
     10  1.1  christos    but WITHOUT ANY WARRANTY; without even the implied warranty of
     11  1.1  christos    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     12  1.1  christos    GNU General Public License for more details.
     13  1.1  christos 
     14  1.1  christos    You should have received a copy of the GNU General Public License
     15  1.1  christos    along with this program; if not, write to the Free Software Foundation,
     16  1.1  christos    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
     17  1.1  christos 
     18  1.1  christos /* Written August 1989 by Mike Haertel.
     19  1.1  christos    The author may be reached (Email) at the address mike (at) ai.mit.edu,
     20  1.1  christos    or (US mail) as Mike Haertel c/o Free Software Foundation. */
     21  1.1  christos 
     22  1.1  christos /* The algorithm implemented by these routines bears a startling resemblence
     23  1.1  christos    to one discovered by Beate Commentz-Walter, although it is not identical.
     24  1.1  christos    See "A String Matching Algorithm Fast on the Average," Technical Report,
     25  1.1  christos    IBM-Germany, Scientific Center Heidelberg, Tiergartenstrasse 15, D-6900
     26  1.1  christos    Heidelberg, Germany.  See also Aho, A.V., and M. Corasick, "Efficient
     27  1.1  christos    String Matching:  An Aid to Bibliographic Search," CACM June 1975,
     28  1.1  christos    Vol. 18, No. 6, which describes the failure function used below. */
     29  1.1  christos 
     30  1.1  christos #ifdef HAVE_CONFIG_H
     31  1.1  christos # include <config.h>
     32  1.1  christos #endif
     33  1.1  christos #include <sys/types.h>
     34  1.1  christos #include "kwset.h"
     35  1.1  christos #include <limits.h>
     36  1.1  christos #include <stdlib.h>
     37  1.1  christos #include "obstack.h"
     38  1.1  christos #include "gettext.h"
     39  1.1  christos #define _(str) gettext (str)
     40  1.1  christos 
     41  1.1  christos #ifdef GREP
     42  1.1  christos extern char *xmalloc();
     43  1.1  christos # undef malloc
     44  1.1  christos # define malloc xmalloc
     45  1.1  christos #endif
     46  1.1  christos 
     47  1.1  christos #define NCHAR (UCHAR_MAX + 1)
     48  1.1  christos #define obstack_chunk_alloc malloc
     49  1.1  christos #define obstack_chunk_free free
     50  1.1  christos 
     51  1.1  christos /* Balanced tree of edges and labels leaving a given trie node. */
     52  1.1  christos struct tree
     53  1.1  christos {
     54  1.1  christos   struct tree *llink;		/* Left link; MUST be first field. */
     55  1.1  christos   struct tree *rlink;		/* Right link (to larger labels). */
     56  1.1  christos   struct trie *trie;		/* Trie node pointed to by this edge. */
     57  1.1  christos   unsigned char label;		/* Label on this edge. */
     58  1.1  christos   char balance;			/* Difference in depths of subtrees. */
     59  1.1  christos };
     60  1.1  christos 
     61  1.1  christos /* Node of a trie representing a set of reversed keywords. */
     62  1.1  christos struct trie
     63  1.1  christos {
     64  1.1  christos   unsigned int accepting;	/* Word index of accepted word, or zero. */
     65  1.1  christos   struct tree *links;		/* Tree of edges leaving this node. */
     66  1.1  christos   struct trie *parent;		/* Parent of this node. */
     67  1.1  christos   struct trie *next;		/* List of all trie nodes in level order. */
     68  1.1  christos   struct trie *fail;		/* Aho-Corasick failure function. */
     69  1.1  christos   int depth;			/* Depth of this node from the root. */
     70  1.1  christos   int shift;			/* Shift function for search failures. */
     71  1.1  christos   int maxshift;			/* Max shift of self and descendents. */
     72  1.1  christos };
     73  1.1  christos 
     74  1.1  christos /* Structure returned opaquely to the caller, containing everything. */
     75  1.1  christos struct kwset
     76  1.1  christos {
     77  1.1  christos   struct obstack obstack;	/* Obstack for node allocation. */
     78  1.1  christos   int words;			/* Number of words in the trie. */
     79  1.1  christos   struct trie *trie;		/* The trie itself. */
     80  1.1  christos   int mind;			/* Minimum depth of an accepting node. */
     81  1.1  christos   int maxd;			/* Maximum depth of any node. */
     82  1.1  christos   unsigned char delta[NCHAR];	/* Delta table for rapid search. */
     83  1.1  christos   struct trie *next[NCHAR];	/* Table of children of the root. */
     84  1.1  christos   char *target;			/* Target string if there's only one. */
     85  1.1  christos   int mind2;			/* Used in Boyer-Moore search for one string. */
     86  1.1  christos   char const *trans;		/* Character translation table. */
     87  1.1  christos };
     88  1.1  christos 
     89  1.1  christos /* Allocate and initialize a keyword set object, returning an opaque
     90  1.1  christos    pointer to it.  Return NULL if memory is not available. */
     91  1.1  christos kwset_t
     92  1.1  christos kwsalloc (char const *trans)
     93  1.1  christos {
     94  1.1  christos   struct kwset *kwset;
     95  1.1  christos 
     96  1.1  christos   kwset = (struct kwset *) malloc(sizeof (struct kwset));
     97  1.1  christos   if (!kwset)
     98  1.1  christos     return 0;
     99  1.1  christos 
    100  1.1  christos   obstack_init(&kwset->obstack);
    101  1.1  christos   kwset->words = 0;
    102  1.1  christos   kwset->trie
    103  1.1  christos     = (struct trie *) obstack_alloc(&kwset->obstack, sizeof (struct trie));
    104  1.1  christos   if (!kwset->trie)
    105  1.1  christos     {
    106  1.1  christos       kwsfree((kwset_t) kwset);
    107  1.1  christos       return 0;
    108  1.1  christos     }
    109  1.1  christos   kwset->trie->accepting = 0;
    110  1.1  christos   kwset->trie->links = 0;
    111  1.1  christos   kwset->trie->parent = 0;
    112  1.1  christos   kwset->trie->next = 0;
    113  1.1  christos   kwset->trie->fail = 0;
    114  1.1  christos   kwset->trie->depth = 0;
    115  1.1  christos   kwset->trie->shift = 0;
    116  1.1  christos   kwset->mind = INT_MAX;
    117  1.1  christos   kwset->maxd = -1;
    118  1.1  christos   kwset->target = 0;
    119  1.1  christos   kwset->trans = trans;
    120  1.1  christos 
    121  1.1  christos   return (kwset_t) kwset;
    122  1.1  christos }
    123  1.1  christos 
    124  1.1  christos /* Add the given string to the contents of the keyword set.  Return NULL
    125  1.1  christos    for success, an error message otherwise. */
    126  1.1  christos const char *
    127  1.1  christos kwsincr (kwset_t kws, char const *text, size_t len)
    128  1.1  christos {
    129  1.1  christos   struct kwset *kwset;
    130  1.1  christos   register struct trie *trie;
    131  1.1  christos   register unsigned char label;
    132  1.1  christos   register struct tree *link;
    133  1.1  christos   register int depth;
    134  1.1  christos   struct tree *links[12];
    135  1.1  christos   enum { L, R } dirs[12];
    136  1.1  christos   struct tree *t, *r, *l, *rl, *lr;
    137  1.1  christos 
    138  1.1  christos   kwset = (struct kwset *) kws;
    139  1.1  christos   trie = kwset->trie;
    140  1.1  christos   text += len;
    141  1.1  christos 
    142  1.1  christos   /* Descend the trie (built of reversed keywords) character-by-character,
    143  1.1  christos      installing new nodes when necessary. */
    144  1.1  christos   while (len--)
    145  1.1  christos     {
    146  1.1  christos       label = kwset->trans ? kwset->trans[(unsigned char) *--text] : *--text;
    147  1.1  christos 
    148  1.1  christos       /* Descend the tree of outgoing links for this trie node,
    149  1.1  christos 	 looking for the current character and keeping track
    150  1.1  christos 	 of the path followed. */
    151  1.1  christos       link = trie->links;
    152  1.1  christos       links[0] = (struct tree *) &trie->links;
    153  1.1  christos       dirs[0] = L;
    154  1.1  christos       depth = 1;
    155  1.1  christos 
    156  1.1  christos       while (link && label != link->label)
    157  1.1  christos 	{
    158  1.1  christos 	  links[depth] = link;
    159  1.1  christos 	  if (label < link->label)
    160  1.1  christos 	    dirs[depth++] = L, link = link->llink;
    161  1.1  christos 	  else
    162  1.1  christos 	    dirs[depth++] = R, link = link->rlink;
    163  1.1  christos 	}
    164  1.1  christos 
    165  1.1  christos       /* The current character doesn't have an outgoing link at
    166  1.1  christos 	 this trie node, so build a new trie node and install
    167  1.1  christos 	 a link in the current trie node's tree. */
    168  1.1  christos       if (!link)
    169  1.1  christos 	{
    170  1.1  christos 	  link = (struct tree *) obstack_alloc(&kwset->obstack,
    171  1.1  christos 					       sizeof (struct tree));
    172  1.1  christos 	  if (!link)
    173  1.1  christos 	    return _("memory exhausted");
    174  1.1  christos 	  link->llink = 0;
    175  1.1  christos 	  link->rlink = 0;
    176  1.1  christos 	  link->trie = (struct trie *) obstack_alloc(&kwset->obstack,
    177  1.1  christos 						     sizeof (struct trie));
    178  1.1  christos 	  if (!link->trie)
    179  1.1  christos 	    return _("memory exhausted");
    180  1.1  christos 	  link->trie->accepting = 0;
    181  1.1  christos 	  link->trie->links = 0;
    182  1.1  christos 	  link->trie->parent = trie;
    183  1.1  christos 	  link->trie->next = 0;
    184  1.1  christos 	  link->trie->fail = 0;
    185  1.1  christos 	  link->trie->depth = trie->depth + 1;
    186  1.1  christos 	  link->trie->shift = 0;
    187  1.1  christos 	  link->label = label;
    188  1.1  christos 	  link->balance = 0;
    189  1.1  christos 
    190  1.1  christos 	  /* Install the new tree node in its parent. */
    191  1.1  christos 	  if (dirs[--depth] == L)
    192  1.1  christos 	    links[depth]->llink = link;
    193  1.1  christos 	  else
    194  1.1  christos 	    links[depth]->rlink = link;
    195  1.1  christos 
    196  1.1  christos 	  /* Back up the tree fixing the balance flags. */
    197  1.1  christos 	  while (depth && !links[depth]->balance)
    198  1.1  christos 	    {
    199  1.1  christos 	      if (dirs[depth] == L)
    200  1.1  christos 		--links[depth]->balance;
    201  1.1  christos 	      else
    202  1.1  christos 		++links[depth]->balance;
    203  1.1  christos 	      --depth;
    204  1.1  christos 	    }
    205  1.1  christos 
    206  1.1  christos 	  /* Rebalance the tree by pointer rotations if necessary. */
    207  1.1  christos 	  if (depth && ((dirs[depth] == L && --links[depth]->balance)
    208  1.1  christos 			|| (dirs[depth] == R && ++links[depth]->balance)))
    209  1.1  christos 	    {
    210  1.1  christos 	      switch (links[depth]->balance)
    211  1.1  christos 		{
    212  1.1  christos 		case (char) -2:
    213  1.1  christos 		  switch (dirs[depth + 1])
    214  1.1  christos 		    {
    215  1.1  christos 		    case L:
    216  1.1  christos 		      r = links[depth], t = r->llink, rl = t->rlink;
    217  1.1  christos 		      t->rlink = r, r->llink = rl;
    218  1.1  christos 		      t->balance = r->balance = 0;
    219  1.1  christos 		      break;
    220  1.1  christos 		    case R:
    221  1.1  christos 		      r = links[depth], l = r->llink, t = l->rlink;
    222  1.1  christos 		      rl = t->rlink, lr = t->llink;
    223  1.1  christos 		      t->llink = l, l->rlink = lr, t->rlink = r, r->llink = rl;
    224  1.1  christos 		      l->balance = t->balance != 1 ? 0 : -1;
    225  1.1  christos 		      r->balance = t->balance != (char) -1 ? 0 : 1;
    226  1.1  christos 		      t->balance = 0;
    227  1.1  christos 		      break;
    228  1.1  christos 		    default:
    229  1.1  christos 		      abort ();
    230  1.1  christos 		    }
    231  1.1  christos 		  break;
    232  1.1  christos 		case 2:
    233  1.1  christos 		  switch (dirs[depth + 1])
    234  1.1  christos 		    {
    235  1.1  christos 		    case R:
    236  1.1  christos 		      l = links[depth], t = l->rlink, lr = t->llink;
    237  1.1  christos 		      t->llink = l, l->rlink = lr;
    238  1.1  christos 		      t->balance = l->balance = 0;
    239  1.1  christos 		      break;
    240  1.1  christos 		    case L:
    241  1.1  christos 		      l = links[depth], r = l->rlink, t = r->llink;
    242  1.1  christos 		      lr = t->llink, rl = t->rlink;
    243  1.1  christos 		      t->llink = l, l->rlink = lr, t->rlink = r, r->llink = rl;
    244  1.1  christos 		      l->balance = t->balance != 1 ? 0 : -1;
    245  1.1  christos 		      r->balance = t->balance != (char) -1 ? 0 : 1;
    246  1.1  christos 		      t->balance = 0;
    247  1.1  christos 		      break;
    248  1.1  christos 		    default:
    249  1.1  christos 		      abort ();
    250  1.1  christos 		    }
    251  1.1  christos 		  break;
    252  1.1  christos 		default:
    253  1.1  christos 		  abort ();
    254  1.1  christos 		}
    255  1.1  christos 
    256  1.1  christos 	      if (dirs[depth - 1] == L)
    257  1.1  christos 		links[depth - 1]->llink = t;
    258  1.1  christos 	      else
    259  1.1  christos 		links[depth - 1]->rlink = t;
    260  1.1  christos 	    }
    261  1.1  christos 	}
    262  1.1  christos 
    263  1.1  christos       trie = link->trie;
    264  1.1  christos     }
    265  1.1  christos 
    266  1.1  christos   /* Mark the node we finally reached as accepting, encoding the
    267  1.1  christos      index number of this word in the keyword set so far. */
    268  1.1  christos   if (!trie->accepting)
    269  1.1  christos     trie->accepting = 1 + 2 * kwset->words;
    270  1.1  christos   ++kwset->words;
    271  1.1  christos 
    272  1.1  christos   /* Keep track of the longest and shortest string of the keyword set. */
    273  1.1  christos   if (trie->depth < kwset->mind)
    274  1.1  christos     kwset->mind = trie->depth;
    275  1.1  christos   if (trie->depth > kwset->maxd)
    276  1.1  christos     kwset->maxd = trie->depth;
    277  1.1  christos 
    278  1.1  christos   return 0;
    279  1.1  christos }
    280  1.1  christos 
    281  1.1  christos /* Enqueue the trie nodes referenced from the given tree in the
    282  1.1  christos    given queue. */
    283  1.1  christos static void
    284  1.1  christos enqueue (struct tree *tree, struct trie **last)
    285  1.1  christos {
    286  1.1  christos   if (!tree)
    287  1.1  christos     return;
    288  1.1  christos   enqueue(tree->llink, last);
    289  1.1  christos   enqueue(tree->rlink, last);
    290  1.1  christos   (*last) = (*last)->next = tree->trie;
    291  1.1  christos }
    292  1.1  christos 
    293  1.1  christos /* Compute the Aho-Corasick failure function for the trie nodes referenced
    294  1.1  christos    from the given tree, given the failure function for their parent as
    295  1.1  christos    well as a last resort failure node. */
    296  1.1  christos static void
    297  1.1  christos treefails (register struct tree const *tree, struct trie const *fail,
    298  1.1  christos 	   struct trie *recourse)
    299  1.1  christos {
    300  1.1  christos   register struct tree *link;
    301  1.1  christos 
    302  1.1  christos   if (!tree)
    303  1.1  christos     return;
    304  1.1  christos 
    305  1.1  christos   treefails(tree->llink, fail, recourse);
    306  1.1  christos   treefails(tree->rlink, fail, recourse);
    307  1.1  christos 
    308  1.1  christos   /* Find, in the chain of fails going back to the root, the first
    309  1.1  christos      node that has a descendent on the current label. */
    310  1.1  christos   while (fail)
    311  1.1  christos     {
    312  1.1  christos       link = fail->links;
    313  1.1  christos       while (link && tree->label != link->label)
    314  1.1  christos 	if (tree->label < link->label)
    315  1.1  christos 	  link = link->llink;
    316  1.1  christos 	else
    317  1.1  christos 	  link = link->rlink;
    318  1.1  christos       if (link)
    319  1.1  christos 	{
    320  1.1  christos 	  tree->trie->fail = link->trie;
    321  1.1  christos 	  return;
    322  1.1  christos 	}
    323  1.1  christos       fail = fail->fail;
    324  1.1  christos     }
    325  1.1  christos 
    326  1.1  christos   tree->trie->fail = recourse;
    327  1.1  christos }
    328  1.1  christos 
    329  1.1  christos /* Set delta entries for the links of the given tree such that
    330  1.1  christos    the preexisting delta value is larger than the current depth. */
    331  1.1  christos static void
    332  1.1  christos treedelta (register struct tree const *tree,
    333  1.1  christos 	   register unsigned int depth,
    334  1.1  christos 	   unsigned char delta[])
    335  1.1  christos {
    336  1.1  christos   if (!tree)
    337  1.1  christos     return;
    338  1.1  christos   treedelta(tree->llink, depth, delta);
    339  1.1  christos   treedelta(tree->rlink, depth, delta);
    340  1.1  christos   if (depth < delta[tree->label])
    341  1.1  christos     delta[tree->label] = depth;
    342  1.1  christos }
    343  1.1  christos 
    344  1.1  christos /* Return true if A has every label in B. */
    345  1.1  christos static int
    346  1.1  christos hasevery (register struct tree const *a, register struct tree const *b)
    347  1.1  christos {
    348  1.1  christos   if (!b)
    349  1.1  christos     return 1;
    350  1.1  christos   if (!hasevery(a, b->llink))
    351  1.1  christos     return 0;
    352  1.1  christos   if (!hasevery(a, b->rlink))
    353  1.1  christos     return 0;
    354  1.1  christos   while (a && b->label != a->label)
    355  1.1  christos     if (b->label < a->label)
    356  1.1  christos       a = a->llink;
    357  1.1  christos     else
    358  1.1  christos       a = a->rlink;
    359  1.1  christos   return !!a;
    360  1.1  christos }
    361  1.1  christos 
    362  1.1  christos /* Compute a vector, indexed by character code, of the trie nodes
    363  1.1  christos    referenced from the given tree. */
    364  1.1  christos static void
    365  1.1  christos treenext (struct tree const *tree, struct trie *next[])
    366  1.1  christos {
    367  1.1  christos   if (!tree)
    368  1.1  christos     return;
    369  1.1  christos   treenext(tree->llink, next);
    370  1.1  christos   treenext(tree->rlink, next);
    371  1.1  christos   next[tree->label] = tree->trie;
    372  1.1  christos }
    373  1.1  christos 
    374  1.1  christos /* Compute the shift for each trie node, as well as the delta
    375  1.1  christos    table and next cache for the given keyword set. */
    376  1.1  christos const char *
    377  1.1  christos kwsprep (kwset_t kws)
    378  1.1  christos {
    379  1.1  christos   register struct kwset *kwset;
    380  1.1  christos   register int i;
    381  1.1  christos   register struct trie *curr, *fail;
    382  1.1  christos   register char const *trans;
    383  1.1  christos   unsigned char delta[NCHAR];
    384  1.1  christos   struct trie *last, *next[NCHAR];
    385  1.1  christos 
    386  1.1  christos   kwset = (struct kwset *) kws;
    387  1.1  christos 
    388  1.1  christos   /* Initial values for the delta table; will be changed later.  The
    389  1.1  christos      delta entry for a given character is the smallest depth of any
    390  1.1  christos      node at which an outgoing edge is labeled by that character. */
    391  1.1  christos   if (kwset->mind < 256)
    392  1.1  christos     for (i = 0; i < NCHAR; ++i)
    393  1.1  christos       delta[i] = kwset->mind;
    394  1.1  christos   else
    395  1.1  christos     for (i = 0; i < NCHAR; ++i)
    396  1.1  christos       delta[i] = 255;
    397  1.1  christos 
    398  1.1  christos   /* Check if we can use the simple boyer-moore algorithm, instead
    399  1.1  christos      of the hairy commentz-walter algorithm. */
    400  1.1  christos   if (kwset->words == 1 && kwset->trans == 0)
    401  1.1  christos     {
    402  1.1  christos       /* Looking for just one string.  Extract it from the trie. */
    403  1.1  christos       kwset->target = obstack_alloc(&kwset->obstack, kwset->mind);
    404  1.1  christos       for (i = kwset->mind - 1, curr = kwset->trie; i >= 0; --i)
    405  1.1  christos 	{
    406  1.1  christos 	  kwset->target[i] = curr->links->label;
    407  1.1  christos 	  curr = curr->links->trie;
    408  1.1  christos 	}
    409  1.1  christos       /* Build the Boyer Moore delta.  Boy that's easy compared to CW. */
    410  1.1  christos       for (i = 0; i < kwset->mind; ++i)
    411  1.1  christos 	delta[(unsigned char) kwset->target[i]] = kwset->mind - (i + 1);
    412  1.1  christos       kwset->mind2 = kwset->mind;
    413  1.1  christos       /* Find the minimal delta2 shift that we might make after
    414  1.1  christos 	 a backwards match has failed. */
    415  1.1  christos       for (i = 0; i < kwset->mind - 1; ++i)
    416  1.1  christos 	if (kwset->target[i] == kwset->target[kwset->mind - 1])
    417  1.1  christos 	  kwset->mind2 = kwset->mind - (i + 1);
    418  1.1  christos     }
    419  1.1  christos   else
    420  1.1  christos     {
    421  1.1  christos       /* Traverse the nodes of the trie in level order, simultaneously
    422  1.1  christos 	 computing the delta table, failure function, and shift function. */
    423  1.1  christos       for (curr = last = kwset->trie; curr; curr = curr->next)
    424  1.1  christos 	{
    425  1.1  christos 	  /* Enqueue the immediate descendents in the level order queue. */
    426  1.1  christos 	  enqueue(curr->links, &last);
    427  1.1  christos 
    428  1.1  christos 	  curr->shift = kwset->mind;
    429  1.1  christos 	  curr->maxshift = kwset->mind;
    430  1.1  christos 
    431  1.1  christos 	  /* Update the delta table for the descendents of this node. */
    432  1.1  christos 	  treedelta(curr->links, curr->depth, delta);
    433  1.1  christos 
    434  1.1  christos 	  /* Compute the failure function for the decendents of this node. */
    435  1.1  christos 	  treefails(curr->links, curr->fail, kwset->trie);
    436  1.1  christos 
    437  1.1  christos 	  /* Update the shifts at each node in the current node's chain
    438  1.1  christos 	     of fails back to the root. */
    439  1.1  christos 	  for (fail = curr->fail; fail; fail = fail->fail)
    440  1.1  christos 	    {
    441  1.1  christos 	      /* If the current node has some outgoing edge that the fail
    442  1.1  christos 		 doesn't, then the shift at the fail should be no larger
    443  1.1  christos 		 than the difference of their depths. */
    444  1.1  christos 	      if (!hasevery(fail->links, curr->links))
    445  1.1  christos 		if (curr->depth - fail->depth < fail->shift)
    446  1.1  christos 		  fail->shift = curr->depth - fail->depth;
    447  1.1  christos 
    448  1.1  christos 	      /* If the current node is accepting then the shift at the
    449  1.1  christos 		 fail and its descendents should be no larger than the
    450  1.1  christos 		 difference of their depths. */
    451  1.1  christos 	      if (curr->accepting && fail->maxshift > curr->depth - fail->depth)
    452  1.1  christos 		fail->maxshift = curr->depth - fail->depth;
    453  1.1  christos 	    }
    454  1.1  christos 	}
    455  1.1  christos 
    456  1.1  christos       /* Traverse the trie in level order again, fixing up all nodes whose
    457  1.1  christos 	 shift exceeds their inherited maxshift. */
    458  1.1  christos       for (curr = kwset->trie->next; curr; curr = curr->next)
    459  1.1  christos 	{
    460  1.1  christos 	  if (curr->maxshift > curr->parent->maxshift)
    461  1.1  christos 	    curr->maxshift = curr->parent->maxshift;
    462  1.1  christos 	  if (curr->shift > curr->maxshift)
    463  1.1  christos 	    curr->shift = curr->maxshift;
    464  1.1  christos 	}
    465  1.1  christos 
    466  1.1  christos       /* Create a vector, indexed by character code, of the outgoing links
    467  1.1  christos 	 from the root node. */
    468  1.1  christos       for (i = 0; i < NCHAR; ++i)
    469  1.1  christos 	next[i] = 0;
    470  1.1  christos       treenext(kwset->trie->links, next);
    471  1.1  christos 
    472  1.1  christos       if ((trans = kwset->trans) != 0)
    473  1.1  christos 	for (i = 0; i < NCHAR; ++i)
    474  1.1  christos 	  kwset->next[i] = next[(unsigned char) trans[i]];
    475  1.1  christos       else
    476  1.1  christos 	for (i = 0; i < NCHAR; ++i)
    477  1.1  christos 	  kwset->next[i] = next[i];
    478  1.1  christos     }
    479  1.1  christos 
    480  1.1  christos   /* Fix things up for any translation table. */
    481  1.1  christos   if ((trans = kwset->trans) != 0)
    482  1.1  christos     for (i = 0; i < NCHAR; ++i)
    483  1.1  christos       kwset->delta[i] = delta[(unsigned char) trans[i]];
    484  1.1  christos   else
    485  1.1  christos     for (i = 0; i < NCHAR; ++i)
    486  1.1  christos       kwset->delta[i] = delta[i];
    487  1.1  christos 
    488  1.1  christos   return 0;
    489  1.1  christos }
    490  1.1  christos 
    491  1.1  christos #define U(C) ((unsigned char) (C))
    492  1.1  christos 
    493  1.1  christos /* Fast boyer-moore search. */
    494  1.1  christos static size_t
    495  1.1  christos bmexec (kwset_t kws, char const *text, size_t size)
    496  1.1  christos {
    497  1.1  christos   struct kwset const *kwset;
    498  1.1  christos   register unsigned char const *d1;
    499  1.1  christos   register char const *ep, *sp, *tp;
    500  1.1  christos   register int d, gc, i, len, md2;
    501  1.1  christos 
    502  1.1  christos   kwset = (struct kwset const *) kws;
    503  1.1  christos   len = kwset->mind;
    504  1.1  christos 
    505  1.1  christos   if (len == 0)
    506  1.1  christos     return 0;
    507  1.1  christos   if (len > size)
    508  1.1  christos     return -1;
    509  1.1  christos   if (len == 1)
    510  1.1  christos     {
    511  1.1  christos       tp = memchr (text, kwset->target[0], size);
    512  1.1  christos       return tp ? tp - text : -1;
    513  1.1  christos     }
    514  1.1  christos 
    515  1.1  christos   d1 = kwset->delta;
    516  1.1  christos   sp = kwset->target + len;
    517  1.1  christos   gc = U(sp[-2]);
    518  1.1  christos   md2 = kwset->mind2;
    519  1.1  christos   tp = text + len;
    520  1.1  christos 
    521  1.1  christos   /* Significance of 12: 1 (initial offset) + 10 (skip loop) + 1 (md2). */
    522  1.1  christos   if (size > 12 * len)
    523  1.1  christos     /* 11 is not a bug, the initial offset happens only once. */
    524  1.1  christos     for (ep = text + size - 11 * len;;)
    525  1.1  christos       {
    526  1.1  christos 	while (tp <= ep)
    527  1.1  christos 	  {
    528  1.1  christos 	    d = d1[U(tp[-1])], tp += d;
    529  1.1  christos 	    d = d1[U(tp[-1])], tp += d;
    530  1.1  christos 	    if (d == 0)
    531  1.1  christos 	      goto found;
    532  1.1  christos 	    d = d1[U(tp[-1])], tp += d;
    533  1.1  christos 	    d = d1[U(tp[-1])], tp += d;
    534  1.1  christos 	    d = d1[U(tp[-1])], tp += d;
    535  1.1  christos 	    if (d == 0)
    536  1.1  christos 	      goto found;
    537  1.1  christos 	    d = d1[U(tp[-1])], tp += d;
    538  1.1  christos 	    d = d1[U(tp[-1])], tp += d;
    539  1.1  christos 	    d = d1[U(tp[-1])], tp += d;
    540  1.1  christos 	    if (d == 0)
    541  1.1  christos 	      goto found;
    542  1.1  christos 	    d = d1[U(tp[-1])], tp += d;
    543  1.1  christos 	    d = d1[U(tp[-1])], tp += d;
    544  1.1  christos 	  }
    545  1.1  christos 	break;
    546  1.1  christos       found:
    547  1.1  christos 	if (U(tp[-2]) == gc)
    548  1.1  christos 	  {
    549  1.1  christos 	    for (i = 3; i <= len && U(tp[-i]) == U(sp[-i]); ++i)
    550  1.1  christos 	      ;
    551  1.1  christos 	    if (i > len)
    552  1.1  christos 	      return tp - len - text;
    553  1.1  christos 	  }
    554  1.1  christos 	tp += md2;
    555  1.1  christos       }
    556  1.1  christos 
    557  1.1  christos   /* Now we have only a few characters left to search.  We
    558  1.1  christos      carefully avoid ever producing an out-of-bounds pointer. */
    559  1.1  christos   ep = text + size;
    560  1.1  christos   d = d1[U(tp[-1])];
    561  1.1  christos   while (d <= ep - tp)
    562  1.1  christos     {
    563  1.1  christos       d = d1[U((tp += d)[-1])];
    564  1.1  christos       if (d != 0)
    565  1.1  christos 	continue;
    566  1.1  christos       if (U(tp[-2]) == gc)
    567  1.1  christos 	{
    568  1.1  christos 	  for (i = 3; i <= len && U(tp[-i]) == U(sp[-i]); ++i)
    569  1.1  christos 	    ;
    570  1.1  christos 	  if (i > len)
    571  1.1  christos 	    return tp - len - text;
    572  1.1  christos 	}
    573  1.1  christos       d = md2;
    574  1.1  christos     }
    575  1.1  christos 
    576  1.1  christos   return -1;
    577  1.1  christos }
    578  1.1  christos 
    579  1.1  christos /* Hairy multiple string search. */
    580  1.1  christos static size_t
    581  1.1  christos cwexec (kwset_t kws, char const *text, size_t len, struct kwsmatch *kwsmatch)
    582  1.1  christos {
    583  1.1  christos   struct kwset const *kwset;
    584  1.1  christos   struct trie * const *next;
    585  1.1  christos   struct trie const *trie;
    586  1.1  christos   struct trie const *accept;
    587  1.1  christos   char const *beg, *lim, *mch, *lmch;
    588  1.1  christos   register unsigned char c;
    589  1.1  christos   register unsigned char const *delta;
    590  1.1  christos   register int d;
    591  1.1  christos   register char const *end, *qlim;
    592  1.1  christos   register struct tree const *tree;
    593  1.1  christos   register char const *trans;
    594  1.1  christos 
    595  1.1  christos   accept = NULL;
    596  1.1  christos 
    597  1.1  christos   /* Initialize register copies and look for easy ways out. */
    598  1.1  christos   kwset = (struct kwset *) kws;
    599  1.1  christos   if (len < kwset->mind)
    600  1.1  christos     return -1;
    601  1.1  christos   next = kwset->next;
    602  1.1  christos   delta = kwset->delta;
    603  1.1  christos   trans = kwset->trans;
    604  1.1  christos   lim = text + len;
    605  1.1  christos   end = text;
    606  1.1  christos   if ((d = kwset->mind) != 0)
    607  1.1  christos     mch = 0;
    608  1.1  christos   else
    609  1.1  christos     {
    610  1.1  christos       mch = text, accept = kwset->trie;
    611  1.1  christos       goto match;
    612  1.1  christos     }
    613  1.1  christos 
    614  1.1  christos   if (len >= 4 * kwset->mind)
    615  1.1  christos     qlim = lim - 4 * kwset->mind;
    616  1.1  christos   else
    617  1.1  christos     qlim = 0;
    618  1.1  christos 
    619  1.1  christos   while (lim - end >= d)
    620  1.1  christos     {
    621  1.1  christos       if (qlim && end <= qlim)
    622  1.1  christos 	{
    623  1.1  christos 	  end += d - 1;
    624  1.1  christos 	  while ((d = delta[c = *end]) && end < qlim)
    625  1.1  christos 	    {
    626  1.1  christos 	      end += d;
    627  1.1  christos 	      end += delta[(unsigned char) *end];
    628  1.1  christos 	      end += delta[(unsigned char) *end];
    629  1.1  christos 	    }
    630  1.1  christos 	  ++end;
    631  1.1  christos 	}
    632  1.1  christos       else
    633  1.1  christos 	d = delta[c = (end += d)[-1]];
    634  1.1  christos       if (d)
    635  1.1  christos 	continue;
    636  1.1  christos       beg = end - 1;
    637  1.1  christos       trie = next[c];
    638  1.1  christos       if (trie->accepting)
    639  1.1  christos 	{
    640  1.1  christos 	  mch = beg;
    641  1.1  christos 	  accept = trie;
    642  1.1  christos 	}
    643  1.1  christos       d = trie->shift;
    644  1.1  christos       while (beg > text)
    645  1.1  christos 	{
    646  1.1  christos 	  c = trans ? trans[(unsigned char) *--beg] : *--beg;
    647  1.1  christos 	  tree = trie->links;
    648  1.1  christos 	  while (tree && c != tree->label)
    649  1.1  christos 	    if (c < tree->label)
    650  1.1  christos 	      tree = tree->llink;
    651  1.1  christos 	    else
    652  1.1  christos 	      tree = tree->rlink;
    653  1.1  christos 	  if (tree)
    654  1.1  christos 	    {
    655  1.1  christos 	      trie = tree->trie;
    656  1.1  christos 	      if (trie->accepting)
    657  1.1  christos 		{
    658  1.1  christos 		  mch = beg;
    659  1.1  christos 		  accept = trie;
    660  1.1  christos 		}
    661  1.1  christos 	    }
    662  1.1  christos 	  else
    663  1.1  christos 	    break;
    664  1.1  christos 	  d = trie->shift;
    665  1.1  christos 	}
    666  1.1  christos       if (mch)
    667  1.1  christos 	goto match;
    668  1.1  christos     }
    669  1.1  christos   return -1;
    670  1.1  christos 
    671  1.1  christos  match:
    672  1.1  christos   /* Given a known match, find the longest possible match anchored
    673  1.1  christos      at or before its starting point.  This is nearly a verbatim
    674  1.1  christos      copy of the preceding main search loops. */
    675  1.1  christos   if (lim - mch > kwset->maxd)
    676  1.1  christos     lim = mch + kwset->maxd;
    677  1.1  christos   lmch = 0;
    678  1.1  christos   d = 1;
    679  1.1  christos   while (lim - end >= d)
    680  1.1  christos     {
    681  1.1  christos       if ((d = delta[c = (end += d)[-1]]) != 0)
    682  1.1  christos 	continue;
    683  1.1  christos       beg = end - 1;
    684  1.1  christos       if (!(trie = next[c]))
    685  1.1  christos 	{
    686  1.1  christos 	  d = 1;
    687  1.1  christos 	  continue;
    688  1.1  christos 	}
    689  1.1  christos       if (trie->accepting && beg <= mch)
    690  1.1  christos 	{
    691  1.1  christos 	  lmch = beg;
    692  1.1  christos 	  accept = trie;
    693  1.1  christos 	}
    694  1.1  christos       d = trie->shift;
    695  1.1  christos       while (beg > text)
    696  1.1  christos 	{
    697  1.1  christos 	  c = trans ? trans[(unsigned char) *--beg] : *--beg;
    698  1.1  christos 	  tree = trie->links;
    699  1.1  christos 	  while (tree && c != tree->label)
    700  1.1  christos 	    if (c < tree->label)
    701  1.1  christos 	      tree = tree->llink;
    702  1.1  christos 	    else
    703  1.1  christos 	      tree = tree->rlink;
    704  1.1  christos 	  if (tree)
    705  1.1  christos 	    {
    706  1.1  christos 	      trie = tree->trie;
    707  1.1  christos 	      if (trie->accepting && beg <= mch)
    708  1.1  christos 		{
    709  1.1  christos 		  lmch = beg;
    710  1.1  christos 		  accept = trie;
    711  1.1  christos 		}
    712  1.1  christos 	    }
    713  1.1  christos 	  else
    714  1.1  christos 	    break;
    715  1.1  christos 	  d = trie->shift;
    716  1.1  christos 	}
    717  1.1  christos       if (lmch)
    718  1.1  christos 	{
    719  1.1  christos 	  mch = lmch;
    720  1.1  christos 	  goto match;
    721  1.1  christos 	}
    722  1.1  christos       if (!d)
    723  1.1  christos 	d = 1;
    724  1.1  christos     }
    725  1.1  christos 
    726  1.1  christos   if (kwsmatch)
    727  1.1  christos     {
    728  1.1  christos       kwsmatch->index = accept->accepting / 2;
    729  1.1  christos       kwsmatch->offset[0] = mch - text;
    730  1.1  christos       kwsmatch->size[0] = accept->depth;
    731  1.1  christos     }
    732  1.1  christos   return mch - text;
    733  1.1  christos }
    734  1.1  christos 
    735  1.1  christos /* Search through the given text for a match of any member of the
    736  1.1  christos    given keyword set.  Return a pointer to the first character of
    737  1.1  christos    the matching substring, or NULL if no match is found.  If FOUNDLEN
    738  1.1  christos    is non-NULL store in the referenced location the length of the
    739  1.1  christos    matching substring.  Similarly, if FOUNDIDX is non-NULL, store
    740  1.1  christos    in the referenced location the index number of the particular
    741  1.1  christos    keyword matched. */
    742  1.1  christos size_t
    743  1.1  christos kwsexec (kwset_t kws, char const *text, size_t size,
    744  1.1  christos 	 struct kwsmatch *kwsmatch)
    745  1.1  christos {
    746  1.1  christos   struct kwset const *kwset = (struct kwset *) kws;
    747  1.1  christos   if (kwset->words == 1 && kwset->trans == 0)
    748  1.1  christos     {
    749  1.1  christos       size_t ret = bmexec (kws, text, size);
    750  1.1  christos       if (kwsmatch != 0 && ret != (size_t) -1)
    751  1.1  christos 	{
    752  1.1  christos 	  kwsmatch->index = 0;
    753  1.1  christos 	  kwsmatch->offset[0] = ret;
    754  1.1  christos 	  kwsmatch->size[0] = kwset->mind;
    755  1.1  christos 	}
    756  1.1  christos       return ret;
    757  1.1  christos     }
    758  1.1  christos   else
    759  1.1  christos     return cwexec(kws, text, size, kwsmatch);
    760  1.1  christos }
    761  1.1  christos 
    762  1.1  christos /* Free the components of the given keyword set. */
    763  1.1  christos void
    764  1.1  christos kwsfree (kwset_t kws)
    765  1.1  christos {
    766  1.1  christos   struct kwset *kwset;
    767  1.1  christos 
    768  1.1  christos   kwset = (struct kwset *) kws;
    769  1.1  christos   obstack_free(&kwset->obstack, 0);
    770  1.1  christos   free(kws);
    771  1.1  christos }
    772