1 1.1 christos /* kwset.c - search for any of a set of keywords. 2 1.1 christos Copyright 1989, 1998, 2000, 2005 Free Software Foundation, Inc. 3 1.1 christos 4 1.1 christos This program is free software; you can redistribute it and/or modify 5 1.1 christos it under the terms of the GNU General Public License as published by 6 1.1 christos the Free Software Foundation; either version 2, or (at your option) 7 1.1 christos any later version. 8 1.1 christos 9 1.1 christos This program is distributed in the hope that it will be useful, 10 1.1 christos but WITHOUT ANY WARRANTY; without even the implied warranty of 11 1.1 christos MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 1.1 christos GNU General Public License for more details. 13 1.1 christos 14 1.1 christos You should have received a copy of the GNU General Public License 15 1.1 christos along with this program; if not, write to the Free Software Foundation, 16 1.1 christos Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ 17 1.1 christos 18 1.1 christos /* Written August 1989 by Mike Haertel. 19 1.1 christos The author may be reached (Email) at the address mike (at) ai.mit.edu, 20 1.1 christos or (US mail) as Mike Haertel c/o Free Software Foundation. */ 21 1.1 christos 22 1.1 christos /* The algorithm implemented by these routines bears a startling resemblence 23 1.1 christos to one discovered by Beate Commentz-Walter, although it is not identical. 24 1.1 christos See "A String Matching Algorithm Fast on the Average," Technical Report, 25 1.1 christos IBM-Germany, Scientific Center Heidelberg, Tiergartenstrasse 15, D-6900 26 1.1 christos Heidelberg, Germany. See also Aho, A.V., and M. Corasick, "Efficient 27 1.1 christos String Matching: An Aid to Bibliographic Search," CACM June 1975, 28 1.1 christos Vol. 18, No. 6, which describes the failure function used below. */ 29 1.1 christos 30 1.1 christos #ifdef HAVE_CONFIG_H 31 1.1 christos # include <config.h> 32 1.1 christos #endif 33 1.1 christos #include <sys/types.h> 34 1.1 christos #include "kwset.h" 35 1.1 christos #include <limits.h> 36 1.1 christos #include <stdlib.h> 37 1.1 christos #include "obstack.h" 38 1.1 christos #include "gettext.h" 39 1.1 christos #define _(str) gettext (str) 40 1.1 christos 41 1.1 christos #ifdef GREP 42 1.1 christos extern char *xmalloc(); 43 1.1 christos # undef malloc 44 1.1 christos # define malloc xmalloc 45 1.1 christos #endif 46 1.1 christos 47 1.1 christos #define NCHAR (UCHAR_MAX + 1) 48 1.1 christos #define obstack_chunk_alloc malloc 49 1.1 christos #define obstack_chunk_free free 50 1.1 christos 51 1.1 christos /* Balanced tree of edges and labels leaving a given trie node. */ 52 1.1 christos struct tree 53 1.1 christos { 54 1.1 christos struct tree *llink; /* Left link; MUST be first field. */ 55 1.1 christos struct tree *rlink; /* Right link (to larger labels). */ 56 1.1 christos struct trie *trie; /* Trie node pointed to by this edge. */ 57 1.1 christos unsigned char label; /* Label on this edge. */ 58 1.1 christos char balance; /* Difference in depths of subtrees. */ 59 1.1 christos }; 60 1.1 christos 61 1.1 christos /* Node of a trie representing a set of reversed keywords. */ 62 1.1 christos struct trie 63 1.1 christos { 64 1.1 christos unsigned int accepting; /* Word index of accepted word, or zero. */ 65 1.1 christos struct tree *links; /* Tree of edges leaving this node. */ 66 1.1 christos struct trie *parent; /* Parent of this node. */ 67 1.1 christos struct trie *next; /* List of all trie nodes in level order. */ 68 1.1 christos struct trie *fail; /* Aho-Corasick failure function. */ 69 1.1 christos int depth; /* Depth of this node from the root. */ 70 1.1 christos int shift; /* Shift function for search failures. */ 71 1.1 christos int maxshift; /* Max shift of self and descendents. */ 72 1.1 christos }; 73 1.1 christos 74 1.1 christos /* Structure returned opaquely to the caller, containing everything. */ 75 1.1 christos struct kwset 76 1.1 christos { 77 1.1 christos struct obstack obstack; /* Obstack for node allocation. */ 78 1.1 christos int words; /* Number of words in the trie. */ 79 1.1 christos struct trie *trie; /* The trie itself. */ 80 1.1 christos int mind; /* Minimum depth of an accepting node. */ 81 1.1 christos int maxd; /* Maximum depth of any node. */ 82 1.1 christos unsigned char delta[NCHAR]; /* Delta table for rapid search. */ 83 1.1 christos struct trie *next[NCHAR]; /* Table of children of the root. */ 84 1.1 christos char *target; /* Target string if there's only one. */ 85 1.1 christos int mind2; /* Used in Boyer-Moore search for one string. */ 86 1.1 christos char const *trans; /* Character translation table. */ 87 1.1 christos }; 88 1.1 christos 89 1.1 christos /* Allocate and initialize a keyword set object, returning an opaque 90 1.1 christos pointer to it. Return NULL if memory is not available. */ 91 1.1 christos kwset_t 92 1.1 christos kwsalloc (char const *trans) 93 1.1 christos { 94 1.1 christos struct kwset *kwset; 95 1.1 christos 96 1.1 christos kwset = (struct kwset *) malloc(sizeof (struct kwset)); 97 1.1 christos if (!kwset) 98 1.1 christos return 0; 99 1.1 christos 100 1.1 christos obstack_init(&kwset->obstack); 101 1.1 christos kwset->words = 0; 102 1.1 christos kwset->trie 103 1.1 christos = (struct trie *) obstack_alloc(&kwset->obstack, sizeof (struct trie)); 104 1.1 christos if (!kwset->trie) 105 1.1 christos { 106 1.1 christos kwsfree((kwset_t) kwset); 107 1.1 christos return 0; 108 1.1 christos } 109 1.1 christos kwset->trie->accepting = 0; 110 1.1 christos kwset->trie->links = 0; 111 1.1 christos kwset->trie->parent = 0; 112 1.1 christos kwset->trie->next = 0; 113 1.1 christos kwset->trie->fail = 0; 114 1.1 christos kwset->trie->depth = 0; 115 1.1 christos kwset->trie->shift = 0; 116 1.1 christos kwset->mind = INT_MAX; 117 1.1 christos kwset->maxd = -1; 118 1.1 christos kwset->target = 0; 119 1.1 christos kwset->trans = trans; 120 1.1 christos 121 1.1 christos return (kwset_t) kwset; 122 1.1 christos } 123 1.1 christos 124 1.1 christos /* Add the given string to the contents of the keyword set. Return NULL 125 1.1 christos for success, an error message otherwise. */ 126 1.1 christos const char * 127 1.1 christos kwsincr (kwset_t kws, char const *text, size_t len) 128 1.1 christos { 129 1.1 christos struct kwset *kwset; 130 1.1 christos register struct trie *trie; 131 1.1 christos register unsigned char label; 132 1.1 christos register struct tree *link; 133 1.1 christos register int depth; 134 1.1 christos struct tree *links[12]; 135 1.1 christos enum { L, R } dirs[12]; 136 1.1 christos struct tree *t, *r, *l, *rl, *lr; 137 1.1 christos 138 1.1 christos kwset = (struct kwset *) kws; 139 1.1 christos trie = kwset->trie; 140 1.1 christos text += len; 141 1.1 christos 142 1.1 christos /* Descend the trie (built of reversed keywords) character-by-character, 143 1.1 christos installing new nodes when necessary. */ 144 1.1 christos while (len--) 145 1.1 christos { 146 1.1 christos label = kwset->trans ? kwset->trans[(unsigned char) *--text] : *--text; 147 1.1 christos 148 1.1 christos /* Descend the tree of outgoing links for this trie node, 149 1.1 christos looking for the current character and keeping track 150 1.1 christos of the path followed. */ 151 1.1 christos link = trie->links; 152 1.1 christos links[0] = (struct tree *) &trie->links; 153 1.1 christos dirs[0] = L; 154 1.1 christos depth = 1; 155 1.1 christos 156 1.1 christos while (link && label != link->label) 157 1.1 christos { 158 1.1 christos links[depth] = link; 159 1.1 christos if (label < link->label) 160 1.1 christos dirs[depth++] = L, link = link->llink; 161 1.1 christos else 162 1.1 christos dirs[depth++] = R, link = link->rlink; 163 1.1 christos } 164 1.1 christos 165 1.1 christos /* The current character doesn't have an outgoing link at 166 1.1 christos this trie node, so build a new trie node and install 167 1.1 christos a link in the current trie node's tree. */ 168 1.1 christos if (!link) 169 1.1 christos { 170 1.1 christos link = (struct tree *) obstack_alloc(&kwset->obstack, 171 1.1 christos sizeof (struct tree)); 172 1.1 christos if (!link) 173 1.1 christos return _("memory exhausted"); 174 1.1 christos link->llink = 0; 175 1.1 christos link->rlink = 0; 176 1.1 christos link->trie = (struct trie *) obstack_alloc(&kwset->obstack, 177 1.1 christos sizeof (struct trie)); 178 1.1 christos if (!link->trie) 179 1.1 christos return _("memory exhausted"); 180 1.1 christos link->trie->accepting = 0; 181 1.1 christos link->trie->links = 0; 182 1.1 christos link->trie->parent = trie; 183 1.1 christos link->trie->next = 0; 184 1.1 christos link->trie->fail = 0; 185 1.1 christos link->trie->depth = trie->depth + 1; 186 1.1 christos link->trie->shift = 0; 187 1.1 christos link->label = label; 188 1.1 christos link->balance = 0; 189 1.1 christos 190 1.1 christos /* Install the new tree node in its parent. */ 191 1.1 christos if (dirs[--depth] == L) 192 1.1 christos links[depth]->llink = link; 193 1.1 christos else 194 1.1 christos links[depth]->rlink = link; 195 1.1 christos 196 1.1 christos /* Back up the tree fixing the balance flags. */ 197 1.1 christos while (depth && !links[depth]->balance) 198 1.1 christos { 199 1.1 christos if (dirs[depth] == L) 200 1.1 christos --links[depth]->balance; 201 1.1 christos else 202 1.1 christos ++links[depth]->balance; 203 1.1 christos --depth; 204 1.1 christos } 205 1.1 christos 206 1.1 christos /* Rebalance the tree by pointer rotations if necessary. */ 207 1.1 christos if (depth && ((dirs[depth] == L && --links[depth]->balance) 208 1.1 christos || (dirs[depth] == R && ++links[depth]->balance))) 209 1.1 christos { 210 1.1 christos switch (links[depth]->balance) 211 1.1 christos { 212 1.1 christos case (char) -2: 213 1.1 christos switch (dirs[depth + 1]) 214 1.1 christos { 215 1.1 christos case L: 216 1.1 christos r = links[depth], t = r->llink, rl = t->rlink; 217 1.1 christos t->rlink = r, r->llink = rl; 218 1.1 christos t->balance = r->balance = 0; 219 1.1 christos break; 220 1.1 christos case R: 221 1.1 christos r = links[depth], l = r->llink, t = l->rlink; 222 1.1 christos rl = t->rlink, lr = t->llink; 223 1.1 christos t->llink = l, l->rlink = lr, t->rlink = r, r->llink = rl; 224 1.1 christos l->balance = t->balance != 1 ? 0 : -1; 225 1.1 christos r->balance = t->balance != (char) -1 ? 0 : 1; 226 1.1 christos t->balance = 0; 227 1.1 christos break; 228 1.1 christos default: 229 1.1 christos abort (); 230 1.1 christos } 231 1.1 christos break; 232 1.1 christos case 2: 233 1.1 christos switch (dirs[depth + 1]) 234 1.1 christos { 235 1.1 christos case R: 236 1.1 christos l = links[depth], t = l->rlink, lr = t->llink; 237 1.1 christos t->llink = l, l->rlink = lr; 238 1.1 christos t->balance = l->balance = 0; 239 1.1 christos break; 240 1.1 christos case L: 241 1.1 christos l = links[depth], r = l->rlink, t = r->llink; 242 1.1 christos lr = t->llink, rl = t->rlink; 243 1.1 christos t->llink = l, l->rlink = lr, t->rlink = r, r->llink = rl; 244 1.1 christos l->balance = t->balance != 1 ? 0 : -1; 245 1.1 christos r->balance = t->balance != (char) -1 ? 0 : 1; 246 1.1 christos t->balance = 0; 247 1.1 christos break; 248 1.1 christos default: 249 1.1 christos abort (); 250 1.1 christos } 251 1.1 christos break; 252 1.1 christos default: 253 1.1 christos abort (); 254 1.1 christos } 255 1.1 christos 256 1.1 christos if (dirs[depth - 1] == L) 257 1.1 christos links[depth - 1]->llink = t; 258 1.1 christos else 259 1.1 christos links[depth - 1]->rlink = t; 260 1.1 christos } 261 1.1 christos } 262 1.1 christos 263 1.1 christos trie = link->trie; 264 1.1 christos } 265 1.1 christos 266 1.1 christos /* Mark the node we finally reached as accepting, encoding the 267 1.1 christos index number of this word in the keyword set so far. */ 268 1.1 christos if (!trie->accepting) 269 1.1 christos trie->accepting = 1 + 2 * kwset->words; 270 1.1 christos ++kwset->words; 271 1.1 christos 272 1.1 christos /* Keep track of the longest and shortest string of the keyword set. */ 273 1.1 christos if (trie->depth < kwset->mind) 274 1.1 christos kwset->mind = trie->depth; 275 1.1 christos if (trie->depth > kwset->maxd) 276 1.1 christos kwset->maxd = trie->depth; 277 1.1 christos 278 1.1 christos return 0; 279 1.1 christos } 280 1.1 christos 281 1.1 christos /* Enqueue the trie nodes referenced from the given tree in the 282 1.1 christos given queue. */ 283 1.1 christos static void 284 1.1 christos enqueue (struct tree *tree, struct trie **last) 285 1.1 christos { 286 1.1 christos if (!tree) 287 1.1 christos return; 288 1.1 christos enqueue(tree->llink, last); 289 1.1 christos enqueue(tree->rlink, last); 290 1.1 christos (*last) = (*last)->next = tree->trie; 291 1.1 christos } 292 1.1 christos 293 1.1 christos /* Compute the Aho-Corasick failure function for the trie nodes referenced 294 1.1 christos from the given tree, given the failure function for their parent as 295 1.1 christos well as a last resort failure node. */ 296 1.1 christos static void 297 1.1 christos treefails (register struct tree const *tree, struct trie const *fail, 298 1.1 christos struct trie *recourse) 299 1.1 christos { 300 1.1 christos register struct tree *link; 301 1.1 christos 302 1.1 christos if (!tree) 303 1.1 christos return; 304 1.1 christos 305 1.1 christos treefails(tree->llink, fail, recourse); 306 1.1 christos treefails(tree->rlink, fail, recourse); 307 1.1 christos 308 1.1 christos /* Find, in the chain of fails going back to the root, the first 309 1.1 christos node that has a descendent on the current label. */ 310 1.1 christos while (fail) 311 1.1 christos { 312 1.1 christos link = fail->links; 313 1.1 christos while (link && tree->label != link->label) 314 1.1 christos if (tree->label < link->label) 315 1.1 christos link = link->llink; 316 1.1 christos else 317 1.1 christos link = link->rlink; 318 1.1 christos if (link) 319 1.1 christos { 320 1.1 christos tree->trie->fail = link->trie; 321 1.1 christos return; 322 1.1 christos } 323 1.1 christos fail = fail->fail; 324 1.1 christos } 325 1.1 christos 326 1.1 christos tree->trie->fail = recourse; 327 1.1 christos } 328 1.1 christos 329 1.1 christos /* Set delta entries for the links of the given tree such that 330 1.1 christos the preexisting delta value is larger than the current depth. */ 331 1.1 christos static void 332 1.1 christos treedelta (register struct tree const *tree, 333 1.1 christos register unsigned int depth, 334 1.1 christos unsigned char delta[]) 335 1.1 christos { 336 1.1 christos if (!tree) 337 1.1 christos return; 338 1.1 christos treedelta(tree->llink, depth, delta); 339 1.1 christos treedelta(tree->rlink, depth, delta); 340 1.1 christos if (depth < delta[tree->label]) 341 1.1 christos delta[tree->label] = depth; 342 1.1 christos } 343 1.1 christos 344 1.1 christos /* Return true if A has every label in B. */ 345 1.1 christos static int 346 1.1 christos hasevery (register struct tree const *a, register struct tree const *b) 347 1.1 christos { 348 1.1 christos if (!b) 349 1.1 christos return 1; 350 1.1 christos if (!hasevery(a, b->llink)) 351 1.1 christos return 0; 352 1.1 christos if (!hasevery(a, b->rlink)) 353 1.1 christos return 0; 354 1.1 christos while (a && b->label != a->label) 355 1.1 christos if (b->label < a->label) 356 1.1 christos a = a->llink; 357 1.1 christos else 358 1.1 christos a = a->rlink; 359 1.1 christos return !!a; 360 1.1 christos } 361 1.1 christos 362 1.1 christos /* Compute a vector, indexed by character code, of the trie nodes 363 1.1 christos referenced from the given tree. */ 364 1.1 christos static void 365 1.1 christos treenext (struct tree const *tree, struct trie *next[]) 366 1.1 christos { 367 1.1 christos if (!tree) 368 1.1 christos return; 369 1.1 christos treenext(tree->llink, next); 370 1.1 christos treenext(tree->rlink, next); 371 1.1 christos next[tree->label] = tree->trie; 372 1.1 christos } 373 1.1 christos 374 1.1 christos /* Compute the shift for each trie node, as well as the delta 375 1.1 christos table and next cache for the given keyword set. */ 376 1.1 christos const char * 377 1.1 christos kwsprep (kwset_t kws) 378 1.1 christos { 379 1.1 christos register struct kwset *kwset; 380 1.1 christos register int i; 381 1.1 christos register struct trie *curr, *fail; 382 1.1 christos register char const *trans; 383 1.1 christos unsigned char delta[NCHAR]; 384 1.1 christos struct trie *last, *next[NCHAR]; 385 1.1 christos 386 1.1 christos kwset = (struct kwset *) kws; 387 1.1 christos 388 1.1 christos /* Initial values for the delta table; will be changed later. The 389 1.1 christos delta entry for a given character is the smallest depth of any 390 1.1 christos node at which an outgoing edge is labeled by that character. */ 391 1.1 christos if (kwset->mind < 256) 392 1.1 christos for (i = 0; i < NCHAR; ++i) 393 1.1 christos delta[i] = kwset->mind; 394 1.1 christos else 395 1.1 christos for (i = 0; i < NCHAR; ++i) 396 1.1 christos delta[i] = 255; 397 1.1 christos 398 1.1 christos /* Check if we can use the simple boyer-moore algorithm, instead 399 1.1 christos of the hairy commentz-walter algorithm. */ 400 1.1 christos if (kwset->words == 1 && kwset->trans == 0) 401 1.1 christos { 402 1.1 christos /* Looking for just one string. Extract it from the trie. */ 403 1.1 christos kwset->target = obstack_alloc(&kwset->obstack, kwset->mind); 404 1.1 christos for (i = kwset->mind - 1, curr = kwset->trie; i >= 0; --i) 405 1.1 christos { 406 1.1 christos kwset->target[i] = curr->links->label; 407 1.1 christos curr = curr->links->trie; 408 1.1 christos } 409 1.1 christos /* Build the Boyer Moore delta. Boy that's easy compared to CW. */ 410 1.1 christos for (i = 0; i < kwset->mind; ++i) 411 1.1 christos delta[(unsigned char) kwset->target[i]] = kwset->mind - (i + 1); 412 1.1 christos kwset->mind2 = kwset->mind; 413 1.1 christos /* Find the minimal delta2 shift that we might make after 414 1.1 christos a backwards match has failed. */ 415 1.1 christos for (i = 0; i < kwset->mind - 1; ++i) 416 1.1 christos if (kwset->target[i] == kwset->target[kwset->mind - 1]) 417 1.1 christos kwset->mind2 = kwset->mind - (i + 1); 418 1.1 christos } 419 1.1 christos else 420 1.1 christos { 421 1.1 christos /* Traverse the nodes of the trie in level order, simultaneously 422 1.1 christos computing the delta table, failure function, and shift function. */ 423 1.1 christos for (curr = last = kwset->trie; curr; curr = curr->next) 424 1.1 christos { 425 1.1 christos /* Enqueue the immediate descendents in the level order queue. */ 426 1.1 christos enqueue(curr->links, &last); 427 1.1 christos 428 1.1 christos curr->shift = kwset->mind; 429 1.1 christos curr->maxshift = kwset->mind; 430 1.1 christos 431 1.1 christos /* Update the delta table for the descendents of this node. */ 432 1.1 christos treedelta(curr->links, curr->depth, delta); 433 1.1 christos 434 1.1 christos /* Compute the failure function for the decendents of this node. */ 435 1.1 christos treefails(curr->links, curr->fail, kwset->trie); 436 1.1 christos 437 1.1 christos /* Update the shifts at each node in the current node's chain 438 1.1 christos of fails back to the root. */ 439 1.1 christos for (fail = curr->fail; fail; fail = fail->fail) 440 1.1 christos { 441 1.1 christos /* If the current node has some outgoing edge that the fail 442 1.1 christos doesn't, then the shift at the fail should be no larger 443 1.1 christos than the difference of their depths. */ 444 1.1 christos if (!hasevery(fail->links, curr->links)) 445 1.1 christos if (curr->depth - fail->depth < fail->shift) 446 1.1 christos fail->shift = curr->depth - fail->depth; 447 1.1 christos 448 1.1 christos /* If the current node is accepting then the shift at the 449 1.1 christos fail and its descendents should be no larger than the 450 1.1 christos difference of their depths. */ 451 1.1 christos if (curr->accepting && fail->maxshift > curr->depth - fail->depth) 452 1.1 christos fail->maxshift = curr->depth - fail->depth; 453 1.1 christos } 454 1.1 christos } 455 1.1 christos 456 1.1 christos /* Traverse the trie in level order again, fixing up all nodes whose 457 1.1 christos shift exceeds their inherited maxshift. */ 458 1.1 christos for (curr = kwset->trie->next; curr; curr = curr->next) 459 1.1 christos { 460 1.1 christos if (curr->maxshift > curr->parent->maxshift) 461 1.1 christos curr->maxshift = curr->parent->maxshift; 462 1.1 christos if (curr->shift > curr->maxshift) 463 1.1 christos curr->shift = curr->maxshift; 464 1.1 christos } 465 1.1 christos 466 1.1 christos /* Create a vector, indexed by character code, of the outgoing links 467 1.1 christos from the root node. */ 468 1.1 christos for (i = 0; i < NCHAR; ++i) 469 1.1 christos next[i] = 0; 470 1.1 christos treenext(kwset->trie->links, next); 471 1.1 christos 472 1.1 christos if ((trans = kwset->trans) != 0) 473 1.1 christos for (i = 0; i < NCHAR; ++i) 474 1.1 christos kwset->next[i] = next[(unsigned char) trans[i]]; 475 1.1 christos else 476 1.1 christos for (i = 0; i < NCHAR; ++i) 477 1.1 christos kwset->next[i] = next[i]; 478 1.1 christos } 479 1.1 christos 480 1.1 christos /* Fix things up for any translation table. */ 481 1.1 christos if ((trans = kwset->trans) != 0) 482 1.1 christos for (i = 0; i < NCHAR; ++i) 483 1.1 christos kwset->delta[i] = delta[(unsigned char) trans[i]]; 484 1.1 christos else 485 1.1 christos for (i = 0; i < NCHAR; ++i) 486 1.1 christos kwset->delta[i] = delta[i]; 487 1.1 christos 488 1.1 christos return 0; 489 1.1 christos } 490 1.1 christos 491 1.1 christos #define U(C) ((unsigned char) (C)) 492 1.1 christos 493 1.1 christos /* Fast boyer-moore search. */ 494 1.1 christos static size_t 495 1.1 christos bmexec (kwset_t kws, char const *text, size_t size) 496 1.1 christos { 497 1.1 christos struct kwset const *kwset; 498 1.1 christos register unsigned char const *d1; 499 1.1 christos register char const *ep, *sp, *tp; 500 1.1 christos register int d, gc, i, len, md2; 501 1.1 christos 502 1.1 christos kwset = (struct kwset const *) kws; 503 1.1 christos len = kwset->mind; 504 1.1 christos 505 1.1 christos if (len == 0) 506 1.1 christos return 0; 507 1.1 christos if (len > size) 508 1.1 christos return -1; 509 1.1 christos if (len == 1) 510 1.1 christos { 511 1.1 christos tp = memchr (text, kwset->target[0], size); 512 1.1 christos return tp ? tp - text : -1; 513 1.1 christos } 514 1.1 christos 515 1.1 christos d1 = kwset->delta; 516 1.1 christos sp = kwset->target + len; 517 1.1 christos gc = U(sp[-2]); 518 1.1 christos md2 = kwset->mind2; 519 1.1 christos tp = text + len; 520 1.1 christos 521 1.1 christos /* Significance of 12: 1 (initial offset) + 10 (skip loop) + 1 (md2). */ 522 1.1 christos if (size > 12 * len) 523 1.1 christos /* 11 is not a bug, the initial offset happens only once. */ 524 1.1 christos for (ep = text + size - 11 * len;;) 525 1.1 christos { 526 1.1 christos while (tp <= ep) 527 1.1 christos { 528 1.1 christos d = d1[U(tp[-1])], tp += d; 529 1.1 christos d = d1[U(tp[-1])], tp += d; 530 1.1 christos if (d == 0) 531 1.1 christos goto found; 532 1.1 christos d = d1[U(tp[-1])], tp += d; 533 1.1 christos d = d1[U(tp[-1])], tp += d; 534 1.1 christos d = d1[U(tp[-1])], tp += d; 535 1.1 christos if (d == 0) 536 1.1 christos goto found; 537 1.1 christos d = d1[U(tp[-1])], tp += d; 538 1.1 christos d = d1[U(tp[-1])], tp += d; 539 1.1 christos d = d1[U(tp[-1])], tp += d; 540 1.1 christos if (d == 0) 541 1.1 christos goto found; 542 1.1 christos d = d1[U(tp[-1])], tp += d; 543 1.1 christos d = d1[U(tp[-1])], tp += d; 544 1.1 christos } 545 1.1 christos break; 546 1.1 christos found: 547 1.1 christos if (U(tp[-2]) == gc) 548 1.1 christos { 549 1.1 christos for (i = 3; i <= len && U(tp[-i]) == U(sp[-i]); ++i) 550 1.1 christos ; 551 1.1 christos if (i > len) 552 1.1 christos return tp - len - text; 553 1.1 christos } 554 1.1 christos tp += md2; 555 1.1 christos } 556 1.1 christos 557 1.1 christos /* Now we have only a few characters left to search. We 558 1.1 christos carefully avoid ever producing an out-of-bounds pointer. */ 559 1.1 christos ep = text + size; 560 1.1 christos d = d1[U(tp[-1])]; 561 1.1 christos while (d <= ep - tp) 562 1.1 christos { 563 1.1 christos d = d1[U((tp += d)[-1])]; 564 1.1 christos if (d != 0) 565 1.1 christos continue; 566 1.1 christos if (U(tp[-2]) == gc) 567 1.1 christos { 568 1.1 christos for (i = 3; i <= len && U(tp[-i]) == U(sp[-i]); ++i) 569 1.1 christos ; 570 1.1 christos if (i > len) 571 1.1 christos return tp - len - text; 572 1.1 christos } 573 1.1 christos d = md2; 574 1.1 christos } 575 1.1 christos 576 1.1 christos return -1; 577 1.1 christos } 578 1.1 christos 579 1.1 christos /* Hairy multiple string search. */ 580 1.1 christos static size_t 581 1.1 christos cwexec (kwset_t kws, char const *text, size_t len, struct kwsmatch *kwsmatch) 582 1.1 christos { 583 1.1 christos struct kwset const *kwset; 584 1.1 christos struct trie * const *next; 585 1.1 christos struct trie const *trie; 586 1.1 christos struct trie const *accept; 587 1.1 christos char const *beg, *lim, *mch, *lmch; 588 1.1 christos register unsigned char c; 589 1.1 christos register unsigned char const *delta; 590 1.1 christos register int d; 591 1.1 christos register char const *end, *qlim; 592 1.1 christos register struct tree const *tree; 593 1.1 christos register char const *trans; 594 1.1 christos 595 1.1 christos accept = NULL; 596 1.1 christos 597 1.1 christos /* Initialize register copies and look for easy ways out. */ 598 1.1 christos kwset = (struct kwset *) kws; 599 1.1 christos if (len < kwset->mind) 600 1.1 christos return -1; 601 1.1 christos next = kwset->next; 602 1.1 christos delta = kwset->delta; 603 1.1 christos trans = kwset->trans; 604 1.1 christos lim = text + len; 605 1.1 christos end = text; 606 1.1 christos if ((d = kwset->mind) != 0) 607 1.1 christos mch = 0; 608 1.1 christos else 609 1.1 christos { 610 1.1 christos mch = text, accept = kwset->trie; 611 1.1 christos goto match; 612 1.1 christos } 613 1.1 christos 614 1.1 christos if (len >= 4 * kwset->mind) 615 1.1 christos qlim = lim - 4 * kwset->mind; 616 1.1 christos else 617 1.1 christos qlim = 0; 618 1.1 christos 619 1.1 christos while (lim - end >= d) 620 1.1 christos { 621 1.1 christos if (qlim && end <= qlim) 622 1.1 christos { 623 1.1 christos end += d - 1; 624 1.1 christos while ((d = delta[c = *end]) && end < qlim) 625 1.1 christos { 626 1.1 christos end += d; 627 1.1 christos end += delta[(unsigned char) *end]; 628 1.1 christos end += delta[(unsigned char) *end]; 629 1.1 christos } 630 1.1 christos ++end; 631 1.1 christos } 632 1.1 christos else 633 1.1 christos d = delta[c = (end += d)[-1]]; 634 1.1 christos if (d) 635 1.1 christos continue; 636 1.1 christos beg = end - 1; 637 1.1 christos trie = next[c]; 638 1.1 christos if (trie->accepting) 639 1.1 christos { 640 1.1 christos mch = beg; 641 1.1 christos accept = trie; 642 1.1 christos } 643 1.1 christos d = trie->shift; 644 1.1 christos while (beg > text) 645 1.1 christos { 646 1.1 christos c = trans ? trans[(unsigned char) *--beg] : *--beg; 647 1.1 christos tree = trie->links; 648 1.1 christos while (tree && c != tree->label) 649 1.1 christos if (c < tree->label) 650 1.1 christos tree = tree->llink; 651 1.1 christos else 652 1.1 christos tree = tree->rlink; 653 1.1 christos if (tree) 654 1.1 christos { 655 1.1 christos trie = tree->trie; 656 1.1 christos if (trie->accepting) 657 1.1 christos { 658 1.1 christos mch = beg; 659 1.1 christos accept = trie; 660 1.1 christos } 661 1.1 christos } 662 1.1 christos else 663 1.1 christos break; 664 1.1 christos d = trie->shift; 665 1.1 christos } 666 1.1 christos if (mch) 667 1.1 christos goto match; 668 1.1 christos } 669 1.1 christos return -1; 670 1.1 christos 671 1.1 christos match: 672 1.1 christos /* Given a known match, find the longest possible match anchored 673 1.1 christos at or before its starting point. This is nearly a verbatim 674 1.1 christos copy of the preceding main search loops. */ 675 1.1 christos if (lim - mch > kwset->maxd) 676 1.1 christos lim = mch + kwset->maxd; 677 1.1 christos lmch = 0; 678 1.1 christos d = 1; 679 1.1 christos while (lim - end >= d) 680 1.1 christos { 681 1.1 christos if ((d = delta[c = (end += d)[-1]]) != 0) 682 1.1 christos continue; 683 1.1 christos beg = end - 1; 684 1.1 christos if (!(trie = next[c])) 685 1.1 christos { 686 1.1 christos d = 1; 687 1.1 christos continue; 688 1.1 christos } 689 1.1 christos if (trie->accepting && beg <= mch) 690 1.1 christos { 691 1.1 christos lmch = beg; 692 1.1 christos accept = trie; 693 1.1 christos } 694 1.1 christos d = trie->shift; 695 1.1 christos while (beg > text) 696 1.1 christos { 697 1.1 christos c = trans ? trans[(unsigned char) *--beg] : *--beg; 698 1.1 christos tree = trie->links; 699 1.1 christos while (tree && c != tree->label) 700 1.1 christos if (c < tree->label) 701 1.1 christos tree = tree->llink; 702 1.1 christos else 703 1.1 christos tree = tree->rlink; 704 1.1 christos if (tree) 705 1.1 christos { 706 1.1 christos trie = tree->trie; 707 1.1 christos if (trie->accepting && beg <= mch) 708 1.1 christos { 709 1.1 christos lmch = beg; 710 1.1 christos accept = trie; 711 1.1 christos } 712 1.1 christos } 713 1.1 christos else 714 1.1 christos break; 715 1.1 christos d = trie->shift; 716 1.1 christos } 717 1.1 christos if (lmch) 718 1.1 christos { 719 1.1 christos mch = lmch; 720 1.1 christos goto match; 721 1.1 christos } 722 1.1 christos if (!d) 723 1.1 christos d = 1; 724 1.1 christos } 725 1.1 christos 726 1.1 christos if (kwsmatch) 727 1.1 christos { 728 1.1 christos kwsmatch->index = accept->accepting / 2; 729 1.1 christos kwsmatch->offset[0] = mch - text; 730 1.1 christos kwsmatch->size[0] = accept->depth; 731 1.1 christos } 732 1.1 christos return mch - text; 733 1.1 christos } 734 1.1 christos 735 1.1 christos /* Search through the given text for a match of any member of the 736 1.1 christos given keyword set. Return a pointer to the first character of 737 1.1 christos the matching substring, or NULL if no match is found. If FOUNDLEN 738 1.1 christos is non-NULL store in the referenced location the length of the 739 1.1 christos matching substring. Similarly, if FOUNDIDX is non-NULL, store 740 1.1 christos in the referenced location the index number of the particular 741 1.1 christos keyword matched. */ 742 1.1 christos size_t 743 1.1 christos kwsexec (kwset_t kws, char const *text, size_t size, 744 1.1 christos struct kwsmatch *kwsmatch) 745 1.1 christos { 746 1.1 christos struct kwset const *kwset = (struct kwset *) kws; 747 1.1 christos if (kwset->words == 1 && kwset->trans == 0) 748 1.1 christos { 749 1.1 christos size_t ret = bmexec (kws, text, size); 750 1.1 christos if (kwsmatch != 0 && ret != (size_t) -1) 751 1.1 christos { 752 1.1 christos kwsmatch->index = 0; 753 1.1 christos kwsmatch->offset[0] = ret; 754 1.1 christos kwsmatch->size[0] = kwset->mind; 755 1.1 christos } 756 1.1 christos return ret; 757 1.1 christos } 758 1.1 christos else 759 1.1 christos return cwexec(kws, text, size, kwsmatch); 760 1.1 christos } 761 1.1 christos 762 1.1 christos /* Free the components of the given keyword set. */ 763 1.1 christos void 764 1.1 christos kwsfree (kwset_t kws) 765 1.1 christos { 766 1.1 christos struct kwset *kwset; 767 1.1 christos 768 1.1 christos kwset = (struct kwset *) kws; 769 1.1 christos obstack_free(&kwset->obstack, 0); 770 1.1 christos free(kws); 771 1.1 christos } 772