1 1.1 agc /* 2 1.1 agc tre-match-parallel.c - TRE parallel regex matching engine 3 1.1 agc 4 1.1 agc This software is released under a BSD-style license. 5 1.1 agc See the file LICENSE for details and copyright. 6 1.1 agc 7 1.1 agc */ 8 1.1 agc 9 1.1 agc /* 10 1.1 agc This algorithm searches for matches basically by reading characters 11 1.1 agc in the searched string one by one, starting at the beginning. All 12 1.1 agc matching paths in the TNFA are traversed in parallel. When two or 13 1.1 agc more paths reach the same state, exactly one is chosen according to 14 1.1 agc tag ordering rules; if returning submatches is not required it does 15 1.1 agc not matter which path is chosen. 16 1.1 agc 17 1.1 agc The worst case time required for finding the leftmost and longest 18 1.1 agc match, or determining that there is no match, is always linearly 19 1.1 agc dependent on the length of the text being searched. 20 1.1 agc 21 1.1 agc This algorithm cannot handle TNFAs with back referencing nodes. 22 1.1 agc See `tre-match-backtrack.c'. 23 1.1 agc */ 24 1.1 agc 25 1.1 agc 26 1.1 agc #ifdef HAVE_CONFIG_H 27 1.1 agc #include <config.h> 28 1.1 agc #endif /* HAVE_CONFIG_H */ 29 1.1 agc 30 1.2 christos #include "tre-alloca.h" 31 1.1 agc 32 1.1 agc #include <assert.h> 33 1.1 agc #include <stdlib.h> 34 1.1 agc #include <string.h> 35 1.1 agc #ifdef HAVE_WCHAR_H 36 1.1 agc #include <wchar.h> 37 1.1 agc #endif /* HAVE_WCHAR_H */ 38 1.1 agc #ifdef HAVE_WCTYPE_H 39 1.1 agc #include <wctype.h> 40 1.1 agc #endif /* HAVE_WCTYPE_H */ 41 1.1 agc #ifndef TRE_WCHAR 42 1.1 agc #include <ctype.h> 43 1.1 agc #endif /* !TRE_WCHAR */ 44 1.1 agc #ifdef HAVE_MALLOC_H 45 1.1 agc #include <malloc.h> 46 1.1 agc #endif /* HAVE_MALLOC_H */ 47 1.6 rin #include <stdint.h> 48 1.1 agc 49 1.1 agc #include "tre-internal.h" 50 1.1 agc #include "tre-match-utils.h" 51 1.1 agc #include "tre.h" 52 1.1 agc #include "xmalloc.h" 53 1.1 agc 54 1.1 agc 55 1.1 agc 56 1.1 agc typedef struct { 57 1.1 agc tre_tnfa_transition_t *state; 58 1.1 agc int *tags; 59 1.1 agc } tre_tnfa_reach_t; 60 1.1 agc 61 1.1 agc typedef struct { 62 1.1 agc int pos; 63 1.1 agc int **tags; 64 1.1 agc } tre_reach_pos_t; 65 1.1 agc 66 1.1 agc 67 1.1 agc #ifdef TRE_DEBUG 68 1.1 agc static void 69 1.1 agc tre_print_reach(const tre_tnfa_t *tnfa, tre_tnfa_reach_t *reach, int num_tags) 70 1.1 agc { 71 1.1 agc int i; 72 1.1 agc 73 1.1 agc while (reach->state != NULL) 74 1.1 agc { 75 1.1 agc DPRINT((" %p", (void *)reach->state)); 76 1.1 agc if (num_tags > 0) 77 1.1 agc { 78 1.1 agc DPRINT(("/")); 79 1.1 agc for (i = 0; i < num_tags; i++) 80 1.1 agc { 81 1.1 agc DPRINT(("%d:%d", i, reach->tags[i])); 82 1.1 agc if (i < (num_tags-1)) 83 1.1 agc DPRINT((",")); 84 1.1 agc } 85 1.1 agc } 86 1.1 agc reach++; 87 1.1 agc } 88 1.1 agc DPRINT(("\n")); 89 1.1 agc 90 1.1 agc } 91 1.1 agc #endif /* TRE_DEBUG */ 92 1.1 agc 93 1.1 agc reg_errcode_t 94 1.1 agc tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len, 95 1.1 agc tre_str_type_t type, int *match_tags, int eflags, 96 1.1 agc int *match_end_ofs) 97 1.1 agc { 98 1.1 agc /* State variables required by GET_NEXT_WCHAR. */ 99 1.1 agc tre_char_t prev_c = 0, next_c = 0; 100 1.1 agc const char *str_byte = string; 101 1.1 agc int pos = -1; 102 1.1 agc unsigned int pos_add_next = 1; 103 1.1 agc #ifdef TRE_WCHAR 104 1.1 agc const wchar_t *str_wide = string; 105 1.1 agc #ifdef TRE_MBSTATE 106 1.1 agc mbstate_t mbstate; 107 1.1 agc #endif /* TRE_MBSTATE */ 108 1.1 agc #endif /* TRE_WCHAR */ 109 1.1 agc int reg_notbol = eflags & REG_NOTBOL; 110 1.1 agc int reg_noteol = eflags & REG_NOTEOL; 111 1.1 agc int reg_newline = tnfa->cflags & REG_NEWLINE; 112 1.2 christos size_t str_user_end = 0; 113 1.1 agc 114 1.1 agc char *buf; 115 1.1 agc tre_tnfa_transition_t *trans_i; 116 1.1 agc tre_tnfa_reach_t *reach, *reach_next, *reach_i, *reach_next_i; 117 1.1 agc tre_reach_pos_t *reach_pos; 118 1.1 agc int *tag_i; 119 1.2 christos size_t num_tags, i; 120 1.1 agc 121 1.1 agc int match_eo = -1; /* end offset of match (-1 if no match found yet) */ 122 1.1 agc int new_match = 0; 123 1.1 agc int *tmp_tags = NULL; 124 1.1 agc int *tmp_iptr; 125 1.4 rin reg_errcode_t ret; 126 1.1 agc 127 1.1 agc #ifdef TRE_MBSTATE 128 1.1 agc memset(&mbstate, '\0', sizeof(mbstate)); 129 1.1 agc #endif /* TRE_MBSTATE */ 130 1.1 agc 131 1.1 agc DPRINT(("tre_tnfa_run_parallel, input type %d\n", type)); 132 1.1 agc 133 1.1 agc if (!match_tags) 134 1.1 agc num_tags = 0; 135 1.1 agc else 136 1.1 agc num_tags = tnfa->num_tags; 137 1.1 agc 138 1.1 agc /* Allocate memory for temporary data required for matching. This needs to 139 1.1 agc be done for every matching operation to be thread safe. This allocates 140 1.1 agc everything in a single large block from the stack frame using alloca() 141 1.1 agc or with malloc() if alloca is unavailable. */ 142 1.1 agc { 143 1.2 christos size_t tbytes, rbytes, pbytes, xbytes, total_bytes; 144 1.1 agc char *tmp_buf; 145 1.5 rin 146 1.5 rin /* Ensure that tbytes and xbytes*num_states cannot overflow, and that 147 1.5 rin * they don't contribute more than 1/8 of SIZE_MAX to total_bytes. */ 148 1.5 rin if (num_tags > SIZE_MAX/(8 * sizeof(int) * tnfa->num_states)) 149 1.5 rin return REG_ESPACE; 150 1.5 rin 151 1.5 rin /* Likewise check rbytes. */ 152 1.5 rin if (tnfa->num_states+1 > SIZE_MAX/(8 * sizeof(*reach_next))) 153 1.5 rin return REG_ESPACE; 154 1.5 rin 155 1.5 rin /* Likewise check pbytes. */ 156 1.5 rin if (tnfa->num_states > SIZE_MAX/(8 * sizeof(*reach_pos))) 157 1.5 rin return REG_ESPACE; 158 1.5 rin 159 1.1 agc /* Compute the length of the block we need. */ 160 1.1 agc tbytes = sizeof(*tmp_tags) * num_tags; 161 1.1 agc rbytes = sizeof(*reach_next) * (tnfa->num_states + 1); 162 1.1 agc pbytes = sizeof(*reach_pos) * tnfa->num_states; 163 1.1 agc xbytes = sizeof(int) * num_tags; 164 1.1 agc total_bytes = 165 1.1 agc (sizeof(long) - 1) * 4 /* for alignment paddings */ 166 1.1 agc + (rbytes + xbytes * tnfa->num_states) * 2 + tbytes + pbytes; 167 1.1 agc 168 1.1 agc /* Allocate the memory. */ 169 1.1 agc #ifdef TRE_USE_ALLOCA 170 1.1 agc buf = alloca(total_bytes); 171 1.1 agc #else /* !TRE_USE_ALLOCA */ 172 1.1 agc buf = xmalloc((unsigned)total_bytes); 173 1.1 agc #endif /* !TRE_USE_ALLOCA */ 174 1.1 agc if (buf == NULL) 175 1.1 agc return REG_ESPACE; 176 1.1 agc memset(buf, 0, (size_t)total_bytes); 177 1.1 agc 178 1.1 agc /* Get the various pointers within tmp_buf (properly aligned). */ 179 1.1 agc tmp_tags = (void *)buf; 180 1.1 agc tmp_buf = buf + tbytes; 181 1.1 agc tmp_buf += ALIGN(tmp_buf, long); 182 1.1 agc reach_next = (void *)tmp_buf; 183 1.1 agc tmp_buf += rbytes; 184 1.1 agc tmp_buf += ALIGN(tmp_buf, long); 185 1.1 agc reach = (void *)tmp_buf; 186 1.1 agc tmp_buf += rbytes; 187 1.1 agc tmp_buf += ALIGN(tmp_buf, long); 188 1.1 agc reach_pos = (void *)tmp_buf; 189 1.1 agc tmp_buf += pbytes; 190 1.1 agc tmp_buf += ALIGN(tmp_buf, long); 191 1.1 agc for (i = 0; i < tnfa->num_states; i++) 192 1.1 agc { 193 1.1 agc reach[i].tags = (void *)tmp_buf; 194 1.1 agc tmp_buf += xbytes; 195 1.1 agc reach_next[i].tags = (void *)tmp_buf; 196 1.1 agc tmp_buf += xbytes; 197 1.1 agc } 198 1.1 agc } 199 1.1 agc 200 1.1 agc for (i = 0; i < tnfa->num_states; i++) 201 1.1 agc reach_pos[i].pos = -1; 202 1.1 agc 203 1.1 agc /* If only one character can start a match, find it first. */ 204 1.1 agc if (tnfa->first_char >= 0 && type == STR_BYTE && str_byte) 205 1.1 agc { 206 1.1 agc const char *orig_str = str_byte; 207 1.1 agc int first = tnfa->first_char; 208 1.1 agc 209 1.1 agc if (len >= 0) 210 1.1 agc str_byte = memchr(orig_str, first, (size_t)len); 211 1.1 agc else 212 1.1 agc str_byte = strchr(orig_str, first); 213 1.1 agc if (str_byte == NULL) 214 1.1 agc { 215 1.1 agc #ifndef TRE_USE_ALLOCA 216 1.1 agc if (buf) 217 1.1 agc xfree(buf); 218 1.1 agc #endif /* !TRE_USE_ALLOCA */ 219 1.1 agc return REG_NOMATCH; 220 1.1 agc } 221 1.1 agc DPRINT(("skipped %lu chars\n", (unsigned long)(str_byte - orig_str))); 222 1.1 agc if (str_byte >= orig_str + 1) 223 1.1 agc prev_c = (unsigned char)*(str_byte - 1); 224 1.1 agc next_c = (unsigned char)*str_byte; 225 1.2 christos pos = (int)(str_byte - orig_str); 226 1.1 agc if (len < 0 || pos < len) 227 1.1 agc str_byte++; 228 1.1 agc } 229 1.1 agc else 230 1.1 agc { 231 1.1 agc GET_NEXT_WCHAR(); 232 1.1 agc pos = 0; 233 1.1 agc } 234 1.1 agc 235 1.1 agc #if 0 236 1.1 agc /* Skip over characters that cannot possibly be the first character 237 1.1 agc of a match. */ 238 1.1 agc if (tnfa->firstpos_chars != NULL) 239 1.1 agc { 240 1.1 agc char *chars = tnfa->firstpos_chars; 241 1.1 agc 242 1.1 agc if (len < 0) 243 1.1 agc { 244 1.1 agc const char *orig_str = str_byte; 245 1.1 agc /* XXX - use strpbrk() and wcspbrk() because they might be 246 1.1 agc optimized for the target architecture. Try also strcspn() 247 1.1 agc and wcscspn() and compare the speeds. */ 248 1.1 agc while (next_c != L'\0' && !chars[next_c]) 249 1.1 agc { 250 1.1 agc next_c = *str_byte++; 251 1.1 agc } 252 1.1 agc prev_c = *(str_byte - 2); 253 1.1 agc pos += str_byte - orig_str; 254 1.1 agc DPRINT(("skipped %d chars\n", str_byte - orig_str)); 255 1.1 agc } 256 1.1 agc else 257 1.1 agc { 258 1.1 agc while (pos <= len && !chars[next_c]) 259 1.1 agc { 260 1.1 agc prev_c = next_c; 261 1.1 agc next_c = (unsigned char)(*str_byte++); 262 1.1 agc pos++; 263 1.1 agc } 264 1.1 agc } 265 1.1 agc } 266 1.1 agc #endif 267 1.1 agc 268 1.1 agc DPRINT(("length: %d\n", len)); 269 1.1 agc DPRINT(("pos:chr/code | states and tags\n")); 270 1.1 agc DPRINT(("-------------+------------------------------------------------\n")); 271 1.1 agc 272 1.1 agc reach_next_i = reach_next; 273 1.3 rin while (/*CONSTCOND*/(void)1,1) 274 1.1 agc { 275 1.1 agc /* If no match found yet, add the initial states to `reach_next'. */ 276 1.1 agc if (match_eo < 0) 277 1.1 agc { 278 1.1 agc DPRINT((" init >")); 279 1.1 agc trans_i = tnfa->initial; 280 1.1 agc while (trans_i->state != NULL) 281 1.1 agc { 282 1.1 agc if (reach_pos[trans_i->state_id].pos < pos) 283 1.1 agc { 284 1.1 agc if (trans_i->assertions 285 1.1 agc && CHECK_ASSERTIONS(trans_i->assertions)) 286 1.1 agc { 287 1.1 agc DPRINT(("assertion failed\n")); 288 1.1 agc trans_i++; 289 1.1 agc continue; 290 1.1 agc } 291 1.1 agc 292 1.1 agc DPRINT((" %p", (void *)trans_i->state)); 293 1.1 agc reach_next_i->state = trans_i->state; 294 1.1 agc for (i = 0; i < num_tags; i++) 295 1.1 agc reach_next_i->tags[i] = -1; 296 1.1 agc tag_i = trans_i->tags; 297 1.1 agc if (tag_i) 298 1.1 agc while (*tag_i >= 0) 299 1.1 agc { 300 1.2 christos if ((size_t)*tag_i < num_tags) 301 1.1 agc reach_next_i->tags[*tag_i] = pos; 302 1.1 agc tag_i++; 303 1.1 agc } 304 1.1 agc if (reach_next_i->state == tnfa->final) 305 1.1 agc { 306 1.1 agc DPRINT((" found empty match\n")); 307 1.1 agc match_eo = pos; 308 1.1 agc new_match = 1; 309 1.1 agc for (i = 0; i < num_tags; i++) 310 1.1 agc match_tags[i] = reach_next_i->tags[i]; 311 1.1 agc } 312 1.1 agc reach_pos[trans_i->state_id].pos = pos; 313 1.1 agc reach_pos[trans_i->state_id].tags = &reach_next_i->tags; 314 1.1 agc reach_next_i++; 315 1.1 agc } 316 1.1 agc trans_i++; 317 1.1 agc } 318 1.1 agc DPRINT(("\n")); 319 1.1 agc reach_next_i->state = NULL; 320 1.1 agc } 321 1.1 agc else 322 1.1 agc { 323 1.1 agc if (num_tags == 0 || reach_next_i == reach_next) 324 1.3 rin /* We have found a match. */ 325 1.1 agc break; 326 1.1 agc } 327 1.1 agc 328 1.1 agc /* Check for end of string. */ 329 1.1 agc if (len < 0) 330 1.1 agc { 331 1.1 agc if (type == STR_USER) 332 1.1 agc { 333 1.1 agc if (str_user_end) 334 1.1 agc break; 335 1.1 agc } 336 1.1 agc else if (next_c == L'\0') 337 1.1 agc break; 338 1.1 agc } 339 1.1 agc else 340 1.1 agc { 341 1.1 agc if (pos >= len) 342 1.1 agc break; 343 1.1 agc } 344 1.1 agc 345 1.1 agc GET_NEXT_WCHAR(); 346 1.1 agc 347 1.1 agc #ifdef TRE_DEBUG 348 1.1 agc DPRINT(("%3d:%2lc/%05d |", pos - 1, (tre_cint_t)prev_c, (int)prev_c)); 349 1.1 agc tre_print_reach(tnfa, reach_next, num_tags); 350 1.1 agc DPRINT(("%3d:%2lc/%05d |", pos, (tre_cint_t)next_c, (int)next_c)); 351 1.1 agc tre_print_reach(tnfa, reach_next, num_tags); 352 1.1 agc #endif /* TRE_DEBUG */ 353 1.1 agc 354 1.1 agc /* Swap `reach' and `reach_next'. */ 355 1.1 agc reach_i = reach; 356 1.1 agc reach = reach_next; 357 1.1 agc reach_next = reach_i; 358 1.1 agc 359 1.1 agc /* For each state in `reach', weed out states that don't fulfill the 360 1.1 agc minimal matching conditions. */ 361 1.1 agc if (tnfa->num_minimals && new_match) 362 1.1 agc { 363 1.1 agc new_match = 0; 364 1.1 agc reach_next_i = reach_next; 365 1.1 agc for (reach_i = reach; reach_i->state; reach_i++) 366 1.1 agc { 367 1.1 agc int skip = 0; 368 1.1 agc for (i = 0; tnfa->minimal_tags[i] >= 0; i += 2) 369 1.1 agc { 370 1.1 agc int end = tnfa->minimal_tags[i]; 371 1.1 agc int start = tnfa->minimal_tags[i + 1]; 372 1.1 agc DPRINT((" Minimal start %d, end %d\n", start, end)); 373 1.2 christos if ((size_t)end >= num_tags) 374 1.1 agc { 375 1.1 agc DPRINT((" Throwing %p out.\n", reach_i->state)); 376 1.1 agc skip = 1; 377 1.1 agc break; 378 1.1 agc } 379 1.1 agc else if (reach_i->tags[start] == match_tags[start] 380 1.1 agc && reach_i->tags[end] < match_tags[end]) 381 1.1 agc { 382 1.1 agc DPRINT((" Throwing %p out because t%d < %d\n", 383 1.1 agc reach_i->state, end, match_tags[end])); 384 1.1 agc skip = 1; 385 1.1 agc break; 386 1.1 agc } 387 1.1 agc } 388 1.1 agc if (!skip) 389 1.1 agc { 390 1.1 agc reach_next_i->state = reach_i->state; 391 1.1 agc tmp_iptr = reach_next_i->tags; 392 1.1 agc reach_next_i->tags = reach_i->tags; 393 1.1 agc reach_i->tags = tmp_iptr; 394 1.1 agc reach_next_i++; 395 1.1 agc } 396 1.1 agc } 397 1.1 agc reach_next_i->state = NULL; 398 1.1 agc 399 1.1 agc /* Swap `reach' and `reach_next'. */ 400 1.1 agc reach_i = reach; 401 1.1 agc reach = reach_next; 402 1.1 agc reach_next = reach_i; 403 1.1 agc } 404 1.1 agc 405 1.1 agc /* For each state in `reach' see if there is a transition leaving with 406 1.1 agc the current input symbol to a state not yet in `reach_next', and 407 1.1 agc add the destination states to `reach_next'. */ 408 1.1 agc reach_next_i = reach_next; 409 1.1 agc for (reach_i = reach; reach_i->state; reach_i++) 410 1.1 agc { 411 1.1 agc for (trans_i = reach_i->state; trans_i->state; trans_i++) 412 1.1 agc { 413 1.1 agc /* Does this transition match the input symbol? */ 414 1.1 agc if (trans_i->code_min <= (tre_cint_t)prev_c && 415 1.1 agc trans_i->code_max >= (tre_cint_t)prev_c) 416 1.1 agc { 417 1.1 agc if (trans_i->assertions 418 1.1 agc && (CHECK_ASSERTIONS(trans_i->assertions) 419 1.1 agc || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags))) 420 1.1 agc { 421 1.1 agc DPRINT(("assertion failed\n")); 422 1.1 agc continue; 423 1.1 agc } 424 1.1 agc 425 1.1 agc /* Compute the tags after this transition. */ 426 1.1 agc for (i = 0; i < num_tags; i++) 427 1.1 agc tmp_tags[i] = reach_i->tags[i]; 428 1.1 agc tag_i = trans_i->tags; 429 1.1 agc if (tag_i != NULL) 430 1.1 agc while (*tag_i >= 0) 431 1.1 agc { 432 1.2 christos if ((size_t)*tag_i < num_tags) 433 1.1 agc tmp_tags[*tag_i] = pos; 434 1.1 agc tag_i++; 435 1.1 agc } 436 1.1 agc 437 1.1 agc if (reach_pos[trans_i->state_id].pos < pos) 438 1.1 agc { 439 1.1 agc /* Found an unvisited node. */ 440 1.1 agc reach_next_i->state = trans_i->state; 441 1.1 agc tmp_iptr = reach_next_i->tags; 442 1.1 agc reach_next_i->tags = tmp_tags; 443 1.1 agc tmp_tags = tmp_iptr; 444 1.1 agc reach_pos[trans_i->state_id].pos = pos; 445 1.1 agc reach_pos[trans_i->state_id].tags = &reach_next_i->tags; 446 1.1 agc 447 1.1 agc if (reach_next_i->state == tnfa->final 448 1.1 agc && (match_eo == -1 449 1.1 agc || (num_tags > 0 450 1.1 agc && reach_next_i->tags[0] <= match_tags[0]))) 451 1.1 agc { 452 1.1 agc DPRINT((" found match %p\n", trans_i->state)); 453 1.1 agc match_eo = pos; 454 1.1 agc new_match = 1; 455 1.1 agc for (i = 0; i < num_tags; i++) 456 1.1 agc match_tags[i] = reach_next_i->tags[i]; 457 1.1 agc } 458 1.1 agc reach_next_i++; 459 1.1 agc 460 1.1 agc } 461 1.1 agc else 462 1.1 agc { 463 1.1 agc assert(reach_pos[trans_i->state_id].pos == pos); 464 1.1 agc /* Another path has also reached this state. We choose 465 1.1 agc the winner by examining the tag values for both 466 1.1 agc paths. */ 467 1.1 agc if (tre_tag_order(num_tags, tnfa->tag_directions, 468 1.1 agc tmp_tags, 469 1.1 agc *reach_pos[trans_i->state_id].tags)) 470 1.1 agc { 471 1.1 agc /* The new path wins. */ 472 1.1 agc tmp_iptr = *reach_pos[trans_i->state_id].tags; 473 1.1 agc *reach_pos[trans_i->state_id].tags = tmp_tags; 474 1.1 agc if (trans_i->state == tnfa->final) 475 1.1 agc { 476 1.1 agc DPRINT((" found better match\n")); 477 1.1 agc match_eo = pos; 478 1.1 agc new_match = 1; 479 1.1 agc for (i = 0; i < num_tags; i++) 480 1.1 agc match_tags[i] = tmp_tags[i]; 481 1.1 agc } 482 1.1 agc tmp_tags = tmp_iptr; 483 1.1 agc } 484 1.1 agc } 485 1.1 agc } 486 1.1 agc } 487 1.1 agc } 488 1.1 agc reach_next_i->state = NULL; 489 1.1 agc } 490 1.1 agc 491 1.1 agc DPRINT(("match end offset = %d\n", match_eo)); 492 1.1 agc 493 1.4 rin *match_end_ofs = match_eo; 494 1.4 rin ret = match_eo >= 0 ? REG_OK : REG_NOMATCH; 495 1.4 rin error_exit: 496 1.1 agc #ifndef TRE_USE_ALLOCA 497 1.1 agc if (buf) 498 1.1 agc xfree(buf); 499 1.1 agc #endif /* !TRE_USE_ALLOCA */ 500 1.4 rin return ret; 501 1.1 agc } 502 1.1 agc 503 1.1 agc /* EOF */ 504