1/* 2 * Copyright (c) 2002 by The XFree86 Project, Inc. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice shall be included in 12 * all copies or substantial portions of the Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 * THE XFREE86 PROJECT BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 18 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF 19 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 20 * SOFTWARE. 21 * 22 * Except as contained in this notice, the name of the XFree86 Project shall 23 * not be used in advertising or otherwise to promote the sale, use or other 24 * dealings in this Software without prior written authorization from the 25 * XFree86 Project. 26 * 27 * Author: Paulo César Pereira de Andrade 28 */ 29 30/* $XFree86: xc/programs/xedit/lisp/re/reo.c,v 1.8 2002/09/29 02:55:01 paulo Exp $ */ 31 32#include "rep.h" 33 34/* 35 * This file is a placeholder to add code to analyse and optimize the 36 * intermediate data structure generated in rep.c. 37 * Character ranges are optimized while being generated. 38 */ 39 40/* 41 * Types 42 */ 43typedef struct _orec_inf { 44 rec_alt *alt; /* Main alternatives list */ 45 rec_grp *grp; /* Current group pointer */ 46 int flags; 47 int ecode; 48} orec_inf; 49 50/* 51 * Prototypes 52 */ 53static int orec_alt(orec_inf*, rec_alt*); 54static int orec_pat(orec_inf*, rec_pat*); 55static int orec_grp(orec_inf*, rec_grp*); 56static int orec_pat_bad_rpt(orec_inf*, rec_pat*); 57static int orec_pat_bad_forward_rpt(orec_inf*, rec_pat*); 58static int orec_pat_rng(orec_inf*, rec_pat*); 59static int orec_pat_cse(orec_inf*, rec_pat*); 60static int orec_pat_cse_can(orec_inf*, rec_pat*); 61static int orec_str_list(orec_inf*, rec_alt*, int, int); 62 63/* 64 * Initialization 65 */ 66extern unsigned char re__alnum[256]; 67extern unsigned char re__odigit[256]; 68extern unsigned char re__ddigit[256]; 69extern unsigned char re__xdigit[256]; 70extern unsigned char re__control[256]; 71 72/* 73 * Implementation 74 */ 75int 76orec_comp(rec_alt *alt, int flags) 77{ 78 orec_inf inf; 79 80 inf.alt = alt; 81 inf.grp = NULL; 82 inf.flags = flags; 83 inf.ecode = 0; 84 85 orec_alt(&inf, alt); 86 87 return (inf.ecode); 88} 89 90void 91orec_free_stl(rec_stl *stl) 92{ 93 int i; 94 95 for (i = 0; i < stl->nstrs; i++) { 96 if (stl->lens[i] > 2) 97 free(stl->strs[i]); 98 } 99 100 free(stl->lens); 101 free(stl->strs); 102 free(stl); 103} 104 105 106static int 107orec_alt(orec_inf *inf, rec_alt *alt) 108{ 109 if (alt) { 110 rec_alt *ptr = alt; 111 int ret, count = 0, str = 1, cstr = 1, lits = 0, clits = 0; 112 113 /* Check if can build a string list */ 114 if (ptr->next) { 115 /* If more than one alternative */ 116 while (ptr && (str || cstr)) { 117 if (ptr->pat == NULL || ptr->pat->rep != NULL) { 118 cstr = str = 0; 119 break; 120 } 121 if ((inf->flags & RE_ICASE)) { 122 if (!(ret = orec_pat_cse_can(inf, ptr->pat))) { 123 cstr = str = 0; 124 break; 125 } 126 if (ret == 1) 127 ++lits; 128 else if (ret == 2) 129 ++clits; 130 } 131 else if (ptr->pat->next == NULL) { 132 if (ptr->pat->type != Rep_String) { 133 if (ptr->pat->type != Rep_Literal) { 134 str = 0; 135 if (ptr->pat->type != Rep_CaseString) { 136 if (ptr->pat->type != Rep_CaseLiteral) 137 cstr = 0; 138 else 139 ++clits; 140 } 141 else if (strlen((char*)ptr->pat->data.str) >= 255) 142 str = cstr = 0; 143 } 144 else 145 ++lits; 146 } 147 else if (strlen((char*)ptr->pat->data.str) >= 255) 148 str = cstr = 0; 149 } 150 else { 151 str = cstr = 0; 152 break; 153 } 154 if (++count >= 255) 155 str = cstr = 0; 156 ptr = ptr->next; 157 } 158 159 if (str || cstr) { 160 if (inf->flags & RE_ICASE) { 161 for (ptr = alt; ptr; ptr = ptr->next) { 162 if (orec_pat_cse(inf, ptr->pat)) 163 return (inf->ecode); 164 } 165 str = 0; 166 } 167 return (orec_str_list(inf, alt, str, count)); 168 } 169 } 170 else if (alt == inf->alt && alt->pat && alt->pat->rep == NULL) { 171 /* If the toplevel single alternative */ 172 switch (alt->pat->type) { 173 /* One of these will always be true for RE_NOSPEC, 174 * but can also be optimized for simple patterns */ 175 case Rep_Literal: 176 alt->pat->type = Rep_SearchLiteral; 177 break; 178 case Rep_CaseLiteral: 179 alt->pat->type = Rep_SearchCaseLiteral; 180 break; 181 case Rep_String: 182 alt->pat->type = Rep_SearchString; 183 break; 184 case Rep_CaseString: 185 alt->pat->type = Rep_SearchCaseString; 186 break; 187 default: 188 break; 189 } 190 } 191 192 while (alt) { 193 orec_pat(inf, alt->pat); 194 alt = alt->next; 195 } 196 } 197 198 return (inf->ecode); 199} 200 201static int 202orec_pat(orec_inf *inf, rec_pat *pat) 203{ 204 rec_pat *next; 205 206 while (pat) { 207 switch (pat->type) { 208 case Rep_AnyAnyTimes: 209 if (pat->next == NULL) { 210 rec_grp *grp = inf->grp; 211 212 next = NULL; 213 while (grp) { 214 next = grp->parent->next; 215 /* Cannot check if is .*$ as the input 216 * may be a substring */ 217 if (next) 218 break; 219 grp = grp->pgrp; 220 } 221 if (next == NULL) { 222 /* <re>.* */ 223 pat->type = Rep_AnyEatAnyTimes; 224 grp = inf->grp; 225 while (grp) { 226 --grp->comp; 227 next = grp->parent->next; 228 if (next) 229 break; 230 grp = grp->pgrp; 231 } 232 } 233 else if (orec_pat_bad_rpt(inf, next)) 234 return (inf->ecode); 235 } 236 else if (orec_pat_bad_rpt(inf, pat->next)) 237 return (inf->ecode); 238 break; 239 case Rep_AnyMaybe: 240 if (pat->next == NULL) { 241 rec_grp *grp = inf->grp; 242 243 next = NULL; 244 while (grp) { 245 next = grp->parent->next; 246 if (next) 247 break; 248 grp = grp->pgrp; 249 } 250 if (next == NULL) { 251 /* <re>.? */ 252 pat->type = Rep_AnyEatMaybe; 253 grp = inf->grp; 254 while (grp) { 255 --grp->comp; 256 next = grp->parent->next; 257 if (next) 258 break; 259 grp = grp->pgrp; 260 } 261 } 262 else if (orec_pat_bad_rpt(inf, next)) 263 return (inf->ecode); 264 } 265 else if (orec_pat_bad_rpt(inf, pat->next)) 266 return (inf->ecode); 267 break; 268 case Rep_AnyAtLeast: 269 if (pat->next == NULL) { 270 rec_grp *grp = inf->grp; 271 272 next = NULL; 273 while (grp) { 274 next = grp->parent->next; 275 if (next) 276 break; 277 grp = grp->pgrp; 278 } 279 if (next == NULL) { 280 /* <re>.+ */ 281 pat->type = Rep_AnyEatAtLeast; 282 grp = inf->grp; 283 while (grp) { 284 --grp->comp; 285 next = grp->parent->next; 286 if (next) 287 break; 288 grp = grp->pgrp; 289 } 290 } 291 else if (orec_pat_bad_rpt(inf, next)) 292 return (inf->ecode); 293 } 294 else if (orec_pat_bad_rpt(inf, pat->next)) 295 return (inf->ecode); 296 break; 297 case Rep_Range: 298 case Rep_RangeNot: 299 orec_pat_rng(inf, pat); 300 break; 301 case Rep_Group: 302 orec_grp(inf, pat->data.grp); 303 break; 304 default: 305 break; 306 } 307 pat = pat->next; 308 } 309 310 return (inf->ecode); 311} 312 313static int 314orec_pat_bad_rpt(orec_inf *inf, rec_pat *pat) 315{ 316 switch (pat->type) { 317 /* Not really an error, but aren't supported by the library. 318 * Includes: .*.*, .+<re>? .*<re>*, (.*)(<re>*), etc. 319 */ 320 321 /* Not a repetition, but mathes anything... */ 322 case Rep_Any: 323 324 /* Zero length matches */ 325 case Rep_Eol: 326 if (!(inf->flags & RE_NEWLINE)) 327 break; 328 case Rep_Bol: 329 case Rep_Bow: 330 case Rep_Eow: 331 332 /* Repetitions */ 333 case Rep_AnyAnyTimes: 334 case Rep_AnyMaybe: 335 case Rep_AnyAtLeast: 336 inf->ecode = RE_BADRPT; 337 break; 338 339 /* Check if the first group element is a complex pattern */ 340 case Rep_Group: 341 if (pat->rep == NULL) { 342 if (pat->data.grp->alt) { 343 for (pat = pat->data.grp->alt->pat; pat; pat = pat->next) { 344 if (orec_pat_bad_rpt(inf, pat)) 345 break; 346 } 347 } 348 break; 349 } 350 /*FALLTHROUGH*/ 351 default: 352 if (pat->rep) 353 inf->ecode = RE_BADRPT; 354 break; 355 } 356 357 if (!inf->ecode && pat && pat->next) 358 orec_pat_bad_forward_rpt(inf, pat->next); 359 360 return (inf->ecode); 361} 362 363static int 364orec_pat_bad_forward_rpt(orec_inf *inf, rec_pat *pat) 365{ 366 if (pat->rep) { 367 switch (pat->rep->type) { 368 case Rer_MinMax: 369 if (pat->rep->mine > 0) 370 break; 371 case Rer_AnyTimes: 372 case Rer_Maybe: 373 case Rer_Max: 374 inf->ecode = RE_BADRPT; 375 default: 376 break; 377 } 378 } 379 else if (pat->type == Rep_Group && 380 pat->data.grp->alt && 381 pat->data.grp->alt->pat) 382 orec_pat_bad_forward_rpt(inf, pat->data.grp->alt->pat); 383 384 return (inf->ecode); 385} 386 387static int 388orec_grp(orec_inf *inf, rec_grp *grp) 389{ 390 rec_grp *prev = inf->grp; 391 392 inf->grp = grp; 393 orec_alt(inf, grp->alt); 394 /* Could also just say: inf->grp = grp->gparent */ 395 inf->grp = prev; 396 397 return (inf->ecode); 398} 399 400static int 401orec_pat_rng(orec_inf *inf, rec_pat *pat) 402{ 403 int i, j[2], count; 404 rec_pat_t type = pat->type; 405 unsigned char *range = pat->data.rng->range; 406 407 for (i = count = j[0] = j[1] = 0; i < 256; i++) { 408 if (range[i]) { 409 if (count == 2) { 410 ++count; 411 break; 412 } 413 j[count++] = i; 414 } 415 } 416 417 if (count == 1 || 418 (count == 2 && 419 ((islower(j[0]) && toupper(j[0]) == j[1]) || 420 (isupper(j[0]) && tolower(j[0]) == j[1])))) { 421 free(pat->data.rng); 422 if (count == 1) { 423 pat->data.chr = j[0]; 424 pat->type = type == Rep_Range ? Rep_Literal : Rep_LiteralNot; 425 } 426 else { 427 pat->data.cse.upper = j[0]; 428 pat->data.cse.lower = j[1]; 429 pat->type = type == Rep_Range ? Rep_CaseLiteral : Rep_CaseLiteralNot; 430 } 431 } 432 else { 433 if (memcmp(re__alnum, range, 256) == 0) 434 type = type == Rep_Range ? Rep_Alnum : Rep_AlnumNot; 435 else if (memcmp(re__odigit, range, 256) == 0) 436 type = type == Rep_Range ? Rep_Odigit : Rep_OdigitNot; 437 else if (memcmp(re__ddigit, range, 256) == 0) 438 type = type == Rep_Range ? Rep_Digit : Rep_DigitNot; 439 else if (memcmp(re__xdigit, range, 256) == 0) 440 type = type == Rep_Range ? Rep_Xdigit : Rep_XdigitNot; 441 else if (memcmp(re__control, range, 256) == 0) 442 type = type == Rep_Range ? Rep_Control : Rep_ControlNot; 443 444 if (type != pat->type) { 445 free(pat->data.rng); 446 pat->type = type; 447 } 448 } 449 450 return (inf->ecode); 451} 452 453/* Join patterns if required, will only fail on memory allocation failure: 454 */ 455static int 456orec_pat_cse(orec_inf *inf, rec_pat *pat) 457{ 458 rec_pat_t type; 459 int i, len, length; 460 rec_pat *ptr, *next; 461 unsigned char *str, *tofree; 462 463 if (pat->next == NULL && pat->type == Rep_CaseString) 464 return (inf->ecode); 465 466 type = Rep_CaseString; 467 468 /* First calculate how many bytes will be required */ 469 for (ptr = pat, length = 1; ptr; ptr = ptr->next) { 470 switch (ptr->type) { 471 case Rep_Literal: 472 length += 2; 473 break; 474 case Rep_String: 475 length += strlen((char*)ptr->data.str) << 1; 476 break; 477 case Rep_CaseLiteral: 478 length += 2; 479 break; 480 case Rep_CaseString: 481 length += strlen((char*)ptr->data.str); 482 break; 483 default: 484 break; 485 } 486 } 487 488 if ((str = malloc(length)) == NULL) 489 return (inf->ecode = RE_ESPACE); 490 491 for (ptr = pat, length = 0; ptr; ptr = next) { 492 tofree = NULL; 493 next = ptr->next; 494 switch (ptr->type) { 495 case Rep_Literal: 496 str[length++] = ptr->data.chr; 497 str[length++] = ptr->data.chr; 498 break; 499 case Rep_String: 500 tofree = ptr->data.str; 501 len = strlen((char*)tofree); 502 for (i = 0; i < len; i++) { 503 str[length++] = tofree[i]; 504 str[length++] = tofree[i]; 505 } 506 break; 507 case Rep_CaseLiteral: 508 str[length++] = ptr->data.cse.lower; 509 str[length++] = ptr->data.cse.upper; 510 break; 511 case Rep_CaseString: 512 tofree = ptr->data.str; 513 len = strlen((char*)tofree); 514 memcpy(str + length, tofree, len); 515 length += len; 516 break; 517 default: 518 break; 519 } 520 if (tofree) 521 free(tofree); 522 if (ptr != pat) 523 free(ptr); 524 } 525 str[length] = '\0'; 526 527 pat->type = type; 528 pat->data.str = str; 529 pat->next = NULL; 530 531 return (inf->ecode); 532} 533 534/* Return 0 if the patterns in the list cannot be merged, 1 if will 535 * be a simple string, 2 if a case string. 536 * This is useful when building an alternative list that is composed 537 * only of strings, but the regex is case insensitive, in wich case 538 * the first pass may have splited some patterns, but if it is a member 539 * of an alternatives list, the cost of using a string list is smaller */ 540static int 541orec_pat_cse_can(orec_inf *inf, rec_pat *pat) 542{ 543 int ret; 544 545 if (pat == NULL) 546 return (0); 547 548 for (ret = 1; pat; pat = pat->next) { 549 if (pat->rep) 550 return (0); 551 switch (pat->type) { 552 case Rep_Literal: 553 case Rep_String: 554 break; 555 case Rep_CaseLiteral: 556 case Rep_CaseString: 557 ret = 2; 558 break; 559 default: 560 return (0); 561 } 562 } 563 564 return (ret); 565} 566 567 568/* XXX If everything is a (case) byte, the pattern should be 569 * [abcde] instead of a|b|c|d|e (or [aAbBcCdDeE] instead of aA|bB|cC|dD|eE) 570 * as a string list works fine, but as a character range 571 * should be faster, and maybe could be converted here. But not 572 * very important, if performance is required, it should have already 573 * been done in the pattern. 574 */ 575static int 576orec_str_list(orec_inf *inf, rec_alt *alt, int str, int count) 577{ 578 rec_stl *stl; 579 rec_pat *pat; 580 rec_alt *ptr, *next; 581 int i, j, tlen, len, is; 582 583 if ((stl = calloc(1, sizeof(rec_stl))) == NULL) 584 return (inf->ecode = RE_ESPACE); 585 586 if ((stl->lens = malloc(sizeof(unsigned char) * count)) == NULL) { 587 free(stl); 588 return (inf->ecode = RE_ESPACE); 589 } 590 591 if ((stl->strs = malloc(sizeof(char*) * count)) == NULL) { 592 free(stl->lens); 593 free(stl); 594 return (inf->ecode = RE_ESPACE); 595 } 596 597 if ((pat = calloc(1, sizeof(rec_pat))) == NULL) { 598 free(stl->strs); 599 free(stl->lens); 600 free(stl); 601 return (inf->ecode = RE_ESPACE); 602 } 603 604 pat->data.stl = stl; 605 pat->type = Rep_StringList; 606 stl->type = str ? Resl_StringList : Resl_CaseStringList; 607 for (i = tlen = 0, ptr = alt; i < count; i++) { 608 next = ptr->next; 609 switch (ptr->pat->type) { 610 case Rep_Literal: 611 is = len = 1; 612 break; 613 case Rep_CaseLiteral: 614 is = len = 2; 615 break; 616 default: 617 is = 0; 618 len = strlen((char*)ptr->pat->data.str); 619 break; 620 } 621 tlen += len; 622 stl->lens[i] = len; 623 if (!is) { 624 if (len > 2) 625 stl->strs[i] = ptr->pat->data.str; 626 else { 627 if (len == 1) 628 stl->strs[i] = (void*)(long)(ptr->pat->data.str[0]); 629 else 630 stl->strs[i] = (void*)(long) 631 (ptr->pat->data.str[0] | 632 ((int)ptr->pat->data.str[1] << 8)); 633 free(ptr->pat->data.str); 634 } 635 } 636 else { 637 if (is == 1) 638 stl->strs[i] = (void*)(long)ptr->pat->data.chr; 639 else 640 stl->strs[i] = (void*)(long) 641 (ptr->pat->data.cse.lower | 642 (ptr->pat->data.cse.upper << 8)); 643 } 644 free(ptr->pat); 645 if (i) 646 free(ptr); 647 ptr = next; 648 } 649 stl->tlen = tlen; 650 stl->nstrs = count; 651 652 alt->pat = pat; 653 alt->next = NULL; 654 655 { 656 int li, lj; 657 unsigned char ci, cj, *str; 658 659 /* Don't need a stable sort, there shouldn't be duplicated strings, 660 * but don't check for it either. Only need to make sure that all 661 * strings that start with the same byte are together */ 662 for (i = 0; i < count; i++) { 663 li = stl->lens[i]; 664 ci = li > 2 ? stl->strs[i][0] : (long)stl->strs[i] & 0xff; 665 for (j = i + 1; j < count; j++) { 666 lj = stl->lens[j]; 667 cj = lj > 2 ? stl->strs[j][0] : (long)stl->strs[j] & 0xff; 668 if ((count >= LARGE_STL_COUNT && cj < ci) || 669 (cj == ci && lj > li)) { 670 /* If both strings start with the same byte, 671 * put the longer first */ 672 str = stl->strs[j]; 673 stl->strs[j] = stl->strs[i]; 674 stl->strs[i] = str; 675 stl->lens[j] = li; 676 stl->lens[i] = lj; 677 li ^= lj; lj ^= li; li ^= lj; 678 ci ^= cj; cj ^= ci; ci ^= cj; 679 } 680 } 681 } 682 } 683 684 return (inf->ecode); 685} 686