reo.c revision 5dfecf96
15dfecf96Smrg/* 25dfecf96Smrg * Copyright (c) 2002 by The XFree86 Project, Inc. 35dfecf96Smrg * 45dfecf96Smrg * Permission is hereby granted, free of charge, to any person obtaining a 55dfecf96Smrg * copy of this software and associated documentation files (the "Software"), 65dfecf96Smrg * to deal in the Software without restriction, including without limitation 75dfecf96Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense, 85dfecf96Smrg * and/or sell copies of the Software, and to permit persons to whom the 95dfecf96Smrg * Software is furnished to do so, subject to the following conditions: 105dfecf96Smrg * 115dfecf96Smrg * The above copyright notice and this permission notice shall be included in 125dfecf96Smrg * all copies or substantial portions of the Software. 135dfecf96Smrg * 145dfecf96Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 155dfecf96Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 165dfecf96Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 175dfecf96Smrg * THE XFREE86 PROJECT BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 185dfecf96Smrg * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF 195dfecf96Smrg * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 205dfecf96Smrg * SOFTWARE. 215dfecf96Smrg * 225dfecf96Smrg * Except as contained in this notice, the name of the XFree86 Project shall 235dfecf96Smrg * not be used in advertising or otherwise to promote the sale, use or other 245dfecf96Smrg * dealings in this Software without prior written authorization from the 255dfecf96Smrg * XFree86 Project. 265dfecf96Smrg * 275dfecf96Smrg * Author: Paulo César Pereira de Andrade 285dfecf96Smrg */ 295dfecf96Smrg 305dfecf96Smrg/* $XFree86: xc/programs/xedit/lisp/re/reo.c,v 1.8 2002/09/29 02:55:01 paulo Exp $ */ 315dfecf96Smrg 325dfecf96Smrg#include "rep.h" 335dfecf96Smrg 345dfecf96Smrg/* 355dfecf96Smrg * This file is a placeholder to add code to analyse and optimize the 365dfecf96Smrg * intermediate data structure generated in rep.c. 375dfecf96Smrg * Character ranges are optimized while being generated. 385dfecf96Smrg */ 395dfecf96Smrg 405dfecf96Smrg/* 415dfecf96Smrg * Types 425dfecf96Smrg */ 435dfecf96Smrgtypedef struct _orec_inf { 445dfecf96Smrg rec_alt *alt; /* Main alternatives list */ 455dfecf96Smrg rec_grp *grp; /* Current group pointer */ 465dfecf96Smrg int flags; 475dfecf96Smrg int ecode; 485dfecf96Smrg} orec_inf; 495dfecf96Smrg 505dfecf96Smrg/* 515dfecf96Smrg * Prototypes 525dfecf96Smrg */ 535dfecf96Smrgstatic int orec_alt(orec_inf*, rec_alt*); 545dfecf96Smrgstatic int orec_pat(orec_inf*, rec_pat*); 555dfecf96Smrgstatic int orec_grp(orec_inf*, rec_grp*); 565dfecf96Smrgstatic int orec_pat_bad_rpt(orec_inf*, rec_pat*); 575dfecf96Smrgstatic int orec_pat_bad_forward_rpt(orec_inf*, rec_pat*); 585dfecf96Smrgstatic int orec_pat_rng(orec_inf*, rec_pat*); 595dfecf96Smrgstatic int orec_pat_cse(orec_inf*, rec_pat*); 605dfecf96Smrgstatic int orec_pat_cse_can(orec_inf*, rec_pat*); 615dfecf96Smrgstatic int orec_str_list(orec_inf*, rec_alt*, int, int); 625dfecf96Smrg 635dfecf96Smrg/* 645dfecf96Smrg * Initialization 655dfecf96Smrg */ 665dfecf96Smrgextern unsigned char re__alnum[256]; 675dfecf96Smrgextern unsigned char re__odigit[256]; 685dfecf96Smrgextern unsigned char re__ddigit[256]; 695dfecf96Smrgextern unsigned char re__xdigit[256]; 705dfecf96Smrgextern unsigned char re__control[256]; 715dfecf96Smrg 725dfecf96Smrg/* 735dfecf96Smrg * Implementation 745dfecf96Smrg */ 755dfecf96Smrgint 765dfecf96Smrgorec_comp(rec_alt *alt, int flags) 775dfecf96Smrg{ 785dfecf96Smrg orec_inf inf; 795dfecf96Smrg 805dfecf96Smrg inf.alt = alt; 815dfecf96Smrg inf.grp = NULL; 825dfecf96Smrg inf.flags = flags; 835dfecf96Smrg inf.ecode = 0; 845dfecf96Smrg 855dfecf96Smrg orec_alt(&inf, alt); 865dfecf96Smrg 875dfecf96Smrg return (inf.ecode); 885dfecf96Smrg} 895dfecf96Smrg 905dfecf96Smrgvoid 915dfecf96Smrgorec_free_stl(rec_stl *stl) 925dfecf96Smrg{ 935dfecf96Smrg int i; 945dfecf96Smrg 955dfecf96Smrg for (i = 0; i < stl->nstrs; i++) { 965dfecf96Smrg if (stl->lens[i] > 2) 975dfecf96Smrg free(stl->strs[i]); 985dfecf96Smrg } 995dfecf96Smrg 1005dfecf96Smrg free(stl->lens); 1015dfecf96Smrg free(stl->strs); 1025dfecf96Smrg free(stl); 1035dfecf96Smrg} 1045dfecf96Smrg 1055dfecf96Smrg 1065dfecf96Smrgstatic int 1075dfecf96Smrgorec_alt(orec_inf *inf, rec_alt *alt) 1085dfecf96Smrg{ 1095dfecf96Smrg if (alt) { 1105dfecf96Smrg rec_alt *ptr = alt; 1115dfecf96Smrg int ret, count = 0, str = 1, cstr = 1, lits = 0, clits = 0; 1125dfecf96Smrg 1135dfecf96Smrg /* Check if can build a string list */ 1145dfecf96Smrg if (ptr->next) { 1155dfecf96Smrg /* If more than one alternative */ 1165dfecf96Smrg while (ptr && (str || cstr)) { 1175dfecf96Smrg if (ptr->pat == NULL || ptr->pat->rep != NULL) { 1185dfecf96Smrg cstr = str = 0; 1195dfecf96Smrg break; 1205dfecf96Smrg } 1215dfecf96Smrg if ((inf->flags & RE_ICASE)) { 1225dfecf96Smrg if (!(ret = orec_pat_cse_can(inf, ptr->pat))) { 1235dfecf96Smrg cstr = str = 0; 1245dfecf96Smrg break; 1255dfecf96Smrg } 1265dfecf96Smrg if (ret == 1) 1275dfecf96Smrg ++lits; 1285dfecf96Smrg else if (ret == 2) 1295dfecf96Smrg ++clits; 1305dfecf96Smrg } 1315dfecf96Smrg else if (ptr->pat->next == NULL) { 1325dfecf96Smrg if (ptr->pat->type != Rep_String) { 1335dfecf96Smrg if (ptr->pat->type != Rep_Literal) { 1345dfecf96Smrg str = 0; 1355dfecf96Smrg if (ptr->pat->type != Rep_CaseString) { 1365dfecf96Smrg if (ptr->pat->type != Rep_CaseLiteral) 1375dfecf96Smrg cstr = 0; 1385dfecf96Smrg else 1395dfecf96Smrg ++clits; 1405dfecf96Smrg } 1415dfecf96Smrg else if (strlen((char*)ptr->pat->data.str) >= 255) 1425dfecf96Smrg str = cstr = 0; 1435dfecf96Smrg } 1445dfecf96Smrg else 1455dfecf96Smrg ++lits; 1465dfecf96Smrg } 1475dfecf96Smrg else if (strlen((char*)ptr->pat->data.str) >= 255) 1485dfecf96Smrg str = cstr = 0; 1495dfecf96Smrg } 1505dfecf96Smrg else { 1515dfecf96Smrg str = cstr = 0; 1525dfecf96Smrg break; 1535dfecf96Smrg } 1545dfecf96Smrg if (++count >= 255) 1555dfecf96Smrg str = cstr = 0; 1565dfecf96Smrg ptr = ptr->next; 1575dfecf96Smrg } 1585dfecf96Smrg 1595dfecf96Smrg if (str || cstr) { 1605dfecf96Smrg if (inf->flags & RE_ICASE) { 1615dfecf96Smrg for (ptr = alt; ptr; ptr = ptr->next) { 1625dfecf96Smrg if (orec_pat_cse(inf, ptr->pat)) 1635dfecf96Smrg return (inf->ecode); 1645dfecf96Smrg } 1655dfecf96Smrg str = 0; 1665dfecf96Smrg } 1675dfecf96Smrg return (orec_str_list(inf, alt, str, count)); 1685dfecf96Smrg } 1695dfecf96Smrg } 1705dfecf96Smrg else if (alt == inf->alt && alt->pat && alt->pat->rep == NULL) { 1715dfecf96Smrg /* If the toplevel single alternative */ 1725dfecf96Smrg switch (alt->pat->type) { 1735dfecf96Smrg /* One of these will always be true for RE_NOSPEC, 1745dfecf96Smrg * but can also be optimized for simple patterns */ 1755dfecf96Smrg case Rep_Literal: 1765dfecf96Smrg alt->pat->type = Rep_SearchLiteral; 1775dfecf96Smrg break; 1785dfecf96Smrg case Rep_CaseLiteral: 1795dfecf96Smrg alt->pat->type = Rep_SearchCaseLiteral; 1805dfecf96Smrg break; 1815dfecf96Smrg case Rep_String: 1825dfecf96Smrg alt->pat->type = Rep_SearchString; 1835dfecf96Smrg break; 1845dfecf96Smrg case Rep_CaseString: 1855dfecf96Smrg alt->pat->type = Rep_SearchCaseString; 1865dfecf96Smrg break; 1875dfecf96Smrg default: 1885dfecf96Smrg break; 1895dfecf96Smrg } 1905dfecf96Smrg } 1915dfecf96Smrg 1925dfecf96Smrg while (alt) { 1935dfecf96Smrg orec_pat(inf, alt->pat); 1945dfecf96Smrg alt = alt->next; 1955dfecf96Smrg } 1965dfecf96Smrg } 1975dfecf96Smrg 1985dfecf96Smrg return (inf->ecode); 1995dfecf96Smrg} 2005dfecf96Smrg 2015dfecf96Smrgstatic int 2025dfecf96Smrgorec_pat(orec_inf *inf, rec_pat *pat) 2035dfecf96Smrg{ 2045dfecf96Smrg rec_pat *next; 2055dfecf96Smrg 2065dfecf96Smrg while (pat) { 2075dfecf96Smrg switch (pat->type) { 2085dfecf96Smrg case Rep_AnyAnyTimes: 2095dfecf96Smrg if (pat->next == NULL) { 2105dfecf96Smrg rec_grp *grp = inf->grp; 2115dfecf96Smrg 2125dfecf96Smrg next = NULL; 2135dfecf96Smrg while (grp) { 2145dfecf96Smrg next = grp->parent->next; 2155dfecf96Smrg /* Cannot check if is .*$ as the input 2165dfecf96Smrg * may be a substring */ 2175dfecf96Smrg if (next) 2185dfecf96Smrg break; 2195dfecf96Smrg grp = grp->pgrp; 2205dfecf96Smrg } 2215dfecf96Smrg if (next == NULL) { 2225dfecf96Smrg /* <re>.* */ 2235dfecf96Smrg pat->type = Rep_AnyEatAnyTimes; 2245dfecf96Smrg grp = inf->grp; 2255dfecf96Smrg while (grp) { 2265dfecf96Smrg --grp->comp; 2275dfecf96Smrg next = grp->parent->next; 2285dfecf96Smrg if (next) 2295dfecf96Smrg break; 2305dfecf96Smrg grp = grp->pgrp; 2315dfecf96Smrg } 2325dfecf96Smrg } 2335dfecf96Smrg else if (orec_pat_bad_rpt(inf, next)) 2345dfecf96Smrg return (inf->ecode); 2355dfecf96Smrg } 2365dfecf96Smrg else if (orec_pat_bad_rpt(inf, pat->next)) 2375dfecf96Smrg return (inf->ecode); 2385dfecf96Smrg break; 2395dfecf96Smrg case Rep_AnyMaybe: 2405dfecf96Smrg if (pat->next == NULL) { 2415dfecf96Smrg rec_grp *grp = inf->grp; 2425dfecf96Smrg 2435dfecf96Smrg next = NULL; 2445dfecf96Smrg while (grp) { 2455dfecf96Smrg next = grp->parent->next; 2465dfecf96Smrg if (next) 2475dfecf96Smrg break; 2485dfecf96Smrg grp = grp->pgrp; 2495dfecf96Smrg } 2505dfecf96Smrg if (next == NULL) { 2515dfecf96Smrg /* <re>.? */ 2525dfecf96Smrg pat->type = Rep_AnyEatMaybe; 2535dfecf96Smrg grp = inf->grp; 2545dfecf96Smrg while (grp) { 2555dfecf96Smrg --grp->comp; 2565dfecf96Smrg next = grp->parent->next; 2575dfecf96Smrg if (next) 2585dfecf96Smrg break; 2595dfecf96Smrg grp = grp->pgrp; 2605dfecf96Smrg } 2615dfecf96Smrg } 2625dfecf96Smrg else if (orec_pat_bad_rpt(inf, next)) 2635dfecf96Smrg return (inf->ecode); 2645dfecf96Smrg } 2655dfecf96Smrg else if (orec_pat_bad_rpt(inf, pat->next)) 2665dfecf96Smrg return (inf->ecode); 2675dfecf96Smrg break; 2685dfecf96Smrg case Rep_AnyAtLeast: 2695dfecf96Smrg if (pat->next == NULL) { 2705dfecf96Smrg rec_grp *grp = inf->grp; 2715dfecf96Smrg 2725dfecf96Smrg next = NULL; 2735dfecf96Smrg while (grp) { 2745dfecf96Smrg next = grp->parent->next; 2755dfecf96Smrg if (next) 2765dfecf96Smrg break; 2775dfecf96Smrg grp = grp->pgrp; 2785dfecf96Smrg } 2795dfecf96Smrg if (next == NULL) { 2805dfecf96Smrg /* <re>.+ */ 2815dfecf96Smrg pat->type = Rep_AnyEatAtLeast; 2825dfecf96Smrg grp = inf->grp; 2835dfecf96Smrg while (grp) { 2845dfecf96Smrg --grp->comp; 2855dfecf96Smrg next = grp->parent->next; 2865dfecf96Smrg if (next) 2875dfecf96Smrg break; 2885dfecf96Smrg grp = grp->pgrp; 2895dfecf96Smrg } 2905dfecf96Smrg } 2915dfecf96Smrg else if (orec_pat_bad_rpt(inf, next)) 2925dfecf96Smrg return (inf->ecode); 2935dfecf96Smrg } 2945dfecf96Smrg else if (orec_pat_bad_rpt(inf, pat->next)) 2955dfecf96Smrg return (inf->ecode); 2965dfecf96Smrg break; 2975dfecf96Smrg case Rep_Range: 2985dfecf96Smrg case Rep_RangeNot: 2995dfecf96Smrg orec_pat_rng(inf, pat); 3005dfecf96Smrg break; 3015dfecf96Smrg case Rep_Group: 3025dfecf96Smrg orec_grp(inf, pat->data.grp); 3035dfecf96Smrg break; 3045dfecf96Smrg default: 3055dfecf96Smrg break; 3065dfecf96Smrg } 3075dfecf96Smrg pat = pat->next; 3085dfecf96Smrg } 3095dfecf96Smrg 3105dfecf96Smrg return (inf->ecode); 3115dfecf96Smrg} 3125dfecf96Smrg 3135dfecf96Smrgstatic int 3145dfecf96Smrgorec_pat_bad_rpt(orec_inf *inf, rec_pat *pat) 3155dfecf96Smrg{ 3165dfecf96Smrg switch (pat->type) { 3175dfecf96Smrg /* Not really an error, but aren't supported by the library. 3185dfecf96Smrg * Includes: .*.*, .+<re>? .*<re>*, (.*)(<re>*), etc. 3195dfecf96Smrg */ 3205dfecf96Smrg 3215dfecf96Smrg /* Not a repetition, but mathes anything... */ 3225dfecf96Smrg case Rep_Any: 3235dfecf96Smrg 3245dfecf96Smrg /* Zero length matches */ 3255dfecf96Smrg case Rep_Eol: 3265dfecf96Smrg if (!(inf->flags & RE_NEWLINE)) 3275dfecf96Smrg break; 3285dfecf96Smrg case Rep_Bol: 3295dfecf96Smrg case Rep_Bow: 3305dfecf96Smrg case Rep_Eow: 3315dfecf96Smrg 3325dfecf96Smrg /* Repetitions */ 3335dfecf96Smrg case Rep_AnyAnyTimes: 3345dfecf96Smrg case Rep_AnyMaybe: 3355dfecf96Smrg case Rep_AnyAtLeast: 3365dfecf96Smrg inf->ecode = RE_BADRPT; 3375dfecf96Smrg break; 3385dfecf96Smrg 3395dfecf96Smrg /* Check if the first group element is a complex pattern */ 3405dfecf96Smrg case Rep_Group: 3415dfecf96Smrg if (pat->rep == NULL) { 3425dfecf96Smrg if (pat->data.grp->alt) { 3435dfecf96Smrg for (pat = pat->data.grp->alt->pat; pat; pat = pat->next) { 3445dfecf96Smrg if (orec_pat_bad_rpt(inf, pat)) 3455dfecf96Smrg break; 3465dfecf96Smrg } 3475dfecf96Smrg } 3485dfecf96Smrg break; 3495dfecf96Smrg } 3505dfecf96Smrg /*FALLTHROUGH*/ 3515dfecf96Smrg default: 3525dfecf96Smrg if (pat->rep) 3535dfecf96Smrg inf->ecode = RE_BADRPT; 3545dfecf96Smrg break; 3555dfecf96Smrg } 3565dfecf96Smrg 3575dfecf96Smrg if (!inf->ecode && pat && pat->next) 3585dfecf96Smrg orec_pat_bad_forward_rpt(inf, pat->next); 3595dfecf96Smrg 3605dfecf96Smrg return (inf->ecode); 3615dfecf96Smrg} 3625dfecf96Smrg 3635dfecf96Smrgstatic int 3645dfecf96Smrgorec_pat_bad_forward_rpt(orec_inf *inf, rec_pat *pat) 3655dfecf96Smrg{ 3665dfecf96Smrg if (pat->rep) { 3675dfecf96Smrg switch (pat->rep->type) { 3685dfecf96Smrg case Rer_MinMax: 3695dfecf96Smrg if (pat->rep->mine > 0) 3705dfecf96Smrg break; 3715dfecf96Smrg case Rer_AnyTimes: 3725dfecf96Smrg case Rer_Maybe: 3735dfecf96Smrg case Rer_Max: 3745dfecf96Smrg inf->ecode = RE_BADRPT; 3755dfecf96Smrg default: 3765dfecf96Smrg break; 3775dfecf96Smrg } 3785dfecf96Smrg } 3795dfecf96Smrg else if (pat->type == Rep_Group && 3805dfecf96Smrg pat->data.grp->alt && 3815dfecf96Smrg pat->data.grp->alt->pat) 3825dfecf96Smrg orec_pat_bad_forward_rpt(inf, pat->data.grp->alt->pat); 3835dfecf96Smrg 3845dfecf96Smrg return (inf->ecode); 3855dfecf96Smrg} 3865dfecf96Smrg 3875dfecf96Smrgstatic int 3885dfecf96Smrgorec_grp(orec_inf *inf, rec_grp *grp) 3895dfecf96Smrg{ 3905dfecf96Smrg rec_grp *prev = inf->grp; 3915dfecf96Smrg 3925dfecf96Smrg inf->grp = grp; 3935dfecf96Smrg orec_alt(inf, grp->alt); 3945dfecf96Smrg /* Could also just say: inf->grp = grp->gparent */ 3955dfecf96Smrg inf->grp = prev; 3965dfecf96Smrg 3975dfecf96Smrg return (inf->ecode); 3985dfecf96Smrg} 3995dfecf96Smrg 4005dfecf96Smrgstatic int 4015dfecf96Smrgorec_pat_rng(orec_inf *inf, rec_pat *pat) 4025dfecf96Smrg{ 4035dfecf96Smrg int i, j[2], count; 4045dfecf96Smrg rec_pat_t type = pat->type; 4055dfecf96Smrg unsigned char *range = pat->data.rng->range; 4065dfecf96Smrg 4075dfecf96Smrg for (i = count = j[0] = j[1] = 0; i < 256; i++) { 4085dfecf96Smrg if (range[i]) { 4095dfecf96Smrg if (count == 2) { 4105dfecf96Smrg ++count; 4115dfecf96Smrg break; 4125dfecf96Smrg } 4135dfecf96Smrg j[count++] = i; 4145dfecf96Smrg } 4155dfecf96Smrg } 4165dfecf96Smrg 4175dfecf96Smrg if (count == 1 || 4185dfecf96Smrg (count == 2 && 4195dfecf96Smrg ((islower(j[0]) && toupper(j[0]) == j[1]) || 4205dfecf96Smrg (isupper(j[0]) && tolower(j[0]) == j[1])))) { 4215dfecf96Smrg free(pat->data.rng); 4225dfecf96Smrg if (count == 1) { 4235dfecf96Smrg pat->data.chr = j[0]; 4245dfecf96Smrg pat->type = type == Rep_Range ? Rep_Literal : Rep_LiteralNot; 4255dfecf96Smrg } 4265dfecf96Smrg else { 4275dfecf96Smrg pat->data.cse.upper = j[0]; 4285dfecf96Smrg pat->data.cse.lower = j[1]; 4295dfecf96Smrg pat->type = type == Rep_Range ? Rep_CaseLiteral : Rep_CaseLiteralNot; 4305dfecf96Smrg } 4315dfecf96Smrg } 4325dfecf96Smrg else { 4335dfecf96Smrg if (memcmp(re__alnum, range, 256) == 0) 4345dfecf96Smrg type = type == Rep_Range ? Rep_Alnum : Rep_AlnumNot; 4355dfecf96Smrg else if (memcmp(re__odigit, range, 256) == 0) 4365dfecf96Smrg type = type == Rep_Range ? Rep_Odigit : Rep_OdigitNot; 4375dfecf96Smrg else if (memcmp(re__ddigit, range, 256) == 0) 4385dfecf96Smrg type = type == Rep_Range ? Rep_Digit : Rep_DigitNot; 4395dfecf96Smrg else if (memcmp(re__xdigit, range, 256) == 0) 4405dfecf96Smrg type = type == Rep_Range ? Rep_Xdigit : Rep_XdigitNot; 4415dfecf96Smrg else if (memcmp(re__control, range, 256) == 0) 4425dfecf96Smrg type = type == Rep_Range ? Rep_Control : Rep_ControlNot; 4435dfecf96Smrg 4445dfecf96Smrg if (type != pat->type) { 4455dfecf96Smrg free(pat->data.rng); 4465dfecf96Smrg pat->type = type; 4475dfecf96Smrg } 4485dfecf96Smrg } 4495dfecf96Smrg 4505dfecf96Smrg return (inf->ecode); 4515dfecf96Smrg} 4525dfecf96Smrg 4535dfecf96Smrg/* Join patterns if required, will only fail on memory allocation failure: 4545dfecf96Smrg */ 4555dfecf96Smrgstatic int 4565dfecf96Smrgorec_pat_cse(orec_inf *inf, rec_pat *pat) 4575dfecf96Smrg{ 4585dfecf96Smrg rec_pat_t type; 4595dfecf96Smrg int i, len, length; 4605dfecf96Smrg rec_pat *ptr, *next; 4615dfecf96Smrg unsigned char *str, *tofree; 4625dfecf96Smrg 4635dfecf96Smrg if (pat->next == NULL && pat->type == Rep_CaseString) 4645dfecf96Smrg return (inf->ecode); 4655dfecf96Smrg 4665dfecf96Smrg type = Rep_CaseString; 4675dfecf96Smrg 4685dfecf96Smrg /* First calculate how many bytes will be required */ 4695dfecf96Smrg for (ptr = pat, length = 1; ptr; ptr = ptr->next) { 4705dfecf96Smrg switch (ptr->type) { 4715dfecf96Smrg case Rep_Literal: 4725dfecf96Smrg length += 2; 4735dfecf96Smrg break; 4745dfecf96Smrg case Rep_String: 4755dfecf96Smrg length += strlen((char*)ptr->data.str) << 1; 4765dfecf96Smrg break; 4775dfecf96Smrg case Rep_CaseLiteral: 4785dfecf96Smrg length += 2; 4795dfecf96Smrg break; 4805dfecf96Smrg case Rep_CaseString: 4815dfecf96Smrg length += strlen((char*)ptr->data.str); 4825dfecf96Smrg break; 4835dfecf96Smrg default: 4845dfecf96Smrg break; 4855dfecf96Smrg } 4865dfecf96Smrg } 4875dfecf96Smrg 4885dfecf96Smrg if ((str = malloc(length)) == NULL) 4895dfecf96Smrg return (inf->ecode = RE_ESPACE); 4905dfecf96Smrg 4915dfecf96Smrg for (ptr = pat, length = 0; ptr; ptr = next) { 4925dfecf96Smrg tofree = NULL; 4935dfecf96Smrg next = ptr->next; 4945dfecf96Smrg switch (ptr->type) { 4955dfecf96Smrg case Rep_Literal: 4965dfecf96Smrg str[length++] = ptr->data.chr; 4975dfecf96Smrg str[length++] = ptr->data.chr; 4985dfecf96Smrg break; 4995dfecf96Smrg case Rep_String: 5005dfecf96Smrg tofree = ptr->data.str; 5015dfecf96Smrg len = strlen((char*)tofree); 5025dfecf96Smrg for (i = 0; i < len; i++) { 5035dfecf96Smrg str[length++] = tofree[i]; 5045dfecf96Smrg str[length++] = tofree[i]; 5055dfecf96Smrg } 5065dfecf96Smrg break; 5075dfecf96Smrg case Rep_CaseLiteral: 5085dfecf96Smrg str[length++] = ptr->data.cse.lower; 5095dfecf96Smrg str[length++] = ptr->data.cse.upper; 5105dfecf96Smrg break; 5115dfecf96Smrg case Rep_CaseString: 5125dfecf96Smrg tofree = ptr->data.str; 5135dfecf96Smrg len = strlen((char*)tofree); 5145dfecf96Smrg memcpy(str + length, tofree, len); 5155dfecf96Smrg length += len; 5165dfecf96Smrg break; 5175dfecf96Smrg default: 5185dfecf96Smrg break; 5195dfecf96Smrg } 5205dfecf96Smrg if (tofree) 5215dfecf96Smrg free(tofree); 5225dfecf96Smrg if (ptr != pat) 5235dfecf96Smrg free(ptr); 5245dfecf96Smrg } 5255dfecf96Smrg str[length] = '\0'; 5265dfecf96Smrg 5275dfecf96Smrg pat->type = type; 5285dfecf96Smrg pat->data.str = str; 5295dfecf96Smrg pat->next = NULL; 5305dfecf96Smrg 5315dfecf96Smrg return (inf->ecode); 5325dfecf96Smrg} 5335dfecf96Smrg 5345dfecf96Smrg/* Return 0 if the patterns in the list cannot be merged, 1 if will 5355dfecf96Smrg * be a simple string, 2 if a case string. 5365dfecf96Smrg * This is useful when building an alternative list that is composed 5375dfecf96Smrg * only of strings, but the regex is case insensitive, in wich case 5385dfecf96Smrg * the first pass may have splited some patterns, but if it is a member 5395dfecf96Smrg * of an alternatives list, the cost of using a string list is smaller */ 5405dfecf96Smrgstatic int 5415dfecf96Smrgorec_pat_cse_can(orec_inf *inf, rec_pat *pat) 5425dfecf96Smrg{ 5435dfecf96Smrg int ret; 5445dfecf96Smrg 5455dfecf96Smrg if (pat == NULL) 5465dfecf96Smrg return (0); 5475dfecf96Smrg 5485dfecf96Smrg for (ret = 1; pat; pat = pat->next) { 5495dfecf96Smrg if (pat->rep) 5505dfecf96Smrg return (0); 5515dfecf96Smrg switch (pat->type) { 5525dfecf96Smrg case Rep_Literal: 5535dfecf96Smrg case Rep_String: 5545dfecf96Smrg break; 5555dfecf96Smrg case Rep_CaseLiteral: 5565dfecf96Smrg case Rep_CaseString: 5575dfecf96Smrg ret = 2; 5585dfecf96Smrg break; 5595dfecf96Smrg default: 5605dfecf96Smrg return (0); 5615dfecf96Smrg } 5625dfecf96Smrg } 5635dfecf96Smrg 5645dfecf96Smrg return (ret); 5655dfecf96Smrg} 5665dfecf96Smrg 5675dfecf96Smrg 5685dfecf96Smrg/* XXX If everything is a (case) byte, the pattern should be 5695dfecf96Smrg * [abcde] instead of a|b|c|d|e (or [aAbBcCdDeE] instead of aA|bB|cC|dD|eE) 5705dfecf96Smrg * as a string list works fine, but as a character range 5715dfecf96Smrg * should be faster, and maybe could be converted here. But not 5725dfecf96Smrg * very important, if performance is required, it should have already 5735dfecf96Smrg * been done in the pattern. 5745dfecf96Smrg */ 5755dfecf96Smrgstatic int 5765dfecf96Smrgorec_str_list(orec_inf *inf, rec_alt *alt, int str, int count) 5775dfecf96Smrg{ 5785dfecf96Smrg rec_stl *stl; 5795dfecf96Smrg rec_pat *pat; 5805dfecf96Smrg rec_alt *ptr, *next; 5815dfecf96Smrg int i, j, tlen, len, is; 5825dfecf96Smrg 5835dfecf96Smrg if ((stl = calloc(1, sizeof(rec_stl))) == NULL) 5845dfecf96Smrg return (inf->ecode = RE_ESPACE); 5855dfecf96Smrg 5865dfecf96Smrg if ((stl->lens = malloc(sizeof(unsigned char) * count)) == NULL) { 5875dfecf96Smrg free(stl); 5885dfecf96Smrg return (inf->ecode = RE_ESPACE); 5895dfecf96Smrg } 5905dfecf96Smrg 5915dfecf96Smrg if ((stl->strs = malloc(sizeof(char*) * count)) == NULL) { 5925dfecf96Smrg free(stl->lens); 5935dfecf96Smrg free(stl); 5945dfecf96Smrg return (inf->ecode = RE_ESPACE); 5955dfecf96Smrg } 5965dfecf96Smrg 5975dfecf96Smrg if ((pat = calloc(1, sizeof(rec_pat))) == NULL) { 5985dfecf96Smrg free(stl->strs); 5995dfecf96Smrg free(stl->lens); 6005dfecf96Smrg free(stl); 6015dfecf96Smrg return (inf->ecode = RE_ESPACE); 6025dfecf96Smrg } 6035dfecf96Smrg 6045dfecf96Smrg pat->data.stl = stl; 6055dfecf96Smrg pat->type = Rep_StringList; 6065dfecf96Smrg stl->type = str ? Resl_StringList : Resl_CaseStringList; 6075dfecf96Smrg for (i = tlen = 0, ptr = alt; i < count; i++) { 6085dfecf96Smrg next = ptr->next; 6095dfecf96Smrg switch (ptr->pat->type) { 6105dfecf96Smrg case Rep_Literal: 6115dfecf96Smrg is = len = 1; 6125dfecf96Smrg break; 6135dfecf96Smrg case Rep_CaseLiteral: 6145dfecf96Smrg is = len = 2; 6155dfecf96Smrg break; 6165dfecf96Smrg default: 6175dfecf96Smrg is = 0; 6185dfecf96Smrg len = strlen((char*)ptr->pat->data.str); 6195dfecf96Smrg break; 6205dfecf96Smrg } 6215dfecf96Smrg tlen += len; 6225dfecf96Smrg stl->lens[i] = len; 6235dfecf96Smrg if (!is) { 6245dfecf96Smrg if (len > 2) 6255dfecf96Smrg stl->strs[i] = ptr->pat->data.str; 6265dfecf96Smrg else { 6275dfecf96Smrg if (len == 1) 6285dfecf96Smrg stl->strs[i] = (void*)(long)(ptr->pat->data.str[0]); 6295dfecf96Smrg else 6305dfecf96Smrg stl->strs[i] = (void*)(long) 6315dfecf96Smrg (ptr->pat->data.str[0] | 6325dfecf96Smrg ((int)ptr->pat->data.str[1] << 8)); 6335dfecf96Smrg free(ptr->pat->data.str); 6345dfecf96Smrg } 6355dfecf96Smrg } 6365dfecf96Smrg else { 6375dfecf96Smrg if (is == 1) 6385dfecf96Smrg stl->strs[i] = (void*)(long)ptr->pat->data.chr; 6395dfecf96Smrg else 6405dfecf96Smrg stl->strs[i] = (void*)(long) 6415dfecf96Smrg (ptr->pat->data.cse.lower | 6425dfecf96Smrg (ptr->pat->data.cse.upper << 8)); 6435dfecf96Smrg } 6445dfecf96Smrg free(ptr->pat); 6455dfecf96Smrg if (i) 6465dfecf96Smrg free(ptr); 6475dfecf96Smrg ptr = next; 6485dfecf96Smrg } 6495dfecf96Smrg stl->tlen = tlen; 6505dfecf96Smrg stl->nstrs = count; 6515dfecf96Smrg 6525dfecf96Smrg alt->pat = pat; 6535dfecf96Smrg alt->next = NULL; 6545dfecf96Smrg 6555dfecf96Smrg { 6565dfecf96Smrg int li, lj; 6575dfecf96Smrg unsigned char ci, cj, *str; 6585dfecf96Smrg 6595dfecf96Smrg /* Don't need a stable sort, there shouldn't be duplicated strings, 6605dfecf96Smrg * but don't check for it either. Only need to make sure that all 6615dfecf96Smrg * strings that start with the same byte are together */ 6625dfecf96Smrg for (i = 0; i < count; i++) { 6635dfecf96Smrg li = stl->lens[i]; 6645dfecf96Smrg ci = li > 2 ? stl->strs[i][0] : (long)stl->strs[i] & 0xff; 6655dfecf96Smrg for (j = i + 1; j < count; j++) { 6665dfecf96Smrg lj = stl->lens[j]; 6675dfecf96Smrg cj = lj > 2 ? stl->strs[j][0] : (long)stl->strs[j] & 0xff; 6685dfecf96Smrg if ((count >= LARGE_STL_COUNT && cj < ci) || 6695dfecf96Smrg (cj == ci && lj > li)) { 6705dfecf96Smrg /* If both strings start with the same byte, 6715dfecf96Smrg * put the longer first */ 6725dfecf96Smrg str = stl->strs[j]; 6735dfecf96Smrg stl->strs[j] = stl->strs[i]; 6745dfecf96Smrg stl->strs[i] = str; 6755dfecf96Smrg stl->lens[j] = li; 6765dfecf96Smrg stl->lens[i] = lj; 6775dfecf96Smrg li ^= lj; lj ^= li; li ^= lj; 6785dfecf96Smrg ci ^= cj; cj ^= ci; ci ^= cj; 6795dfecf96Smrg } 6805dfecf96Smrg } 6815dfecf96Smrg } 6825dfecf96Smrg } 6835dfecf96Smrg 6845dfecf96Smrg return (inf->ecode); 6855dfecf96Smrg} 686