15dfecf96Smrg/*
25dfecf96Smrg * Copyright (c) 2002 by The XFree86 Project, Inc.
35dfecf96Smrg *
45dfecf96Smrg * Permission is hereby granted, free of charge, to any person obtaining a
55dfecf96Smrg * copy of this software and associated documentation files (the "Software"),
65dfecf96Smrg * to deal in the Software without restriction, including without limitation
75dfecf96Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense,
85dfecf96Smrg * and/or sell copies of the Software, and to permit persons to whom the
95dfecf96Smrg * Software is furnished to do so, subject to the following conditions:
105dfecf96Smrg *
115dfecf96Smrg * The above copyright notice and this permission notice shall be included in
125dfecf96Smrg * all copies or substantial portions of the Software.
135dfecf96Smrg *
145dfecf96Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
155dfecf96Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
165dfecf96Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
175dfecf96Smrg * THE XFREE86 PROJECT BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
185dfecf96Smrg * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
195dfecf96Smrg * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
205dfecf96Smrg * SOFTWARE.
215dfecf96Smrg *
225dfecf96Smrg * Except as contained in this notice, the name of the XFree86 Project shall
235dfecf96Smrg * not be used in advertising or otherwise to promote the sale, use or other
245dfecf96Smrg * dealings in this Software without prior written authorization from the
255dfecf96Smrg * XFree86 Project.
265dfecf96Smrg *
275dfecf96Smrg * Author: Paulo César Pereira de Andrade
285dfecf96Smrg */
295dfecf96Smrg
305dfecf96Smrg/* $XFree86: xc/programs/xedit/lisp/re/reo.c,v 1.8 2002/09/29 02:55:01 paulo Exp $ */
315dfecf96Smrg
325dfecf96Smrg#include "rep.h"
335dfecf96Smrg
345dfecf96Smrg/*
355dfecf96Smrg *  This file is a placeholder to add code to analyse and optimize the
365dfecf96Smrg * intermediate data structure generated in rep.c.
375dfecf96Smrg *  Character ranges are optimized while being generated.
385dfecf96Smrg */
395dfecf96Smrg
405dfecf96Smrg/*
415dfecf96Smrg * Types
425dfecf96Smrg */
435dfecf96Smrgtypedef struct _orec_inf {
445dfecf96Smrg    rec_alt *alt;			/* Main alternatives list */
455dfecf96Smrg    rec_grp *grp;			/* Current group pointer */
465dfecf96Smrg    int flags;
475dfecf96Smrg    int ecode;
485dfecf96Smrg} orec_inf;
495dfecf96Smrg
505dfecf96Smrg/*
515dfecf96Smrg * Prototypes
525dfecf96Smrg */
535dfecf96Smrgstatic int orec_alt(orec_inf*, rec_alt*);
545dfecf96Smrgstatic int orec_pat(orec_inf*, rec_pat*);
555dfecf96Smrgstatic int orec_grp(orec_inf*, rec_grp*);
565dfecf96Smrgstatic int orec_pat_bad_rpt(orec_inf*, rec_pat*);
575dfecf96Smrgstatic int orec_pat_bad_forward_rpt(orec_inf*, rec_pat*);
585dfecf96Smrgstatic int orec_pat_rng(orec_inf*, rec_pat*);
595dfecf96Smrgstatic int orec_pat_cse(orec_inf*, rec_pat*);
605dfecf96Smrgstatic int orec_pat_cse_can(orec_inf*, rec_pat*);
615dfecf96Smrgstatic int orec_str_list(orec_inf*, rec_alt*, int, int);
625dfecf96Smrg
635dfecf96Smrg/*
645dfecf96Smrg * Initialization
655dfecf96Smrg */
665dfecf96Smrgextern unsigned char re__alnum[256];
675dfecf96Smrgextern unsigned char re__odigit[256];
685dfecf96Smrgextern unsigned char re__ddigit[256];
695dfecf96Smrgextern unsigned char re__xdigit[256];
705dfecf96Smrgextern unsigned char re__control[256];
715dfecf96Smrg
725dfecf96Smrg/*
735dfecf96Smrg * Implementation
745dfecf96Smrg */
755dfecf96Smrgint
765dfecf96Smrgorec_comp(rec_alt *alt, int flags)
775dfecf96Smrg{
785dfecf96Smrg    orec_inf inf;
795dfecf96Smrg
805dfecf96Smrg    inf.alt = alt;
815dfecf96Smrg    inf.grp = NULL;
825dfecf96Smrg    inf.flags = flags;
835dfecf96Smrg    inf.ecode = 0;
845dfecf96Smrg
855dfecf96Smrg    orec_alt(&inf, alt);
865dfecf96Smrg
875dfecf96Smrg    return (inf.ecode);
885dfecf96Smrg}
895dfecf96Smrg
905dfecf96Smrgvoid
915dfecf96Smrgorec_free_stl(rec_stl *stl)
925dfecf96Smrg{
935dfecf96Smrg    int i;
945dfecf96Smrg
955dfecf96Smrg    for (i = 0; i < stl->nstrs; i++) {
965dfecf96Smrg	if (stl->lens[i] > 2)
975dfecf96Smrg	    free(stl->strs[i]);
985dfecf96Smrg    }
995dfecf96Smrg
1005dfecf96Smrg    free(stl->lens);
1015dfecf96Smrg    free(stl->strs);
1025dfecf96Smrg    free(stl);
1035dfecf96Smrg}
1045dfecf96Smrg
1055dfecf96Smrg
1065dfecf96Smrgstatic int
1075dfecf96Smrgorec_alt(orec_inf *inf, rec_alt *alt)
1085dfecf96Smrg{
1095dfecf96Smrg    if (alt) {
1105dfecf96Smrg	rec_alt *ptr = alt;
1115dfecf96Smrg	int ret, count = 0, str = 1, cstr = 1, lits = 0, clits = 0;
1125dfecf96Smrg
1135dfecf96Smrg	/* Check if can build a string list */
1145dfecf96Smrg	if (ptr->next) {
1155dfecf96Smrg	    /* If more than one alternative */
1165dfecf96Smrg	    while (ptr && (str || cstr)) {
1175dfecf96Smrg		if (ptr->pat == NULL || ptr->pat->rep != NULL) {
1185dfecf96Smrg		    cstr = str = 0;
1195dfecf96Smrg		    break;
1205dfecf96Smrg		}
1215dfecf96Smrg		if ((inf->flags & RE_ICASE)) {
1225dfecf96Smrg		    if (!(ret = orec_pat_cse_can(inf, ptr->pat))) {
1235dfecf96Smrg			cstr = str = 0;
1245dfecf96Smrg			break;
1255dfecf96Smrg		    }
1265dfecf96Smrg		    if (ret == 1)
1275dfecf96Smrg			++lits;
1285dfecf96Smrg		    else if (ret == 2)
1295dfecf96Smrg			++clits;
1305dfecf96Smrg		}
1315dfecf96Smrg		else if (ptr->pat->next == NULL) {
1325dfecf96Smrg		    if (ptr->pat->type != Rep_String) {
1335dfecf96Smrg			if (ptr->pat->type != Rep_Literal) {
1345dfecf96Smrg			    str = 0;
1355dfecf96Smrg			    if (ptr->pat->type != Rep_CaseString) {
1365dfecf96Smrg				if (ptr->pat->type != Rep_CaseLiteral)
1375dfecf96Smrg				    cstr = 0;
1385dfecf96Smrg				else
1395dfecf96Smrg				    ++clits;
1405dfecf96Smrg			    }
1415dfecf96Smrg			    else if (strlen((char*)ptr->pat->data.str) >= 255)
1425dfecf96Smrg				str = cstr = 0;
1435dfecf96Smrg			}
1445dfecf96Smrg			else
1455dfecf96Smrg			    ++lits;
1465dfecf96Smrg		    }
1475dfecf96Smrg		    else if (strlen((char*)ptr->pat->data.str) >= 255)
1485dfecf96Smrg			str = cstr = 0;
1495dfecf96Smrg		}
1505dfecf96Smrg		else {
1515dfecf96Smrg		    str = cstr = 0;
1525dfecf96Smrg		    break;
1535dfecf96Smrg		}
1545dfecf96Smrg		if (++count >= 255)
1555dfecf96Smrg		    str = cstr = 0;
1565dfecf96Smrg		ptr = ptr->next;
1575dfecf96Smrg	    }
1585dfecf96Smrg
1595dfecf96Smrg	    if (str || cstr) {
1605dfecf96Smrg		if (inf->flags & RE_ICASE) {
1615dfecf96Smrg		    for (ptr = alt; ptr; ptr = ptr->next) {
1625dfecf96Smrg			if (orec_pat_cse(inf, ptr->pat))
1635dfecf96Smrg			    return (inf->ecode);
1645dfecf96Smrg		    }
1655dfecf96Smrg		    str = 0;
1665dfecf96Smrg		}
1675dfecf96Smrg		return (orec_str_list(inf, alt, str, count));
1685dfecf96Smrg	    }
1695dfecf96Smrg	}
1705dfecf96Smrg	else if (alt == inf->alt && alt->pat && alt->pat->rep == NULL) {
1715dfecf96Smrg	    /* If the toplevel single alternative */
1725dfecf96Smrg	    switch (alt->pat->type) {
1735dfecf96Smrg		/*  One of these will always be true for RE_NOSPEC,
1745dfecf96Smrg		 * but can also be optimized for simple patterns */
1755dfecf96Smrg		case Rep_Literal:
1765dfecf96Smrg		    alt->pat->type = Rep_SearchLiteral;
1775dfecf96Smrg		    break;
1785dfecf96Smrg		case Rep_CaseLiteral:
1795dfecf96Smrg		    alt->pat->type = Rep_SearchCaseLiteral;
1805dfecf96Smrg		    break;
1815dfecf96Smrg		case Rep_String:
1825dfecf96Smrg		    alt->pat->type = Rep_SearchString;
1835dfecf96Smrg		    break;
1845dfecf96Smrg		case Rep_CaseString:
1855dfecf96Smrg		    alt->pat->type = Rep_SearchCaseString;
1865dfecf96Smrg		    break;
1875dfecf96Smrg		default:
1885dfecf96Smrg		    break;
1895dfecf96Smrg	    }
1905dfecf96Smrg	}
1915dfecf96Smrg
1925dfecf96Smrg	while (alt) {
1935dfecf96Smrg	    orec_pat(inf, alt->pat);
1945dfecf96Smrg	    alt = alt->next;
1955dfecf96Smrg	}
1965dfecf96Smrg    }
1975dfecf96Smrg
1985dfecf96Smrg    return (inf->ecode);
1995dfecf96Smrg}
2005dfecf96Smrg
2015dfecf96Smrgstatic int
2025dfecf96Smrgorec_pat(orec_inf *inf, rec_pat *pat)
2035dfecf96Smrg{
2045dfecf96Smrg    rec_pat *next;
2055dfecf96Smrg
2065dfecf96Smrg    while (pat) {
2075dfecf96Smrg	switch (pat->type) {
2085dfecf96Smrg	    case Rep_AnyAnyTimes:
2095dfecf96Smrg		if (pat->next == NULL) {
2105dfecf96Smrg		    rec_grp *grp = inf->grp;
2115dfecf96Smrg
2125dfecf96Smrg		    next = NULL;
2135dfecf96Smrg		    while (grp) {
2145dfecf96Smrg			next = grp->parent->next;
2155dfecf96Smrg			/* Cannot check if is .*$ as the input
2165dfecf96Smrg			 * may be a substring */
2175dfecf96Smrg			if (next)
2185dfecf96Smrg			    break;
2195dfecf96Smrg			grp = grp->pgrp;
2205dfecf96Smrg		    }
2215dfecf96Smrg		    if (next == NULL) {
2225dfecf96Smrg			/*    <re>.*    */
2235dfecf96Smrg			pat->type = Rep_AnyEatAnyTimes;
2245dfecf96Smrg			grp = inf->grp;
2255dfecf96Smrg			while (grp) {
2265dfecf96Smrg			    --grp->comp;
2275dfecf96Smrg			    next = grp->parent->next;
2285dfecf96Smrg			    if (next)
2295dfecf96Smrg				break;
2305dfecf96Smrg			    grp = grp->pgrp;
2315dfecf96Smrg			}
2325dfecf96Smrg		    }
2335dfecf96Smrg		    else if (orec_pat_bad_rpt(inf, next))
2345dfecf96Smrg			return (inf->ecode);
2355dfecf96Smrg		}
2365dfecf96Smrg		else if (orec_pat_bad_rpt(inf, pat->next))
2375dfecf96Smrg		    return (inf->ecode);
2385dfecf96Smrg		break;
2395dfecf96Smrg	    case Rep_AnyMaybe:
2405dfecf96Smrg		if (pat->next == NULL) {
2415dfecf96Smrg		    rec_grp *grp = inf->grp;
2425dfecf96Smrg
2435dfecf96Smrg		    next = NULL;
2445dfecf96Smrg		    while (grp) {
2455dfecf96Smrg			next = grp->parent->next;
2465dfecf96Smrg			if (next)
2475dfecf96Smrg			    break;
2485dfecf96Smrg			grp = grp->pgrp;
2495dfecf96Smrg		    }
2505dfecf96Smrg		    if (next == NULL) {
2515dfecf96Smrg			/*    <re>.?    */
2525dfecf96Smrg			pat->type = Rep_AnyEatMaybe;
2535dfecf96Smrg			grp = inf->grp;
2545dfecf96Smrg			while (grp) {
2555dfecf96Smrg			    --grp->comp;
2565dfecf96Smrg			    next = grp->parent->next;
2575dfecf96Smrg			    if (next)
2585dfecf96Smrg				break;
2595dfecf96Smrg			    grp = grp->pgrp;
2605dfecf96Smrg			}
2615dfecf96Smrg		    }
2625dfecf96Smrg		    else if (orec_pat_bad_rpt(inf, next))
2635dfecf96Smrg			return (inf->ecode);
2645dfecf96Smrg		}
2655dfecf96Smrg		else if (orec_pat_bad_rpt(inf, pat->next))
2665dfecf96Smrg		    return (inf->ecode);
2675dfecf96Smrg		break;
2685dfecf96Smrg	    case Rep_AnyAtLeast:
2695dfecf96Smrg		if (pat->next == NULL) {
2705dfecf96Smrg		    rec_grp *grp = inf->grp;
2715dfecf96Smrg
2725dfecf96Smrg		    next = NULL;
2735dfecf96Smrg		    while (grp) {
2745dfecf96Smrg			next = grp->parent->next;
2755dfecf96Smrg			if (next)
2765dfecf96Smrg			    break;
2775dfecf96Smrg			grp = grp->pgrp;
2785dfecf96Smrg		    }
2795dfecf96Smrg		    if (next == NULL) {
2805dfecf96Smrg			/*    <re>.+    */
2815dfecf96Smrg			pat->type = Rep_AnyEatAtLeast;
2825dfecf96Smrg			grp = inf->grp;
2835dfecf96Smrg			while (grp) {
2845dfecf96Smrg			    --grp->comp;
2855dfecf96Smrg			    next = grp->parent->next;
2865dfecf96Smrg			    if (next)
2875dfecf96Smrg				break;
2885dfecf96Smrg			    grp = grp->pgrp;
2895dfecf96Smrg			}
2905dfecf96Smrg		    }
2915dfecf96Smrg		    else if (orec_pat_bad_rpt(inf, next))
2925dfecf96Smrg			return (inf->ecode);
2935dfecf96Smrg		}
2945dfecf96Smrg		else if (orec_pat_bad_rpt(inf, pat->next))
2955dfecf96Smrg		    return (inf->ecode);
2965dfecf96Smrg		break;
2975dfecf96Smrg	    case Rep_Range:
2985dfecf96Smrg	    case Rep_RangeNot:
2995dfecf96Smrg		orec_pat_rng(inf, pat);
3005dfecf96Smrg		break;
3015dfecf96Smrg	    case Rep_Group:
3025dfecf96Smrg		orec_grp(inf, pat->data.grp);
3035dfecf96Smrg		break;
3045dfecf96Smrg	    default:
3055dfecf96Smrg		break;
3065dfecf96Smrg	}
3075dfecf96Smrg	pat = pat->next;
3085dfecf96Smrg    }
3095dfecf96Smrg
3105dfecf96Smrg    return (inf->ecode);
3115dfecf96Smrg}
3125dfecf96Smrg
3135dfecf96Smrgstatic int
3145dfecf96Smrgorec_pat_bad_rpt(orec_inf *inf, rec_pat *pat)
3155dfecf96Smrg{
3165dfecf96Smrg    switch (pat->type) {
3175dfecf96Smrg	/* Not really an error, but aren't supported by the library.
3185dfecf96Smrg	 * Includes:  .*.*, .+<re>?  .*<re>*, (.*)(<re>*), etc.
3195dfecf96Smrg	 */
3205dfecf96Smrg
3215dfecf96Smrg	/* Not a repetition, but mathes anything... */
3225dfecf96Smrg	case Rep_Any:
3235dfecf96Smrg
3245dfecf96Smrg	/* Zero length matches */
3255dfecf96Smrg	case Rep_Eol:
3265dfecf96Smrg	    if (!(inf->flags & RE_NEWLINE))
3275dfecf96Smrg		break;
3285dfecf96Smrg	case Rep_Bol:
3295dfecf96Smrg	case Rep_Bow:
3305dfecf96Smrg	case Rep_Eow:
3315dfecf96Smrg
3325dfecf96Smrg	/* Repetitions */
3335dfecf96Smrg	case Rep_AnyAnyTimes:
3345dfecf96Smrg	case Rep_AnyMaybe:
3355dfecf96Smrg	case Rep_AnyAtLeast:
3365dfecf96Smrg	    inf->ecode = RE_BADRPT;
3375dfecf96Smrg	    break;
3385dfecf96Smrg
3395dfecf96Smrg	/* Check if the first group element is a complex pattern */
3405dfecf96Smrg	case Rep_Group:
3415dfecf96Smrg	    if (pat->rep == NULL) {
3425dfecf96Smrg		if (pat->data.grp->alt) {
3435dfecf96Smrg		    for (pat = pat->data.grp->alt->pat; pat; pat = pat->next) {
3445dfecf96Smrg			if (orec_pat_bad_rpt(inf, pat))
3455dfecf96Smrg			    break;
3465dfecf96Smrg		    }
3475dfecf96Smrg		}
3485dfecf96Smrg		break;
3495dfecf96Smrg	    }
3505dfecf96Smrg	    /*FALLTHROUGH*/
3515dfecf96Smrg	default:
3525dfecf96Smrg	    if (pat->rep)
3535dfecf96Smrg		inf->ecode = RE_BADRPT;
3545dfecf96Smrg	    break;
3555dfecf96Smrg    }
3565dfecf96Smrg
3575dfecf96Smrg    if (!inf->ecode && pat && pat->next)
3585dfecf96Smrg	orec_pat_bad_forward_rpt(inf, pat->next);
3595dfecf96Smrg
3605dfecf96Smrg    return (inf->ecode);
3615dfecf96Smrg}
3625dfecf96Smrg
3635dfecf96Smrgstatic int
3645dfecf96Smrgorec_pat_bad_forward_rpt(orec_inf *inf, rec_pat *pat)
3655dfecf96Smrg{
3665dfecf96Smrg    if (pat->rep) {
3675dfecf96Smrg	switch (pat->rep->type) {
3685dfecf96Smrg	    case Rer_MinMax:
3695dfecf96Smrg		if (pat->rep->mine > 0)
3705dfecf96Smrg		    break;
3715dfecf96Smrg	    case Rer_AnyTimes:
3725dfecf96Smrg	    case Rer_Maybe:
3735dfecf96Smrg	    case Rer_Max:
3745dfecf96Smrg		inf->ecode = RE_BADRPT;
3755dfecf96Smrg	    default:
3765dfecf96Smrg		break;
3775dfecf96Smrg	}
3785dfecf96Smrg    }
3795dfecf96Smrg    else if (pat->type == Rep_Group &&
3805dfecf96Smrg	     pat->data.grp->alt &&
3815dfecf96Smrg	     pat->data.grp->alt->pat)
3825dfecf96Smrg	orec_pat_bad_forward_rpt(inf, pat->data.grp->alt->pat);
3835dfecf96Smrg
3845dfecf96Smrg    return (inf->ecode);
3855dfecf96Smrg}
3865dfecf96Smrg
3875dfecf96Smrgstatic int
3885dfecf96Smrgorec_grp(orec_inf *inf, rec_grp *grp)
3895dfecf96Smrg{
3905dfecf96Smrg    rec_grp *prev = inf->grp;
3915dfecf96Smrg
3925dfecf96Smrg    inf->grp = grp;
3935dfecf96Smrg    orec_alt(inf, grp->alt);
3945dfecf96Smrg    /* Could also just say: inf->grp = grp->gparent */
3955dfecf96Smrg    inf->grp = prev;
3965dfecf96Smrg
3975dfecf96Smrg    return (inf->ecode);
3985dfecf96Smrg}
3995dfecf96Smrg
4005dfecf96Smrgstatic int
4015dfecf96Smrgorec_pat_rng(orec_inf *inf, rec_pat *pat)
4025dfecf96Smrg{
4035dfecf96Smrg    int i, j[2], count;
4045dfecf96Smrg    rec_pat_t type = pat->type;
4055dfecf96Smrg    unsigned char *range = pat->data.rng->range;
4065dfecf96Smrg
4075dfecf96Smrg    for (i = count = j[0] = j[1] = 0; i < 256; i++) {
4085dfecf96Smrg	if (range[i]) {
4095dfecf96Smrg	    if (count == 2) {
4105dfecf96Smrg		++count;
4115dfecf96Smrg		break;
4125dfecf96Smrg	    }
4135dfecf96Smrg	    j[count++] = i;
4145dfecf96Smrg	}
4155dfecf96Smrg    }
4165dfecf96Smrg
4175dfecf96Smrg    if (count == 1 ||
4185dfecf96Smrg	(count == 2 &&
4195dfecf96Smrg	 ((islower(j[0]) && toupper(j[0]) == j[1]) ||
4205dfecf96Smrg	  (isupper(j[0]) && tolower(j[0]) == j[1])))) {
4215dfecf96Smrg	free(pat->data.rng);
4225dfecf96Smrg	if (count == 1) {
4235dfecf96Smrg	    pat->data.chr = j[0];
4245dfecf96Smrg	    pat->type = type == Rep_Range ? Rep_Literal : Rep_LiteralNot;
4255dfecf96Smrg	}
4265dfecf96Smrg	else {
4275dfecf96Smrg	    pat->data.cse.upper = j[0];
4285dfecf96Smrg	    pat->data.cse.lower = j[1];
4295dfecf96Smrg	    pat->type = type == Rep_Range ? Rep_CaseLiteral : Rep_CaseLiteralNot;
4305dfecf96Smrg	}
4315dfecf96Smrg    }
4325dfecf96Smrg    else {
4335dfecf96Smrg	if (memcmp(re__alnum, range, 256) == 0)
4345dfecf96Smrg	    type = type == Rep_Range ? Rep_Alnum : Rep_AlnumNot;
4355dfecf96Smrg	else if (memcmp(re__odigit, range, 256) == 0)
4365dfecf96Smrg	    type = type == Rep_Range ? Rep_Odigit : Rep_OdigitNot;
4375dfecf96Smrg	else if (memcmp(re__ddigit, range, 256) == 0)
4385dfecf96Smrg	    type = type == Rep_Range ? Rep_Digit : Rep_DigitNot;
4395dfecf96Smrg	else if (memcmp(re__xdigit, range, 256) == 0)
4405dfecf96Smrg	    type = type == Rep_Range ? Rep_Xdigit : Rep_XdigitNot;
4415dfecf96Smrg	else if (memcmp(re__control, range, 256) == 0)
4425dfecf96Smrg	    type = type == Rep_Range ? Rep_Control : Rep_ControlNot;
4435dfecf96Smrg
4445dfecf96Smrg	if (type != pat->type) {
4455dfecf96Smrg	    free(pat->data.rng);
4465dfecf96Smrg	    pat->type = type;
4475dfecf96Smrg	}
4485dfecf96Smrg    }
4495dfecf96Smrg
4505dfecf96Smrg    return (inf->ecode);
4515dfecf96Smrg}
4525dfecf96Smrg
4535dfecf96Smrg/*  Join patterns if required, will only fail on memory allocation failure:
4545dfecf96Smrg */
4555dfecf96Smrgstatic int
4565dfecf96Smrgorec_pat_cse(orec_inf *inf, rec_pat *pat)
4575dfecf96Smrg{
4585dfecf96Smrg    rec_pat_t type;
4595dfecf96Smrg    int i, len, length;
4605dfecf96Smrg    rec_pat *ptr, *next;
4615dfecf96Smrg    unsigned char *str, *tofree;
4625dfecf96Smrg
4635dfecf96Smrg    if (pat->next == NULL && pat->type == Rep_CaseString)
4645dfecf96Smrg	return (inf->ecode);
4655dfecf96Smrg
4665dfecf96Smrg    type = Rep_CaseString;
4675dfecf96Smrg
4685dfecf96Smrg    /* First calculate how many bytes will be required */
4695dfecf96Smrg    for (ptr = pat, length = 1; ptr; ptr = ptr->next) {
4705dfecf96Smrg	switch (ptr->type) {
4715dfecf96Smrg	    case Rep_Literal:
4725dfecf96Smrg		length += 2;
4735dfecf96Smrg		break;
4745dfecf96Smrg	    case Rep_String:
4755dfecf96Smrg		length += strlen((char*)ptr->data.str) << 1;
4765dfecf96Smrg		break;
4775dfecf96Smrg	    case Rep_CaseLiteral:
4785dfecf96Smrg		length += 2;
4795dfecf96Smrg		break;
4805dfecf96Smrg	    case Rep_CaseString:
4815dfecf96Smrg		length += strlen((char*)ptr->data.str);
4825dfecf96Smrg		break;
4835dfecf96Smrg	    default:
4845dfecf96Smrg		break;
4855dfecf96Smrg	}
4865dfecf96Smrg    }
4875dfecf96Smrg
4885dfecf96Smrg    if ((str = malloc(length)) == NULL)
4895dfecf96Smrg	return (inf->ecode = RE_ESPACE);
4905dfecf96Smrg
4915dfecf96Smrg    for (ptr = pat, length = 0; ptr; ptr = next) {
4925dfecf96Smrg	tofree = NULL;
4935dfecf96Smrg	next = ptr->next;
4945dfecf96Smrg	switch (ptr->type) {
4955dfecf96Smrg	    case Rep_Literal:
4965dfecf96Smrg		str[length++] = ptr->data.chr;
4975dfecf96Smrg		str[length++] = ptr->data.chr;
4985dfecf96Smrg		break;
4995dfecf96Smrg	    case Rep_String:
5005dfecf96Smrg		tofree = ptr->data.str;
5015dfecf96Smrg		len = strlen((char*)tofree);
5025dfecf96Smrg		for (i = 0; i < len; i++) {
5035dfecf96Smrg		    str[length++] = tofree[i];
5045dfecf96Smrg		    str[length++] = tofree[i];
5055dfecf96Smrg		}
5065dfecf96Smrg		break;
5075dfecf96Smrg	    case Rep_CaseLiteral:
5085dfecf96Smrg		str[length++] = ptr->data.cse.lower;
5095dfecf96Smrg		str[length++] = ptr->data.cse.upper;
5105dfecf96Smrg		break;
5115dfecf96Smrg	    case Rep_CaseString:
5125dfecf96Smrg		tofree = ptr->data.str;
5135dfecf96Smrg		len = strlen((char*)tofree);
5145dfecf96Smrg		memcpy(str + length, tofree, len);
5155dfecf96Smrg		length += len;
5165dfecf96Smrg		break;
5175dfecf96Smrg	    default:
5185dfecf96Smrg		break;
5195dfecf96Smrg	}
5205dfecf96Smrg	if (tofree)
5215dfecf96Smrg	    free(tofree);
5225dfecf96Smrg	if (ptr != pat)
5235dfecf96Smrg	    free(ptr);
5245dfecf96Smrg    }
5255dfecf96Smrg    str[length] = '\0';
5265dfecf96Smrg
5275dfecf96Smrg    pat->type = type;
5285dfecf96Smrg    pat->data.str = str;
5295dfecf96Smrg    pat->next = NULL;
5305dfecf96Smrg
5315dfecf96Smrg    return (inf->ecode);
5325dfecf96Smrg}
5335dfecf96Smrg
5345dfecf96Smrg/*  Return 0 if the patterns in the list cannot be merged, 1 if will
5355dfecf96Smrg * be a simple string, 2 if a case string.
5365dfecf96Smrg *  This is useful when building an alternative list that is composed
5375dfecf96Smrg * only of strings, but the regex is case insensitive, in wich case
5385dfecf96Smrg * the first pass may have splited some patterns, but if it is a member
5395dfecf96Smrg * of an alternatives list, the cost of using a string list is smaller */
5405dfecf96Smrgstatic int
5415dfecf96Smrgorec_pat_cse_can(orec_inf *inf, rec_pat *pat)
5425dfecf96Smrg{
5435dfecf96Smrg    int ret;
5445dfecf96Smrg
5455dfecf96Smrg    if (pat == NULL)
5465dfecf96Smrg	return (0);
5475dfecf96Smrg
5485dfecf96Smrg    for (ret = 1; pat; pat = pat->next) {
5495dfecf96Smrg	if (pat->rep)
5505dfecf96Smrg	    return (0);
5515dfecf96Smrg	switch (pat->type) {
5525dfecf96Smrg	    case Rep_Literal:
5535dfecf96Smrg	    case Rep_String:
5545dfecf96Smrg		break;
5555dfecf96Smrg	    case Rep_CaseLiteral:
5565dfecf96Smrg	    case Rep_CaseString:
5575dfecf96Smrg		ret = 2;
5585dfecf96Smrg		break;
5595dfecf96Smrg	    default:
5605dfecf96Smrg		return (0);
5615dfecf96Smrg	}
5625dfecf96Smrg    }
5635dfecf96Smrg
5645dfecf96Smrg    return (ret);
5655dfecf96Smrg}
5665dfecf96Smrg
5675dfecf96Smrg
5685dfecf96Smrg/*  XXX If everything is a (case) byte, the pattern should be
5695dfecf96Smrg * [abcde] instead of a|b|c|d|e (or [aAbBcCdDeE] instead of aA|bB|cC|dD|eE)
5705dfecf96Smrg * as a string list works fine, but as a character range
5715dfecf96Smrg * should be faster, and maybe could be converted here. But not
5725dfecf96Smrg * very important, if performance is required, it should have already
5735dfecf96Smrg * been done in the pattern.
5745dfecf96Smrg */
5755dfecf96Smrgstatic int
5765dfecf96Smrgorec_str_list(orec_inf *inf, rec_alt *alt, int str, int count)
5775dfecf96Smrg{
5785dfecf96Smrg    rec_stl *stl;
5795dfecf96Smrg    rec_pat *pat;
5805dfecf96Smrg    rec_alt *ptr, *next;
5815dfecf96Smrg    int i, j, tlen, len, is;
5825dfecf96Smrg
5835dfecf96Smrg    if ((stl = calloc(1, sizeof(rec_stl))) == NULL)
5845dfecf96Smrg	return (inf->ecode = RE_ESPACE);
5855dfecf96Smrg
5865dfecf96Smrg    if ((stl->lens = malloc(sizeof(unsigned char) * count)) == NULL) {
5875dfecf96Smrg	free(stl);
5885dfecf96Smrg	return (inf->ecode = RE_ESPACE);
5895dfecf96Smrg    }
5905dfecf96Smrg
5915dfecf96Smrg    if ((stl->strs = malloc(sizeof(char*) * count)) == NULL) {
5925dfecf96Smrg	free(stl->lens);
5935dfecf96Smrg	free(stl);
5945dfecf96Smrg	return (inf->ecode = RE_ESPACE);
5955dfecf96Smrg    }
5965dfecf96Smrg
5975dfecf96Smrg    if ((pat = calloc(1, sizeof(rec_pat))) == NULL) {
5985dfecf96Smrg	free(stl->strs);
5995dfecf96Smrg	free(stl->lens);
6005dfecf96Smrg	free(stl);
6015dfecf96Smrg	return (inf->ecode = RE_ESPACE);
6025dfecf96Smrg    }
6035dfecf96Smrg
6045dfecf96Smrg    pat->data.stl = stl;
6055dfecf96Smrg    pat->type = Rep_StringList;
6065dfecf96Smrg    stl->type = str ? Resl_StringList : Resl_CaseStringList;
6075dfecf96Smrg    for (i = tlen = 0, ptr = alt; i < count; i++) {
6085dfecf96Smrg	next = ptr->next;
6095dfecf96Smrg	switch (ptr->pat->type) {
6105dfecf96Smrg	    case Rep_Literal:
6115dfecf96Smrg		is = len = 1;
6125dfecf96Smrg		break;
6135dfecf96Smrg	    case Rep_CaseLiteral:
6145dfecf96Smrg		is = len = 2;
6155dfecf96Smrg		break;
6165dfecf96Smrg	    default:
6175dfecf96Smrg		is = 0;
6185dfecf96Smrg		len = strlen((char*)ptr->pat->data.str);
6195dfecf96Smrg		break;
6205dfecf96Smrg	}
6215dfecf96Smrg	tlen += len;
6225dfecf96Smrg	stl->lens[i] = len;
6235dfecf96Smrg	if (!is) {
6245dfecf96Smrg	    if (len > 2)
6255dfecf96Smrg		stl->strs[i] = ptr->pat->data.str;
6265dfecf96Smrg	    else {
6275dfecf96Smrg		if (len == 1)
6285dfecf96Smrg		    stl->strs[i] = (void*)(long)(ptr->pat->data.str[0]);
6295dfecf96Smrg		else
6305dfecf96Smrg		    stl->strs[i] = (void*)(long)
6315dfecf96Smrg				   (ptr->pat->data.str[0] |
6325dfecf96Smrg				    ((int)ptr->pat->data.str[1] << 8));
6335dfecf96Smrg		free(ptr->pat->data.str);
6345dfecf96Smrg	    }
6355dfecf96Smrg	}
6365dfecf96Smrg	else {
6375dfecf96Smrg	    if (is == 1)
6385dfecf96Smrg		stl->strs[i] = (void*)(long)ptr->pat->data.chr;
6395dfecf96Smrg	    else
6405dfecf96Smrg		stl->strs[i] = (void*)(long)
6415dfecf96Smrg			       (ptr->pat->data.cse.lower |
6425dfecf96Smrg				(ptr->pat->data.cse.upper << 8));
6435dfecf96Smrg	}
6445dfecf96Smrg	free(ptr->pat);
6455dfecf96Smrg	if (i)
6465dfecf96Smrg	    free(ptr);
6475dfecf96Smrg	ptr = next;
6485dfecf96Smrg    }
6495dfecf96Smrg    stl->tlen = tlen;
6505dfecf96Smrg    stl->nstrs = count;
6515dfecf96Smrg
6525dfecf96Smrg    alt->pat = pat;
6535dfecf96Smrg    alt->next = NULL;
6545dfecf96Smrg
6555dfecf96Smrg    {
6565dfecf96Smrg	int li, lj;
6575dfecf96Smrg	unsigned char ci, cj, *str;
6585dfecf96Smrg
6595dfecf96Smrg	/*  Don't need a stable sort, there shouldn't be duplicated strings,
6605dfecf96Smrg	 * but don't check for it either. Only need to make sure that all
6615dfecf96Smrg	 * strings that start with the same byte are together */
6625dfecf96Smrg	for (i = 0; i < count; i++) {
6635dfecf96Smrg	    li = stl->lens[i];
6645dfecf96Smrg	    ci = li > 2 ? stl->strs[i][0] : (long)stl->strs[i] & 0xff;
6655dfecf96Smrg	    for (j = i + 1; j < count; j++) {
6665dfecf96Smrg		lj = stl->lens[j];
6675dfecf96Smrg		cj = lj > 2 ? stl->strs[j][0] : (long)stl->strs[j] & 0xff;
6685dfecf96Smrg		if ((count >= LARGE_STL_COUNT && cj < ci) ||
6695dfecf96Smrg		    (cj == ci && lj > li)) {
6705dfecf96Smrg		    /* If both strings start with the same byte,
6715dfecf96Smrg		     * put the longer first */
6725dfecf96Smrg		    str = stl->strs[j];
6735dfecf96Smrg		    stl->strs[j] = stl->strs[i];
6745dfecf96Smrg		    stl->strs[i] = str;
6755dfecf96Smrg		    stl->lens[j] = li;
6765dfecf96Smrg		    stl->lens[i] = lj;
6775dfecf96Smrg		    li ^= lj; lj ^= li; li ^= lj;
6785dfecf96Smrg		    ci ^= cj; cj ^= ci; ci ^= cj;
6795dfecf96Smrg		}
6805dfecf96Smrg	    }
6815dfecf96Smrg	}
6825dfecf96Smrg    }
6835dfecf96Smrg
6845dfecf96Smrg    return (inf->ecode);
6855dfecf96Smrg}
686