15dfecf96Smrg/*
25dfecf96Smrg * Copyright (c) 2002 by The XFree86 Project, Inc.
35dfecf96Smrg *
45dfecf96Smrg * Permission is hereby granted, free of charge, to any person obtaining a
55dfecf96Smrg * copy of this software and associated documentation files (the "Software"),
65dfecf96Smrg * to deal in the Software without restriction, including without limitation
75dfecf96Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense,
85dfecf96Smrg * and/or sell copies of the Software, and to permit persons to whom the
95dfecf96Smrg * Software is furnished to do so, subject to the following conditions:
105dfecf96Smrg *
115dfecf96Smrg * The above copyright notice and this permission notice shall be included in
125dfecf96Smrg * all copies or substantial portions of the Software.
135dfecf96Smrg *
145dfecf96Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
155dfecf96Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
165dfecf96Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
175dfecf96Smrg * THE XFREE86 PROJECT BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
185dfecf96Smrg * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
195dfecf96Smrg * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
205dfecf96Smrg * SOFTWARE.
215dfecf96Smrg *
225dfecf96Smrg * Except as contained in this notice, the name of the XFree86 Project shall
235dfecf96Smrg * not be used in advertising or otherwise to promote the sale, use or other
245dfecf96Smrg * dealings in this Software without prior written authorization from the
255dfecf96Smrg * XFree86 Project.
265dfecf96Smrg *
275dfecf96Smrg * Author: Paulo César Pereira de Andrade
285dfecf96Smrg */
295dfecf96Smrg
305dfecf96Smrg/* $XFree86: xc/programs/xedit/lisp/re/rep.h,v 1.2 2002/11/15 07:01:33 paulo Exp $ */
315dfecf96Smrg
325dfecf96Smrg#include "re.h"
335dfecf96Smrg
345dfecf96Smrg#ifndef _rep_h
355dfecf96Smrg#define _rep_h
365dfecf96Smrg
375dfecf96Smrg/*
385dfecf96Smrg * Local defines
395dfecf96Smrg */
405dfecf96Smrg
415dfecf96Smrg#ifdef MIN
425dfecf96Smrg#undef MIN
435dfecf96Smrg#endif
445dfecf96Smrg#define MIN(a, b)	((a) < (b) ? (a) : (b))
455dfecf96Smrg
465dfecf96Smrg#ifdef MAX
475dfecf96Smrg#undef MAX
485dfecf96Smrg#endif
495dfecf96Smrg#define MAX(a, b)	((a) > (b) ? (a) : (b))
505dfecf96Smrg
515dfecf96Smrg/*  This value can not be larger than 255, a depth value is the nesting of
525dfecf96Smrg * repetition operations and alternatives. The number of nested parenthesis
535dfecf96Smrg * does not matter, but a repetition on the pattern inside the parenthesis
545dfecf96Smrg * does. Note also that you cannot have more than 9 parenthesis pairs in
555dfecf96Smrg * an expression.
565dfecf96Smrg *  Depth is always at least 1. So for MAX_DEPTH 8, it is only allowed
575dfecf96Smrg * 7 complex repetitions. A complex repetition is a dot followed by an
585dfecf96Smrg * repetition operator. It is called a complex repetition because dot
595dfecf96Smrg * matches anything but the empty string, so the engine needs to test
605dfecf96Smrg * all possible combinations until the end of the string is found.
615dfecf96Smrg *  Repetitions like .* use one depth until the end of the string is found,
625dfecf96Smrg * for example a.*b.*c.*d has depth 4, while a*b*c*d has depth 2.
635dfecf96Smrg */
645dfecf96Smrg#define MAX_DEPTH	8
655dfecf96Smrg
665dfecf96Smrg/*  Minimum number of strings to generate a "large" string list, that is,
675dfecf96Smrg * sort the strings and allocate 512 extra bytes to map the first string
685dfecf96Smrg * with a given initial byte. */
695dfecf96Smrg#define LARGE_STL_COUNT	16
705dfecf96Smrg
715dfecf96Smrg/*
725dfecf96Smrg * Local types
735dfecf96Smrg */
745dfecf96Smrg/* Intermediate compilation types declaration */
755dfecf96Smrg	/* (r)egular (e)xpression (c)ompile (c)a(se) */
765dfecf96Smrgtypedef struct _rec_cse rec_cse;
775dfecf96Smrg
785dfecf96Smrg	/* (r)egular (e)xpression (c)ompile (r)a(ng)e */
795dfecf96Smrgtypedef struct _rec_rng rec_rng;
805dfecf96Smrg
815dfecf96Smrg	/* (r)egular (e)xpression (c)ompile (pat)tern */
825dfecf96Smrgtypedef struct _rec_pat rec_pat;
835dfecf96Smrg
845dfecf96Smrg	/* (r)egular (e)xpression (c)ompile (rep)etition */
855dfecf96Smrgtypedef struct _rec_rep rec_rep;
865dfecf96Smrg
875dfecf96Smrg	/* (r)egular (e)xpression (c)ompile (gr)ou(p) */
885dfecf96Smrgtypedef struct _rec_grp rec_grp;
895dfecf96Smrg
905dfecf96Smrg	/* (r)egular (e)xpression (c)ompile (alt)ernatives */
915dfecf96Smrgtypedef struct _rec_alt rec_alt;
925dfecf96Smrg
935dfecf96Smrg
945dfecf96Smrg/* Optimization types */
955dfecf96Smrg	/* (r)egular (e)xpression (c)ompile (st)ring (l)ist */
965dfecf96Smrgtypedef struct _rec_stl rec_stl;
975dfecf96Smrg
985dfecf96Smrg/* Final compilation and execution types */
995dfecf96Smrg	/* (re)gular expression (inf)ormation */
1005dfecf96Smrgtypedef struct _re_inf re_inf;
1015dfecf96Smrg
1025dfecf96Smrg	/* (re)gular expression (eng)ine */
1035dfecf96Smrgtypedef struct _re_eng re_eng;
1045dfecf96Smrg
1055dfecf96Smrg
1065dfecf96Smrg/* Codes used by the engine */
1075dfecf96Smrgtypedef enum {
1085dfecf96Smrg    /* Grouping */
1095dfecf96Smrg    Re_Open,			/* ( */
1105dfecf96Smrg    Re_Close,			/* ) */
1115dfecf96Smrg    Re_Update,			/* Like Re_Close, but is inside a loop */
1125dfecf96Smrg
1135dfecf96Smrg    /* Alternatives */
1145dfecf96Smrg    Re_Alt,			/* Start alternative list, + next offset */
1155dfecf96Smrg    Re_AltNext,			/* Next alternative, + next offset */
1165dfecf96Smrg    Re_AltDone,			/* Finish alternative list */
1175dfecf96Smrg
1185dfecf96Smrg    /* Repetition */
1195dfecf96Smrg    Re_AnyTimes,		/* * */
1205dfecf96Smrg    Re_Maybe,			/* ? */
1215dfecf96Smrg    Re_AtLeast,			/* +, at least one */
1225dfecf96Smrg
1235dfecf96Smrg    /* Repetition like */
1245dfecf96Smrg    Re_AnyAnyTimes,		/* .*<re> */
1255dfecf96Smrg    Re_AnyMaybe,		/* .?<re> */
1265dfecf96Smrg    Re_AnyAtLeast,		/* .+<re> */
1275dfecf96Smrg
1285dfecf96Smrg    Re_AnyEatAnyTimes,		/* Expression ends with .* */
1295dfecf96Smrg    Re_AnyEatMaybe,		/* Expression ends with .? */
1305dfecf96Smrg    Re_AnyEatAtLeast,		/* Expression ends with .+ */
1315dfecf96Smrg
1325dfecf96Smrg    /* Repetition with arguments */
1335dfecf96Smrg    Re_Exact,			/* {e} */
1345dfecf96Smrg    Re_Min,			/* {n,} */
1355dfecf96Smrg    Re_Max,			/* {,m} */
1365dfecf96Smrg    Re_MinMax,			/* {n,m} */
1375dfecf96Smrg
1385dfecf96Smrg    /* Repetition helper instruction */
1395dfecf96Smrg    Re_RepJump,			/* Special code, go back to repetition */
1405dfecf96Smrg    Re_RepLongJump,		/* Jump needs two bytes */
1415dfecf96Smrg	/*  After the repetition data, all repetitions have an offset
1425dfecf96Smrg	 * to the code after the repetition */
1435dfecf96Smrg
1445dfecf96Smrg    /* Matching */
1455dfecf96Smrg    Re_Any,			/* . */
1465dfecf96Smrg    Re_Odigit,			/* \o */
1475dfecf96Smrg    Re_OdigitNot,		/* \O */
1485dfecf96Smrg    Re_Digit,			/* \d */
1495dfecf96Smrg    Re_DigitNot,		/* \D */
1505dfecf96Smrg    Re_Xdigit,			/* \x */
1515dfecf96Smrg    Re_XdigitNot,		/* \x */
1525dfecf96Smrg    Re_Space,			/* \s */
1535dfecf96Smrg    Re_SpaceNot,		/* \S */
1545dfecf96Smrg    Re_Tab,			/* \t */
1555dfecf96Smrg    Re_Newline,			/* \n */
1565dfecf96Smrg    Re_Lower,			/* \l */
1575dfecf96Smrg    Re_Upper,			/* \u */
1585dfecf96Smrg    Re_Alnum,			/* \w */
1595dfecf96Smrg    Re_AlnumNot,		/* \W */
1605dfecf96Smrg    Re_Control,			/* \c */
1615dfecf96Smrg    Re_ControlNot,		/* \C */
1625dfecf96Smrg    Re_Bol,			/* ^ */
1635dfecf96Smrg    Re_Eol,			/* $ */
1645dfecf96Smrg    Re_Bow,			/* \< */
1655dfecf96Smrg    Re_Eow,			/* \> */
1665dfecf96Smrg
1675dfecf96Smrg    /* Range matching information */
1685dfecf96Smrg    Re_Range,			/* + 256 bytes */
1695dfecf96Smrg    Re_RangeNot,		/* + 256 bytes */
1705dfecf96Smrg
1715dfecf96Smrg    /* Matching with arguments */
1725dfecf96Smrg    Re_Literal,			/* + character */
1735dfecf96Smrg    Re_CaseLiteral,		/* + lower + upper */
1745dfecf96Smrg    Re_LiteralNot,		/* + character */
1755dfecf96Smrg    Re_CaseLiteralNot,		/* + lower + upper */
1765dfecf96Smrg    Re_String,			/* + length + string */
1775dfecf96Smrg    Re_CaseString,		/* + length + string in format lower-upper */
1785dfecf96Smrg
1795dfecf96Smrg    /* These are useful to start matching, or when RE_NOSPEC is used. */
1805dfecf96Smrg    Re_SearchLiteral,
1815dfecf96Smrg    Re_SearchCaseLiteral,
1825dfecf96Smrg    Re_SearchString,
1835dfecf96Smrg    Re_SearchCaseString,
1845dfecf96Smrg
1855dfecf96Smrg    Re_StringList,		/* + total-length + lengths + strings */
1865dfecf96Smrg    Re_CaseStringList,		/* + total-length + lengths + strings */
1875dfecf96Smrg
1885dfecf96Smrg    Re_LargeStringList,		/* + total-length + lengths + map + strings */
1895dfecf96Smrg    Re_LargeCaseStringList,	/* + total-length + lengths + map + strings */
1905dfecf96Smrg
1915dfecf96Smrg    /* Backreference */
1925dfecf96Smrg    Re_Backref,			/* + reference number */
1935dfecf96Smrg
1945dfecf96Smrg    /* The last codes */
1955dfecf96Smrg    Re_DoneIf,			/* Done if at end of input */
1965dfecf96Smrg    Re_MaybeDone,		/* Done */
1975dfecf96Smrg    Re_Done			/* If this code found, finished execution */
1985dfecf96Smrg} ReCode;
1995dfecf96Smrg
2005dfecf96Smrg
2015dfecf96Smrg/* (r)egular (e)xpresssion (pat)rern (t)ype */
2025dfecf96Smrgtypedef enum _rec_pat_t {
2035dfecf96Smrg    Rep_Literal			= Re_Literal,
2045dfecf96Smrg    Rep_CaseLiteral		= Re_CaseLiteral,
2055dfecf96Smrg    Rep_LiteralNot		= Re_LiteralNot,
2065dfecf96Smrg    Rep_CaseLiteralNot		= Re_CaseLiteralNot,
2075dfecf96Smrg    Rep_Range			= Re_Range,
2085dfecf96Smrg    Rep_RangeNot		= Re_RangeNot,
2095dfecf96Smrg    Rep_String			= Re_String,
2105dfecf96Smrg    Rep_CaseString		= Re_CaseString,
2115dfecf96Smrg    Rep_SearchLiteral		= Re_SearchLiteral,
2125dfecf96Smrg    Rep_SearchCaseLiteral	= Re_SearchCaseLiteral,
2135dfecf96Smrg    Rep_SearchString		= Re_SearchString,
2145dfecf96Smrg    Rep_SearchCaseString	= Re_SearchCaseString,
2155dfecf96Smrg    Rep_Any			= Re_Any,
2165dfecf96Smrg    Rep_AnyAnyTimes		= Re_AnyAnyTimes,
2175dfecf96Smrg    Rep_AnyEatAnyTimes		= Re_AnyEatAnyTimes,
2185dfecf96Smrg    Rep_AnyMaybe		= Re_AnyMaybe,
2195dfecf96Smrg    Rep_AnyEatMaybe		= Re_AnyEatMaybe,
2205dfecf96Smrg    Rep_AnyAtLeast		= Re_AnyAtLeast,
2215dfecf96Smrg    Rep_AnyEatAtLeast		= Re_AnyEatAtLeast,
2225dfecf96Smrg    Rep_Odigit			= Re_Odigit,
2235dfecf96Smrg    Rep_OdigitNot		= Re_OdigitNot,
2245dfecf96Smrg    Rep_Digit			= Re_Digit,
2255dfecf96Smrg    Rep_DigitNot		= Re_DigitNot,
2265dfecf96Smrg    Rep_Xdigit			= Re_Xdigit,
2275dfecf96Smrg    Rep_XdigitNot		= Re_XdigitNot,
2285dfecf96Smrg    Rep_Space			= Re_Space,
2295dfecf96Smrg    Rep_SpaceNot		= Re_SpaceNot,
2305dfecf96Smrg    Rep_Tab			= Re_Tab,
2315dfecf96Smrg    Rep_Newline			= Re_Newline,
2325dfecf96Smrg    Rep_Lower			= Re_Lower,
2335dfecf96Smrg    Rep_Upper			= Re_Upper,
2345dfecf96Smrg    Rep_Alnum			= Re_Alnum,
2355dfecf96Smrg    Rep_AlnumNot		= Re_AlnumNot,
2365dfecf96Smrg    Rep_Control			= Re_Control,
2375dfecf96Smrg    Rep_ControlNot		= Re_ControlNot,
2385dfecf96Smrg    Rep_Bol			= Re_Bol,
2395dfecf96Smrg    Rep_Eol			= Re_Eol,
2405dfecf96Smrg    Rep_Bow			= Re_Bow,
2415dfecf96Smrg    Rep_Eow			= Re_Eow,
2425dfecf96Smrg    Rep_Backref			= Re_Backref,
2435dfecf96Smrg    Rep_StringList		= Re_StringList,
2445dfecf96Smrg    Rep_Group			= Re_Open
2455dfecf96Smrg} rec_pat_t;
2465dfecf96Smrg
2475dfecf96Smrg
2485dfecf96Smrg/* (r)egular (e)xpression (rep)etition (t)ype */
2495dfecf96Smrgtypedef enum _rec_rep_t {
2505dfecf96Smrg    Rer_AnyTimes		= Re_AnyTimes,
2515dfecf96Smrg    Rer_AtLeast			= Re_AtLeast,
2525dfecf96Smrg    Rer_Maybe			= Re_Maybe,
2535dfecf96Smrg    Rer_Exact			= Re_Exact,
2545dfecf96Smrg    Rer_Min			= Re_Min,
2555dfecf96Smrg    Rer_Max			= Re_Max,
2565dfecf96Smrg    Rer_MinMax			= Re_MinMax
2575dfecf96Smrg} rec_rep_t;
2585dfecf96Smrg
2595dfecf96Smrg
2605dfecf96Smrg/*  Decide at re compilation time what is lowercase and what is uppercase */
2615dfecf96Smrgstruct _rec_cse {
2625dfecf96Smrg    unsigned char lower;
2635dfecf96Smrg    unsigned char upper;
2645dfecf96Smrg};
2655dfecf96Smrg
2665dfecf96Smrg
2675dfecf96Smrg/*  A rec_rng is used only during compilation, just a character map */
2685dfecf96Smrgstruct _rec_rng {
2695dfecf96Smrg    unsigned char range[256];
2705dfecf96Smrg};
2715dfecf96Smrg
2725dfecf96Smrg
2735dfecf96Smrg/*  A rec_pat is used only during compilation, and can be viewed as
2745dfecf96Smrg * a regular expression element like a match to any character, a match
2755dfecf96Smrg * to the beginning or end of the line, etc.
2765dfecf96Smrg *  It is implemented as a linked list, and does not have nesting.
2775dfecf96Smrg *  The data field can contain:
2785dfecf96Smrg *	chr:	the value of a single character to match.
2795dfecf96Smrg *	cse:	the upper and lower case value of a character to match.
2805dfecf96Smrg *	rng:	a character map to match or not match.
2815dfecf96Smrg *	str:	a simple string or a string where every two bytes
2825dfecf96Smrg *		represents the character to match, in lower/upper
2835dfecf96Smrg *		case sequence.
2845dfecf96Smrg *  The rep field is not used for strings, strings are broken in the
2855dfecf96Smrg * last character in this case. That is, strings are just a concatenation
2865dfecf96Smrg * of several character matches.
2875dfecf96Smrg */
2885dfecf96Smrgstruct _rec_pat {
2895dfecf96Smrg    rec_pat_t type;
2905dfecf96Smrg    rec_pat *next, *prev;	/* Linked list information */
2915dfecf96Smrg    union {
2925dfecf96Smrg	unsigned char chr;
2935dfecf96Smrg	rec_cse cse;
2945dfecf96Smrg	rec_rng *rng;
2955dfecf96Smrg	rec_grp *grp;
2965dfecf96Smrg	unsigned char *str;
2975dfecf96Smrg	rec_stl *stl;
2985dfecf96Smrg    } data;
2995dfecf96Smrg    rec_rep *rep;		/* Pattern repetition information */
3005dfecf96Smrg};
3015dfecf96Smrg
3025dfecf96Smrg
3035dfecf96Smrg/*  A rec_rep is used only during compilation, and can be viewed as:
3045dfecf96Smrg *
3055dfecf96Smrg *	? or * or + or {<e>} or {<m>,} or {,<M>} or {<m>,<M>}
3065dfecf96Smrg *
3075dfecf96Smrg * where <e> is "exact", <m> is "minimum" and <M> is "maximum".
3085dfecf96Smrg *  In the compiled step it can also be just a NULL pointer, that
3095dfecf96Smrg * is actually equivalent to {1}.
3105dfecf96Smrg */
3115dfecf96Smrgstruct _rec_rep {
3125dfecf96Smrg    rec_rep_t type;
3135dfecf96Smrg    short mine;			/* minimum or exact number of matches */
3145dfecf96Smrg    short maxc;			/* maximum number of matches */
3155dfecf96Smrg};
3165dfecf96Smrg
3175dfecf96Smrg
3185dfecf96Smrg/*  A rec_alt is used only during compilation, and can be viewed as:
3195dfecf96Smrg *
3205dfecf96Smrg *	<re>|<re>
3215dfecf96Smrg *
3225dfecf96Smrg * where <re> is any regular expression. The expressions are nested
3235dfecf96Smrg * using the grp field of the rec_pat structure.
3245dfecf96Smrg */
3255dfecf96Smrgstruct _rec_alt {
3265dfecf96Smrg    rec_alt *next, *prev;	/* Linked list information */
3275dfecf96Smrg    rec_pat *pat;
3285dfecf96Smrg};
3295dfecf96Smrg
3305dfecf96Smrg
3315dfecf96Smrg/*  A rec_grp is a place holder for expressions enclosed in parenthesis
3325dfecf96Smrg * and is linked to the compilation data by an rec_pat structure. */
3335dfecf96Smrgstruct _rec_grp {
3345dfecf96Smrg    rec_pat *parent;		/* Reference to parent pattern */
3355dfecf96Smrg    rec_alt *alt;		/* The pattern information */
3365dfecf96Smrg    rec_alt *palt;		/* Parent alternative */
3375dfecf96Smrg    rec_grp *pgrp;		/* Nested groups */
3385dfecf96Smrg    int comp;			/* (comp)lex repetition pattern inside group */
3395dfecf96Smrg};
3405dfecf96Smrg
3415dfecf96Smrg
3425dfecf96Smrg/* Optimization compilation types definition */
3435dfecf96Smrg	/* (r)egular (e)xpression (c)ompile (st)ring (l)ist (t)ype */
3445dfecf96Smrgtypedef enum {
3455dfecf96Smrg    Resl_StringList		= Re_StringList,
3465dfecf96Smrg    Resl_CaseStringList		= Re_CaseStringList
3475dfecf96Smrg} rec_stl_t;
3485dfecf96Smrg
3495dfecf96Smrgstruct _rec_stl {
3505dfecf96Smrg    rec_stl_t type;
3515dfecf96Smrg    int nstrs;			/* Number of strings in list */
3525dfecf96Smrg    int tlen;			/* Total length of all strings */
3535dfecf96Smrg    unsigned char *lens;	/* Vector of string lengths */
3545dfecf96Smrg    unsigned char **strs;	/* The strings */
3555dfecf96Smrg};
3565dfecf96Smrg
3575dfecf96Smrg
3585dfecf96Smrg/*
3595dfecf96Smrg * Prototypes
3605dfecf96Smrg */
3615dfecf96Smrg	/* rep.c */
3625dfecf96Smrgrec_alt *irec_comp(const char*, const char*, int, int*);
3635dfecf96Smrgvoid irec_free_alt(rec_alt*);
3645dfecf96Smrg
3655dfecf96Smrg	/* reo.c */
3665dfecf96Smrgint orec_comp(rec_alt*, int);
3675dfecf96Smrgvoid orec_free_stl(rec_stl*);
3685dfecf96Smrg
3695dfecf96Smrg#endif /* _rep_h */
370