15dfecf96Smrg/* 25dfecf96Smrg * Copyright (c) 2002 by The XFree86 Project, Inc. 35dfecf96Smrg * 45dfecf96Smrg * Permission is hereby granted, free of charge, to any person obtaining a 55dfecf96Smrg * copy of this software and associated documentation files (the "Software"), 65dfecf96Smrg * to deal in the Software without restriction, including without limitation 75dfecf96Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense, 85dfecf96Smrg * and/or sell copies of the Software, and to permit persons to whom the 95dfecf96Smrg * Software is furnished to do so, subject to the following conditions: 105dfecf96Smrg * 115dfecf96Smrg * The above copyright notice and this permission notice shall be included in 125dfecf96Smrg * all copies or substantial portions of the Software. 135dfecf96Smrg * 145dfecf96Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 155dfecf96Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 165dfecf96Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 175dfecf96Smrg * THE XFREE86 PROJECT BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 185dfecf96Smrg * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF 195dfecf96Smrg * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 205dfecf96Smrg * SOFTWARE. 215dfecf96Smrg * 225dfecf96Smrg * Except as contained in this notice, the name of the XFree86 Project shall 235dfecf96Smrg * not be used in advertising or otherwise to promote the sale, use or other 245dfecf96Smrg * dealings in this Software without prior written authorization from the 255dfecf96Smrg * XFree86 Project. 265dfecf96Smrg * 275dfecf96Smrg * Author: Paulo César Pereira de Andrade 285dfecf96Smrg */ 295dfecf96Smrg 305dfecf96Smrg/* $XFree86: xc/programs/xedit/lisp/re/rep.h,v 1.2 2002/11/15 07:01:33 paulo Exp $ */ 315dfecf96Smrg 325dfecf96Smrg#include "re.h" 335dfecf96Smrg 345dfecf96Smrg#ifndef _rep_h 355dfecf96Smrg#define _rep_h 365dfecf96Smrg 375dfecf96Smrg/* 385dfecf96Smrg * Local defines 395dfecf96Smrg */ 405dfecf96Smrg 415dfecf96Smrg#ifdef MIN 425dfecf96Smrg#undef MIN 435dfecf96Smrg#endif 445dfecf96Smrg#define MIN(a, b) ((a) < (b) ? (a) : (b)) 455dfecf96Smrg 465dfecf96Smrg#ifdef MAX 475dfecf96Smrg#undef MAX 485dfecf96Smrg#endif 495dfecf96Smrg#define MAX(a, b) ((a) > (b) ? (a) : (b)) 505dfecf96Smrg 515dfecf96Smrg/* This value can not be larger than 255, a depth value is the nesting of 525dfecf96Smrg * repetition operations and alternatives. The number of nested parenthesis 535dfecf96Smrg * does not matter, but a repetition on the pattern inside the parenthesis 545dfecf96Smrg * does. Note also that you cannot have more than 9 parenthesis pairs in 555dfecf96Smrg * an expression. 565dfecf96Smrg * Depth is always at least 1. So for MAX_DEPTH 8, it is only allowed 575dfecf96Smrg * 7 complex repetitions. A complex repetition is a dot followed by an 585dfecf96Smrg * repetition operator. It is called a complex repetition because dot 595dfecf96Smrg * matches anything but the empty string, so the engine needs to test 605dfecf96Smrg * all possible combinations until the end of the string is found. 615dfecf96Smrg * Repetitions like .* use one depth until the end of the string is found, 625dfecf96Smrg * for example a.*b.*c.*d has depth 4, while a*b*c*d has depth 2. 635dfecf96Smrg */ 645dfecf96Smrg#define MAX_DEPTH 8 655dfecf96Smrg 665dfecf96Smrg/* Minimum number of strings to generate a "large" string list, that is, 675dfecf96Smrg * sort the strings and allocate 512 extra bytes to map the first string 685dfecf96Smrg * with a given initial byte. */ 695dfecf96Smrg#define LARGE_STL_COUNT 16 705dfecf96Smrg 715dfecf96Smrg/* 725dfecf96Smrg * Local types 735dfecf96Smrg */ 745dfecf96Smrg/* Intermediate compilation types declaration */ 755dfecf96Smrg /* (r)egular (e)xpression (c)ompile (c)a(se) */ 765dfecf96Smrgtypedef struct _rec_cse rec_cse; 775dfecf96Smrg 785dfecf96Smrg /* (r)egular (e)xpression (c)ompile (r)a(ng)e */ 795dfecf96Smrgtypedef struct _rec_rng rec_rng; 805dfecf96Smrg 815dfecf96Smrg /* (r)egular (e)xpression (c)ompile (pat)tern */ 825dfecf96Smrgtypedef struct _rec_pat rec_pat; 835dfecf96Smrg 845dfecf96Smrg /* (r)egular (e)xpression (c)ompile (rep)etition */ 855dfecf96Smrgtypedef struct _rec_rep rec_rep; 865dfecf96Smrg 875dfecf96Smrg /* (r)egular (e)xpression (c)ompile (gr)ou(p) */ 885dfecf96Smrgtypedef struct _rec_grp rec_grp; 895dfecf96Smrg 905dfecf96Smrg /* (r)egular (e)xpression (c)ompile (alt)ernatives */ 915dfecf96Smrgtypedef struct _rec_alt rec_alt; 925dfecf96Smrg 935dfecf96Smrg 945dfecf96Smrg/* Optimization types */ 955dfecf96Smrg /* (r)egular (e)xpression (c)ompile (st)ring (l)ist */ 965dfecf96Smrgtypedef struct _rec_stl rec_stl; 975dfecf96Smrg 985dfecf96Smrg/* Final compilation and execution types */ 995dfecf96Smrg /* (re)gular expression (inf)ormation */ 1005dfecf96Smrgtypedef struct _re_inf re_inf; 1015dfecf96Smrg 1025dfecf96Smrg /* (re)gular expression (eng)ine */ 1035dfecf96Smrgtypedef struct _re_eng re_eng; 1045dfecf96Smrg 1055dfecf96Smrg 1065dfecf96Smrg/* Codes used by the engine */ 1075dfecf96Smrgtypedef enum { 1085dfecf96Smrg /* Grouping */ 1095dfecf96Smrg Re_Open, /* ( */ 1105dfecf96Smrg Re_Close, /* ) */ 1115dfecf96Smrg Re_Update, /* Like Re_Close, but is inside a loop */ 1125dfecf96Smrg 1135dfecf96Smrg /* Alternatives */ 1145dfecf96Smrg Re_Alt, /* Start alternative list, + next offset */ 1155dfecf96Smrg Re_AltNext, /* Next alternative, + next offset */ 1165dfecf96Smrg Re_AltDone, /* Finish alternative list */ 1175dfecf96Smrg 1185dfecf96Smrg /* Repetition */ 1195dfecf96Smrg Re_AnyTimes, /* * */ 1205dfecf96Smrg Re_Maybe, /* ? */ 1215dfecf96Smrg Re_AtLeast, /* +, at least one */ 1225dfecf96Smrg 1235dfecf96Smrg /* Repetition like */ 1245dfecf96Smrg Re_AnyAnyTimes, /* .*<re> */ 1255dfecf96Smrg Re_AnyMaybe, /* .?<re> */ 1265dfecf96Smrg Re_AnyAtLeast, /* .+<re> */ 1275dfecf96Smrg 1285dfecf96Smrg Re_AnyEatAnyTimes, /* Expression ends with .* */ 1295dfecf96Smrg Re_AnyEatMaybe, /* Expression ends with .? */ 1305dfecf96Smrg Re_AnyEatAtLeast, /* Expression ends with .+ */ 1315dfecf96Smrg 1325dfecf96Smrg /* Repetition with arguments */ 1335dfecf96Smrg Re_Exact, /* {e} */ 1345dfecf96Smrg Re_Min, /* {n,} */ 1355dfecf96Smrg Re_Max, /* {,m} */ 1365dfecf96Smrg Re_MinMax, /* {n,m} */ 1375dfecf96Smrg 1385dfecf96Smrg /* Repetition helper instruction */ 1395dfecf96Smrg Re_RepJump, /* Special code, go back to repetition */ 1405dfecf96Smrg Re_RepLongJump, /* Jump needs two bytes */ 1415dfecf96Smrg /* After the repetition data, all repetitions have an offset 1425dfecf96Smrg * to the code after the repetition */ 1435dfecf96Smrg 1445dfecf96Smrg /* Matching */ 1455dfecf96Smrg Re_Any, /* . */ 1465dfecf96Smrg Re_Odigit, /* \o */ 1475dfecf96Smrg Re_OdigitNot, /* \O */ 1485dfecf96Smrg Re_Digit, /* \d */ 1495dfecf96Smrg Re_DigitNot, /* \D */ 1505dfecf96Smrg Re_Xdigit, /* \x */ 1515dfecf96Smrg Re_XdigitNot, /* \x */ 1525dfecf96Smrg Re_Space, /* \s */ 1535dfecf96Smrg Re_SpaceNot, /* \S */ 1545dfecf96Smrg Re_Tab, /* \t */ 1555dfecf96Smrg Re_Newline, /* \n */ 1565dfecf96Smrg Re_Lower, /* \l */ 1575dfecf96Smrg Re_Upper, /* \u */ 1585dfecf96Smrg Re_Alnum, /* \w */ 1595dfecf96Smrg Re_AlnumNot, /* \W */ 1605dfecf96Smrg Re_Control, /* \c */ 1615dfecf96Smrg Re_ControlNot, /* \C */ 1625dfecf96Smrg Re_Bol, /* ^ */ 1635dfecf96Smrg Re_Eol, /* $ */ 1645dfecf96Smrg Re_Bow, /* \< */ 1655dfecf96Smrg Re_Eow, /* \> */ 1665dfecf96Smrg 1675dfecf96Smrg /* Range matching information */ 1685dfecf96Smrg Re_Range, /* + 256 bytes */ 1695dfecf96Smrg Re_RangeNot, /* + 256 bytes */ 1705dfecf96Smrg 1715dfecf96Smrg /* Matching with arguments */ 1725dfecf96Smrg Re_Literal, /* + character */ 1735dfecf96Smrg Re_CaseLiteral, /* + lower + upper */ 1745dfecf96Smrg Re_LiteralNot, /* + character */ 1755dfecf96Smrg Re_CaseLiteralNot, /* + lower + upper */ 1765dfecf96Smrg Re_String, /* + length + string */ 1775dfecf96Smrg Re_CaseString, /* + length + string in format lower-upper */ 1785dfecf96Smrg 1795dfecf96Smrg /* These are useful to start matching, or when RE_NOSPEC is used. */ 1805dfecf96Smrg Re_SearchLiteral, 1815dfecf96Smrg Re_SearchCaseLiteral, 1825dfecf96Smrg Re_SearchString, 1835dfecf96Smrg Re_SearchCaseString, 1845dfecf96Smrg 1855dfecf96Smrg Re_StringList, /* + total-length + lengths + strings */ 1865dfecf96Smrg Re_CaseStringList, /* + total-length + lengths + strings */ 1875dfecf96Smrg 1885dfecf96Smrg Re_LargeStringList, /* + total-length + lengths + map + strings */ 1895dfecf96Smrg Re_LargeCaseStringList, /* + total-length + lengths + map + strings */ 1905dfecf96Smrg 1915dfecf96Smrg /* Backreference */ 1925dfecf96Smrg Re_Backref, /* + reference number */ 1935dfecf96Smrg 1945dfecf96Smrg /* The last codes */ 1955dfecf96Smrg Re_DoneIf, /* Done if at end of input */ 1965dfecf96Smrg Re_MaybeDone, /* Done */ 1975dfecf96Smrg Re_Done /* If this code found, finished execution */ 1985dfecf96Smrg} ReCode; 1995dfecf96Smrg 2005dfecf96Smrg 2015dfecf96Smrg/* (r)egular (e)xpresssion (pat)rern (t)ype */ 2025dfecf96Smrgtypedef enum _rec_pat_t { 2035dfecf96Smrg Rep_Literal = Re_Literal, 2045dfecf96Smrg Rep_CaseLiteral = Re_CaseLiteral, 2055dfecf96Smrg Rep_LiteralNot = Re_LiteralNot, 2065dfecf96Smrg Rep_CaseLiteralNot = Re_CaseLiteralNot, 2075dfecf96Smrg Rep_Range = Re_Range, 2085dfecf96Smrg Rep_RangeNot = Re_RangeNot, 2095dfecf96Smrg Rep_String = Re_String, 2105dfecf96Smrg Rep_CaseString = Re_CaseString, 2115dfecf96Smrg Rep_SearchLiteral = Re_SearchLiteral, 2125dfecf96Smrg Rep_SearchCaseLiteral = Re_SearchCaseLiteral, 2135dfecf96Smrg Rep_SearchString = Re_SearchString, 2145dfecf96Smrg Rep_SearchCaseString = Re_SearchCaseString, 2155dfecf96Smrg Rep_Any = Re_Any, 2165dfecf96Smrg Rep_AnyAnyTimes = Re_AnyAnyTimes, 2175dfecf96Smrg Rep_AnyEatAnyTimes = Re_AnyEatAnyTimes, 2185dfecf96Smrg Rep_AnyMaybe = Re_AnyMaybe, 2195dfecf96Smrg Rep_AnyEatMaybe = Re_AnyEatMaybe, 2205dfecf96Smrg Rep_AnyAtLeast = Re_AnyAtLeast, 2215dfecf96Smrg Rep_AnyEatAtLeast = Re_AnyEatAtLeast, 2225dfecf96Smrg Rep_Odigit = Re_Odigit, 2235dfecf96Smrg Rep_OdigitNot = Re_OdigitNot, 2245dfecf96Smrg Rep_Digit = Re_Digit, 2255dfecf96Smrg Rep_DigitNot = Re_DigitNot, 2265dfecf96Smrg Rep_Xdigit = Re_Xdigit, 2275dfecf96Smrg Rep_XdigitNot = Re_XdigitNot, 2285dfecf96Smrg Rep_Space = Re_Space, 2295dfecf96Smrg Rep_SpaceNot = Re_SpaceNot, 2305dfecf96Smrg Rep_Tab = Re_Tab, 2315dfecf96Smrg Rep_Newline = Re_Newline, 2325dfecf96Smrg Rep_Lower = Re_Lower, 2335dfecf96Smrg Rep_Upper = Re_Upper, 2345dfecf96Smrg Rep_Alnum = Re_Alnum, 2355dfecf96Smrg Rep_AlnumNot = Re_AlnumNot, 2365dfecf96Smrg Rep_Control = Re_Control, 2375dfecf96Smrg Rep_ControlNot = Re_ControlNot, 2385dfecf96Smrg Rep_Bol = Re_Bol, 2395dfecf96Smrg Rep_Eol = Re_Eol, 2405dfecf96Smrg Rep_Bow = Re_Bow, 2415dfecf96Smrg Rep_Eow = Re_Eow, 2425dfecf96Smrg Rep_Backref = Re_Backref, 2435dfecf96Smrg Rep_StringList = Re_StringList, 2445dfecf96Smrg Rep_Group = Re_Open 2455dfecf96Smrg} rec_pat_t; 2465dfecf96Smrg 2475dfecf96Smrg 2485dfecf96Smrg/* (r)egular (e)xpression (rep)etition (t)ype */ 2495dfecf96Smrgtypedef enum _rec_rep_t { 2505dfecf96Smrg Rer_AnyTimes = Re_AnyTimes, 2515dfecf96Smrg Rer_AtLeast = Re_AtLeast, 2525dfecf96Smrg Rer_Maybe = Re_Maybe, 2535dfecf96Smrg Rer_Exact = Re_Exact, 2545dfecf96Smrg Rer_Min = Re_Min, 2555dfecf96Smrg Rer_Max = Re_Max, 2565dfecf96Smrg Rer_MinMax = Re_MinMax 2575dfecf96Smrg} rec_rep_t; 2585dfecf96Smrg 2595dfecf96Smrg 2605dfecf96Smrg/* Decide at re compilation time what is lowercase and what is uppercase */ 2615dfecf96Smrgstruct _rec_cse { 2625dfecf96Smrg unsigned char lower; 2635dfecf96Smrg unsigned char upper; 2645dfecf96Smrg}; 2655dfecf96Smrg 2665dfecf96Smrg 2675dfecf96Smrg/* A rec_rng is used only during compilation, just a character map */ 2685dfecf96Smrgstruct _rec_rng { 2695dfecf96Smrg unsigned char range[256]; 2705dfecf96Smrg}; 2715dfecf96Smrg 2725dfecf96Smrg 2735dfecf96Smrg/* A rec_pat is used only during compilation, and can be viewed as 2745dfecf96Smrg * a regular expression element like a match to any character, a match 2755dfecf96Smrg * to the beginning or end of the line, etc. 2765dfecf96Smrg * It is implemented as a linked list, and does not have nesting. 2775dfecf96Smrg * The data field can contain: 2785dfecf96Smrg * chr: the value of a single character to match. 2795dfecf96Smrg * cse: the upper and lower case value of a character to match. 2805dfecf96Smrg * rng: a character map to match or not match. 2815dfecf96Smrg * str: a simple string or a string where every two bytes 2825dfecf96Smrg * represents the character to match, in lower/upper 2835dfecf96Smrg * case sequence. 2845dfecf96Smrg * The rep field is not used for strings, strings are broken in the 2855dfecf96Smrg * last character in this case. That is, strings are just a concatenation 2865dfecf96Smrg * of several character matches. 2875dfecf96Smrg */ 2885dfecf96Smrgstruct _rec_pat { 2895dfecf96Smrg rec_pat_t type; 2905dfecf96Smrg rec_pat *next, *prev; /* Linked list information */ 2915dfecf96Smrg union { 2925dfecf96Smrg unsigned char chr; 2935dfecf96Smrg rec_cse cse; 2945dfecf96Smrg rec_rng *rng; 2955dfecf96Smrg rec_grp *grp; 2965dfecf96Smrg unsigned char *str; 2975dfecf96Smrg rec_stl *stl; 2985dfecf96Smrg } data; 2995dfecf96Smrg rec_rep *rep; /* Pattern repetition information */ 3005dfecf96Smrg}; 3015dfecf96Smrg 3025dfecf96Smrg 3035dfecf96Smrg/* A rec_rep is used only during compilation, and can be viewed as: 3045dfecf96Smrg * 3055dfecf96Smrg * ? or * or + or {<e>} or {<m>,} or {,<M>} or {<m>,<M>} 3065dfecf96Smrg * 3075dfecf96Smrg * where <e> is "exact", <m> is "minimum" and <M> is "maximum". 3085dfecf96Smrg * In the compiled step it can also be just a NULL pointer, that 3095dfecf96Smrg * is actually equivalent to {1}. 3105dfecf96Smrg */ 3115dfecf96Smrgstruct _rec_rep { 3125dfecf96Smrg rec_rep_t type; 3135dfecf96Smrg short mine; /* minimum or exact number of matches */ 3145dfecf96Smrg short maxc; /* maximum number of matches */ 3155dfecf96Smrg}; 3165dfecf96Smrg 3175dfecf96Smrg 3185dfecf96Smrg/* A rec_alt is used only during compilation, and can be viewed as: 3195dfecf96Smrg * 3205dfecf96Smrg * <re>|<re> 3215dfecf96Smrg * 3225dfecf96Smrg * where <re> is any regular expression. The expressions are nested 3235dfecf96Smrg * using the grp field of the rec_pat structure. 3245dfecf96Smrg */ 3255dfecf96Smrgstruct _rec_alt { 3265dfecf96Smrg rec_alt *next, *prev; /* Linked list information */ 3275dfecf96Smrg rec_pat *pat; 3285dfecf96Smrg}; 3295dfecf96Smrg 3305dfecf96Smrg 3315dfecf96Smrg/* A rec_grp is a place holder for expressions enclosed in parenthesis 3325dfecf96Smrg * and is linked to the compilation data by an rec_pat structure. */ 3335dfecf96Smrgstruct _rec_grp { 3345dfecf96Smrg rec_pat *parent; /* Reference to parent pattern */ 3355dfecf96Smrg rec_alt *alt; /* The pattern information */ 3365dfecf96Smrg rec_alt *palt; /* Parent alternative */ 3375dfecf96Smrg rec_grp *pgrp; /* Nested groups */ 3385dfecf96Smrg int comp; /* (comp)lex repetition pattern inside group */ 3395dfecf96Smrg}; 3405dfecf96Smrg 3415dfecf96Smrg 3425dfecf96Smrg/* Optimization compilation types definition */ 3435dfecf96Smrg /* (r)egular (e)xpression (c)ompile (st)ring (l)ist (t)ype */ 3445dfecf96Smrgtypedef enum { 3455dfecf96Smrg Resl_StringList = Re_StringList, 3465dfecf96Smrg Resl_CaseStringList = Re_CaseStringList 3475dfecf96Smrg} rec_stl_t; 3485dfecf96Smrg 3495dfecf96Smrgstruct _rec_stl { 3505dfecf96Smrg rec_stl_t type; 3515dfecf96Smrg int nstrs; /* Number of strings in list */ 3525dfecf96Smrg int tlen; /* Total length of all strings */ 3535dfecf96Smrg unsigned char *lens; /* Vector of string lengths */ 3545dfecf96Smrg unsigned char **strs; /* The strings */ 3555dfecf96Smrg}; 3565dfecf96Smrg 3575dfecf96Smrg 3585dfecf96Smrg/* 3595dfecf96Smrg * Prototypes 3605dfecf96Smrg */ 3615dfecf96Smrg /* rep.c */ 3625dfecf96Smrgrec_alt *irec_comp(const char*, const char*, int, int*); 3635dfecf96Smrgvoid irec_free_alt(rec_alt*); 3645dfecf96Smrg 3655dfecf96Smrg /* reo.c */ 3665dfecf96Smrgint orec_comp(rec_alt*, int); 3675dfecf96Smrgvoid orec_free_stl(rec_stl*); 3685dfecf96Smrg 3695dfecf96Smrg#endif /* _rep_h */ 370