Home | History | Annotate | Line # | Download | only in re
      1 /*
      2  * Copyright (c) 2002 by The XFree86 Project, Inc.
      3  *
      4  * Permission is hereby granted, free of charge, to any person obtaining a
      5  * copy of this software and associated documentation files (the "Software"),
      6  * to deal in the Software without restriction, including without limitation
      7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8  * and/or sell copies of the Software, and to permit persons to whom the
      9  * Software is furnished to do so, subject to the following conditions:
     10  *
     11  * The above copyright notice and this permission notice shall be included in
     12  * all copies or substantial portions of the Software.
     13  *
     14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     17  * THE XFREE86 PROJECT BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
     18  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
     19  * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
     20  * SOFTWARE.
     21  *
     22  * Except as contained in this notice, the name of the XFree86 Project shall
     23  * not be used in advertising or otherwise to promote the sale, use or other
     24  * dealings in this Software without prior written authorization from the
     25  * XFree86 Project.
     26  *
     27  * Author: Paulo Csar Pereira de Andrade
     28  */
     29 
     30 /* $XFree86: xc/programs/xedit/lisp/re/rep.h,v 1.2 2002/11/15 07:01:33 paulo Exp $ */
     31 
     32 #include "re.h"
     33 
     34 #ifndef _rep_h
     35 #define _rep_h
     36 
     37 /*
     38  * Local defines
     39  */
     40 
     41 #ifdef MIN
     42 #undef MIN
     43 #endif
     44 #define MIN(a, b)	((a) < (b) ? (a) : (b))
     45 
     46 #ifdef MAX
     47 #undef MAX
     48 #endif
     49 #define MAX(a, b)	((a) > (b) ? (a) : (b))
     50 
     51 /*  This value can not be larger than 255, a depth value is the nesting of
     52  * repetition operations and alternatives. The number of nested parenthesis
     53  * does not matter, but a repetition on the pattern inside the parenthesis
     54  * does. Note also that you cannot have more than 9 parenthesis pairs in
     55  * an expression.
     56  *  Depth is always at least 1. So for MAX_DEPTH 8, it is only allowed
     57  * 7 complex repetitions. A complex repetition is a dot followed by an
     58  * repetition operator. It is called a complex repetition because dot
     59  * matches anything but the empty string, so the engine needs to test
     60  * all possible combinations until the end of the string is found.
     61  *  Repetitions like .* use one depth until the end of the string is found,
     62  * for example a.*b.*c.*d has depth 4, while a*b*c*d has depth 2.
     63  */
     64 #define MAX_DEPTH	8
     65 
     66 /*  Minimum number of strings to generate a "large" string list, that is,
     67  * sort the strings and allocate 512 extra bytes to map the first string
     68  * with a given initial byte. */
     69 #define LARGE_STL_COUNT	16
     70 
     71 /*
     72  * Local types
     73  */
     74 /* Intermediate compilation types declaration */
     75 	/* (r)egular (e)xpression (c)ompile (c)a(se) */
     76 typedef struct _rec_cse rec_cse;
     77 
     78 	/* (r)egular (e)xpression (c)ompile (r)a(ng)e */
     79 typedef struct _rec_rng rec_rng;
     80 
     81 	/* (r)egular (e)xpression (c)ompile (pat)tern */
     82 typedef struct _rec_pat rec_pat;
     83 
     84 	/* (r)egular (e)xpression (c)ompile (rep)etition */
     85 typedef struct _rec_rep rec_rep;
     86 
     87 	/* (r)egular (e)xpression (c)ompile (gr)ou(p) */
     88 typedef struct _rec_grp rec_grp;
     89 
     90 	/* (r)egular (e)xpression (c)ompile (alt)ernatives */
     91 typedef struct _rec_alt rec_alt;
     92 
     93 
     94 /* Optimization types */
     95 	/* (r)egular (e)xpression (c)ompile (st)ring (l)ist */
     96 typedef struct _rec_stl rec_stl;
     97 
     98 /* Final compilation and execution types */
     99 	/* (re)gular expression (inf)ormation */
    100 typedef struct _re_inf re_inf;
    101 
    102 	/* (re)gular expression (eng)ine */
    103 typedef struct _re_eng re_eng;
    104 
    105 
    106 /* Codes used by the engine */
    107 typedef enum {
    108     /* Grouping */
    109     Re_Open,			/* ( */
    110     Re_Close,			/* ) */
    111     Re_Update,			/* Like Re_Close, but is inside a loop */
    112 
    113     /* Alternatives */
    114     Re_Alt,			/* Start alternative list, + next offset */
    115     Re_AltNext,			/* Next alternative, + next offset */
    116     Re_AltDone,			/* Finish alternative list */
    117 
    118     /* Repetition */
    119     Re_AnyTimes,		/* * */
    120     Re_Maybe,			/* ? */
    121     Re_AtLeast,			/* +, at least one */
    122 
    123     /* Repetition like */
    124     Re_AnyAnyTimes,		/* .*<re> */
    125     Re_AnyMaybe,		/* .?<re> */
    126     Re_AnyAtLeast,		/* .+<re> */
    127 
    128     Re_AnyEatAnyTimes,		/* Expression ends with .* */
    129     Re_AnyEatMaybe,		/* Expression ends with .? */
    130     Re_AnyEatAtLeast,		/* Expression ends with .+ */
    131 
    132     /* Repetition with arguments */
    133     Re_Exact,			/* {e} */
    134     Re_Min,			/* {n,} */
    135     Re_Max,			/* {,m} */
    136     Re_MinMax,			/* {n,m} */
    137 
    138     /* Repetition helper instruction */
    139     Re_RepJump,			/* Special code, go back to repetition */
    140     Re_RepLongJump,		/* Jump needs two bytes */
    141 	/*  After the repetition data, all repetitions have an offset
    142 	 * to the code after the repetition */
    143 
    144     /* Matching */
    145     Re_Any,			/* . */
    146     Re_Odigit,			/* \o */
    147     Re_OdigitNot,		/* \O */
    148     Re_Digit,			/* \d */
    149     Re_DigitNot,		/* \D */
    150     Re_Xdigit,			/* \x */
    151     Re_XdigitNot,		/* \x */
    152     Re_Space,			/* \s */
    153     Re_SpaceNot,		/* \S */
    154     Re_Tab,			/* \t */
    155     Re_Newline,			/* \n */
    156     Re_Lower,			/* \l */
    157     Re_Upper,			/* \u */
    158     Re_Alnum,			/* \w */
    159     Re_AlnumNot,		/* \W */
    160     Re_Control,			/* \c */
    161     Re_ControlNot,		/* \C */
    162     Re_Bol,			/* ^ */
    163     Re_Eol,			/* $ */
    164     Re_Bow,			/* \< */
    165     Re_Eow,			/* \> */
    166 
    167     /* Range matching information */
    168     Re_Range,			/* + 256 bytes */
    169     Re_RangeNot,		/* + 256 bytes */
    170 
    171     /* Matching with arguments */
    172     Re_Literal,			/* + character */
    173     Re_CaseLiteral,		/* + lower + upper */
    174     Re_LiteralNot,		/* + character */
    175     Re_CaseLiteralNot,		/* + lower + upper */
    176     Re_String,			/* + length + string */
    177     Re_CaseString,		/* + length + string in format lower-upper */
    178 
    179     /* These are useful to start matching, or when RE_NOSPEC is used. */
    180     Re_SearchLiteral,
    181     Re_SearchCaseLiteral,
    182     Re_SearchString,
    183     Re_SearchCaseString,
    184 
    185     Re_StringList,		/* + total-length + lengths + strings */
    186     Re_CaseStringList,		/* + total-length + lengths + strings */
    187 
    188     Re_LargeStringList,		/* + total-length + lengths + map + strings */
    189     Re_LargeCaseStringList,	/* + total-length + lengths + map + strings */
    190 
    191     /* Backreference */
    192     Re_Backref,			/* + reference number */
    193 
    194     /* The last codes */
    195     Re_DoneIf,			/* Done if at end of input */
    196     Re_MaybeDone,		/* Done */
    197     Re_Done			/* If this code found, finished execution */
    198 } ReCode;
    199 
    200 
    201 /* (r)egular (e)xpresssion (pat)rern (t)ype */
    202 typedef enum _rec_pat_t {
    203     Rep_Literal			= Re_Literal,
    204     Rep_CaseLiteral		= Re_CaseLiteral,
    205     Rep_LiteralNot		= Re_LiteralNot,
    206     Rep_CaseLiteralNot		= Re_CaseLiteralNot,
    207     Rep_Range			= Re_Range,
    208     Rep_RangeNot		= Re_RangeNot,
    209     Rep_String			= Re_String,
    210     Rep_CaseString		= Re_CaseString,
    211     Rep_SearchLiteral		= Re_SearchLiteral,
    212     Rep_SearchCaseLiteral	= Re_SearchCaseLiteral,
    213     Rep_SearchString		= Re_SearchString,
    214     Rep_SearchCaseString	= Re_SearchCaseString,
    215     Rep_Any			= Re_Any,
    216     Rep_AnyAnyTimes		= Re_AnyAnyTimes,
    217     Rep_AnyEatAnyTimes		= Re_AnyEatAnyTimes,
    218     Rep_AnyMaybe		= Re_AnyMaybe,
    219     Rep_AnyEatMaybe		= Re_AnyEatMaybe,
    220     Rep_AnyAtLeast		= Re_AnyAtLeast,
    221     Rep_AnyEatAtLeast		= Re_AnyEatAtLeast,
    222     Rep_Odigit			= Re_Odigit,
    223     Rep_OdigitNot		= Re_OdigitNot,
    224     Rep_Digit			= Re_Digit,
    225     Rep_DigitNot		= Re_DigitNot,
    226     Rep_Xdigit			= Re_Xdigit,
    227     Rep_XdigitNot		= Re_XdigitNot,
    228     Rep_Space			= Re_Space,
    229     Rep_SpaceNot		= Re_SpaceNot,
    230     Rep_Tab			= Re_Tab,
    231     Rep_Newline			= Re_Newline,
    232     Rep_Lower			= Re_Lower,
    233     Rep_Upper			= Re_Upper,
    234     Rep_Alnum			= Re_Alnum,
    235     Rep_AlnumNot		= Re_AlnumNot,
    236     Rep_Control			= Re_Control,
    237     Rep_ControlNot		= Re_ControlNot,
    238     Rep_Bol			= Re_Bol,
    239     Rep_Eol			= Re_Eol,
    240     Rep_Bow			= Re_Bow,
    241     Rep_Eow			= Re_Eow,
    242     Rep_Backref			= Re_Backref,
    243     Rep_StringList		= Re_StringList,
    244     Rep_Group			= Re_Open
    245 } rec_pat_t;
    246 
    247 
    248 /* (r)egular (e)xpression (rep)etition (t)ype */
    249 typedef enum _rec_rep_t {
    250     Rer_AnyTimes		= Re_AnyTimes,
    251     Rer_AtLeast			= Re_AtLeast,
    252     Rer_Maybe			= Re_Maybe,
    253     Rer_Exact			= Re_Exact,
    254     Rer_Min			= Re_Min,
    255     Rer_Max			= Re_Max,
    256     Rer_MinMax			= Re_MinMax
    257 } rec_rep_t;
    258 
    259 
    260 /*  Decide at re compilation time what is lowercase and what is uppercase */
    261 struct _rec_cse {
    262     unsigned char lower;
    263     unsigned char upper;
    264 };
    265 
    266 
    267 /*  A rec_rng is used only during compilation, just a character map */
    268 struct _rec_rng {
    269     unsigned char range[256];
    270 };
    271 
    272 
    273 /*  A rec_pat is used only during compilation, and can be viewed as
    274  * a regular expression element like a match to any character, a match
    275  * to the beginning or end of the line, etc.
    276  *  It is implemented as a linked list, and does not have nesting.
    277  *  The data field can contain:
    278  *	chr:	the value of a single character to match.
    279  *	cse:	the upper and lower case value of a character to match.
    280  *	rng:	a character map to match or not match.
    281  *	str:	a simple string or a string where every two bytes
    282  *		represents the character to match, in lower/upper
    283  *		case sequence.
    284  *  The rep field is not used for strings, strings are broken in the
    285  * last character in this case. That is, strings are just a concatenation
    286  * of several character matches.
    287  */
    288 struct _rec_pat {
    289     rec_pat_t type;
    290     rec_pat *next, *prev;	/* Linked list information */
    291     union {
    292 	unsigned char chr;
    293 	rec_cse cse;
    294 	rec_rng *rng;
    295 	rec_grp *grp;
    296 	unsigned char *str;
    297 	rec_stl *stl;
    298     } data;
    299     rec_rep *rep;		/* Pattern repetition information */
    300 };
    301 
    302 
    303 /*  A rec_rep is used only during compilation, and can be viewed as:
    304  *
    305  *	? or * or + or {<e>} or {<m>,} or {,<M>} or {<m>,<M>}
    306  *
    307  * where <e> is "exact", <m> is "minimum" and <M> is "maximum".
    308  *  In the compiled step it can also be just a NULL pointer, that
    309  * is actually equivalent to {1}.
    310  */
    311 struct _rec_rep {
    312     rec_rep_t type;
    313     short mine;			/* minimum or exact number of matches */
    314     short maxc;			/* maximum number of matches */
    315 };
    316 
    317 
    318 /*  A rec_alt is used only during compilation, and can be viewed as:
    319  *
    320  *	<re>|<re>
    321  *
    322  * where <re> is any regular expression. The expressions are nested
    323  * using the grp field of the rec_pat structure.
    324  */
    325 struct _rec_alt {
    326     rec_alt *next, *prev;	/* Linked list information */
    327     rec_pat *pat;
    328 };
    329 
    330 
    331 /*  A rec_grp is a place holder for expressions enclosed in parenthesis
    332  * and is linked to the compilation data by an rec_pat structure. */
    333 struct _rec_grp {
    334     rec_pat *parent;		/* Reference to parent pattern */
    335     rec_alt *alt;		/* The pattern information */
    336     rec_alt *palt;		/* Parent alternative */
    337     rec_grp *pgrp;		/* Nested groups */
    338     int comp;			/* (comp)lex repetition pattern inside group */
    339 };
    340 
    341 
    342 /* Optimization compilation types definition */
    343 	/* (r)egular (e)xpression (c)ompile (st)ring (l)ist (t)ype */
    344 typedef enum {
    345     Resl_StringList		= Re_StringList,
    346     Resl_CaseStringList		= Re_CaseStringList
    347 } rec_stl_t;
    348 
    349 struct _rec_stl {
    350     rec_stl_t type;
    351     int nstrs;			/* Number of strings in list */
    352     int tlen;			/* Total length of all strings */
    353     unsigned char *lens;	/* Vector of string lengths */
    354     unsigned char **strs;	/* The strings */
    355 };
    356 
    357 
    358 /*
    359  * Prototypes
    360  */
    361 	/* rep.c */
    362 rec_alt *irec_comp(const char*, const char*, int, int*);
    363 void irec_free_alt(rec_alt*);
    364 
    365 	/* reo.c */
    366 int orec_comp(rec_alt*, int);
    367 void orec_free_stl(rec_stl*);
    368 
    369 #endif /* _rep_h */
    370