rep.h revision 5dfecf96
1/*
2 * Copyright (c) 2002 by The XFree86 Project, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17 * THE XFREE86 PROJECT BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
18 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
19 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20 * SOFTWARE.
21 *
22 * Except as contained in this notice, the name of the XFree86 Project shall
23 * not be used in advertising or otherwise to promote the sale, use or other
24 * dealings in this Software without prior written authorization from the
25 * XFree86 Project.
26 *
27 * Author: Paulo César Pereira de Andrade
28 */
29
30/* $XFree86: xc/programs/xedit/lisp/re/rep.h,v 1.2 2002/11/15 07:01:33 paulo Exp $ */
31
32#include "re.h"
33
34#ifndef _rep_h
35#define _rep_h
36
37/*
38 * Local defines
39 */
40
41#ifdef MIN
42#undef MIN
43#endif
44#define MIN(a, b)	((a) < (b) ? (a) : (b))
45
46#ifdef MAX
47#undef MAX
48#endif
49#define MAX(a, b)	((a) > (b) ? (a) : (b))
50
51/*  This value can not be larger than 255, a depth value is the nesting of
52 * repetition operations and alternatives. The number of nested parenthesis
53 * does not matter, but a repetition on the pattern inside the parenthesis
54 * does. Note also that you cannot have more than 9 parenthesis pairs in
55 * an expression.
56 *  Depth is always at least 1. So for MAX_DEPTH 8, it is only allowed
57 * 7 complex repetitions. A complex repetition is a dot followed by an
58 * repetition operator. It is called a complex repetition because dot
59 * matches anything but the empty string, so the engine needs to test
60 * all possible combinations until the end of the string is found.
61 *  Repetitions like .* use one depth until the end of the string is found,
62 * for example a.*b.*c.*d has depth 4, while a*b*c*d has depth 2.
63 */
64#define MAX_DEPTH	8
65
66/*  Minimum number of strings to generate a "large" string list, that is,
67 * sort the strings and allocate 512 extra bytes to map the first string
68 * with a given initial byte. */
69#define LARGE_STL_COUNT	16
70
71/*
72 * Local types
73 */
74/* Intermediate compilation types declaration */
75	/* (r)egular (e)xpression (c)ompile (c)a(se) */
76typedef struct _rec_cse rec_cse;
77
78	/* (r)egular (e)xpression (c)ompile (r)a(ng)e */
79typedef struct _rec_rng rec_rng;
80
81	/* (r)egular (e)xpression (c)ompile (pat)tern */
82typedef struct _rec_pat rec_pat;
83
84	/* (r)egular (e)xpression (c)ompile (rep)etition */
85typedef struct _rec_rep rec_rep;
86
87	/* (r)egular (e)xpression (c)ompile (gr)ou(p) */
88typedef struct _rec_grp rec_grp;
89
90	/* (r)egular (e)xpression (c)ompile (alt)ernatives */
91typedef struct _rec_alt rec_alt;
92
93
94/* Optimization types */
95	/* (r)egular (e)xpression (c)ompile (st)ring (l)ist */
96typedef struct _rec_stl rec_stl;
97
98/* Final compilation and execution types */
99	/* (re)gular expression (inf)ormation */
100typedef struct _re_inf re_inf;
101
102	/* (re)gular expression (eng)ine */
103typedef struct _re_eng re_eng;
104
105
106/* Codes used by the engine */
107typedef enum {
108    /* Grouping */
109    Re_Open,			/* ( */
110    Re_Close,			/* ) */
111    Re_Update,			/* Like Re_Close, but is inside a loop */
112
113    /* Alternatives */
114    Re_Alt,			/* Start alternative list, + next offset */
115    Re_AltNext,			/* Next alternative, + next offset */
116    Re_AltDone,			/* Finish alternative list */
117
118    /* Repetition */
119    Re_AnyTimes,		/* * */
120    Re_Maybe,			/* ? */
121    Re_AtLeast,			/* +, at least one */
122
123    /* Repetition like */
124    Re_AnyAnyTimes,		/* .*<re> */
125    Re_AnyMaybe,		/* .?<re> */
126    Re_AnyAtLeast,		/* .+<re> */
127
128    Re_AnyEatAnyTimes,		/* Expression ends with .* */
129    Re_AnyEatMaybe,		/* Expression ends with .? */
130    Re_AnyEatAtLeast,		/* Expression ends with .+ */
131
132    /* Repetition with arguments */
133    Re_Exact,			/* {e} */
134    Re_Min,			/* {n,} */
135    Re_Max,			/* {,m} */
136    Re_MinMax,			/* {n,m} */
137
138    /* Repetition helper instruction */
139    Re_RepJump,			/* Special code, go back to repetition */
140    Re_RepLongJump,		/* Jump needs two bytes */
141	/*  After the repetition data, all repetitions have an offset
142	 * to the code after the repetition */
143
144    /* Matching */
145    Re_Any,			/* . */
146    Re_Odigit,			/* \o */
147    Re_OdigitNot,		/* \O */
148    Re_Digit,			/* \d */
149    Re_DigitNot,		/* \D */
150    Re_Xdigit,			/* \x */
151    Re_XdigitNot,		/* \x */
152    Re_Space,			/* \s */
153    Re_SpaceNot,		/* \S */
154    Re_Tab,			/* \t */
155    Re_Newline,			/* \n */
156    Re_Lower,			/* \l */
157    Re_Upper,			/* \u */
158    Re_Alnum,			/* \w */
159    Re_AlnumNot,		/* \W */
160    Re_Control,			/* \c */
161    Re_ControlNot,		/* \C */
162    Re_Bol,			/* ^ */
163    Re_Eol,			/* $ */
164    Re_Bow,			/* \< */
165    Re_Eow,			/* \> */
166
167    /* Range matching information */
168    Re_Range,			/* + 256 bytes */
169    Re_RangeNot,		/* + 256 bytes */
170
171    /* Matching with arguments */
172    Re_Literal,			/* + character */
173    Re_CaseLiteral,		/* + lower + upper */
174    Re_LiteralNot,		/* + character */
175    Re_CaseLiteralNot,		/* + lower + upper */
176    Re_String,			/* + length + string */
177    Re_CaseString,		/* + length + string in format lower-upper */
178
179    /* These are useful to start matching, or when RE_NOSPEC is used. */
180    Re_SearchLiteral,
181    Re_SearchCaseLiteral,
182    Re_SearchString,
183    Re_SearchCaseString,
184
185    Re_StringList,		/* + total-length + lengths + strings */
186    Re_CaseStringList,		/* + total-length + lengths + strings */
187
188    Re_LargeStringList,		/* + total-length + lengths + map + strings */
189    Re_LargeCaseStringList,	/* + total-length + lengths + map + strings */
190
191    /* Backreference */
192    Re_Backref,			/* + reference number */
193
194    /* The last codes */
195    Re_DoneIf,			/* Done if at end of input */
196    Re_MaybeDone,		/* Done */
197    Re_Done			/* If this code found, finished execution */
198} ReCode;
199
200
201/* (r)egular (e)xpresssion (pat)rern (t)ype */
202typedef enum _rec_pat_t {
203    Rep_Literal			= Re_Literal,
204    Rep_CaseLiteral		= Re_CaseLiteral,
205    Rep_LiteralNot		= Re_LiteralNot,
206    Rep_CaseLiteralNot		= Re_CaseLiteralNot,
207    Rep_Range			= Re_Range,
208    Rep_RangeNot		= Re_RangeNot,
209    Rep_String			= Re_String,
210    Rep_CaseString		= Re_CaseString,
211    Rep_SearchLiteral		= Re_SearchLiteral,
212    Rep_SearchCaseLiteral	= Re_SearchCaseLiteral,
213    Rep_SearchString		= Re_SearchString,
214    Rep_SearchCaseString	= Re_SearchCaseString,
215    Rep_Any			= Re_Any,
216    Rep_AnyAnyTimes		= Re_AnyAnyTimes,
217    Rep_AnyEatAnyTimes		= Re_AnyEatAnyTimes,
218    Rep_AnyMaybe		= Re_AnyMaybe,
219    Rep_AnyEatMaybe		= Re_AnyEatMaybe,
220    Rep_AnyAtLeast		= Re_AnyAtLeast,
221    Rep_AnyEatAtLeast		= Re_AnyEatAtLeast,
222    Rep_Odigit			= Re_Odigit,
223    Rep_OdigitNot		= Re_OdigitNot,
224    Rep_Digit			= Re_Digit,
225    Rep_DigitNot		= Re_DigitNot,
226    Rep_Xdigit			= Re_Xdigit,
227    Rep_XdigitNot		= Re_XdigitNot,
228    Rep_Space			= Re_Space,
229    Rep_SpaceNot		= Re_SpaceNot,
230    Rep_Tab			= Re_Tab,
231    Rep_Newline			= Re_Newline,
232    Rep_Lower			= Re_Lower,
233    Rep_Upper			= Re_Upper,
234    Rep_Alnum			= Re_Alnum,
235    Rep_AlnumNot		= Re_AlnumNot,
236    Rep_Control			= Re_Control,
237    Rep_ControlNot		= Re_ControlNot,
238    Rep_Bol			= Re_Bol,
239    Rep_Eol			= Re_Eol,
240    Rep_Bow			= Re_Bow,
241    Rep_Eow			= Re_Eow,
242    Rep_Backref			= Re_Backref,
243    Rep_StringList		= Re_StringList,
244    Rep_Group			= Re_Open
245} rec_pat_t;
246
247
248/* (r)egular (e)xpression (rep)etition (t)ype */
249typedef enum _rec_rep_t {
250    Rer_AnyTimes		= Re_AnyTimes,
251    Rer_AtLeast			= Re_AtLeast,
252    Rer_Maybe			= Re_Maybe,
253    Rer_Exact			= Re_Exact,
254    Rer_Min			= Re_Min,
255    Rer_Max			= Re_Max,
256    Rer_MinMax			= Re_MinMax
257} rec_rep_t;
258
259
260/*  Decide at re compilation time what is lowercase and what is uppercase */
261struct _rec_cse {
262    unsigned char lower;
263    unsigned char upper;
264};
265
266
267/*  A rec_rng is used only during compilation, just a character map */
268struct _rec_rng {
269    unsigned char range[256];
270};
271
272
273/*  A rec_pat is used only during compilation, and can be viewed as
274 * a regular expression element like a match to any character, a match
275 * to the beginning or end of the line, etc.
276 *  It is implemented as a linked list, and does not have nesting.
277 *  The data field can contain:
278 *	chr:	the value of a single character to match.
279 *	cse:	the upper and lower case value of a character to match.
280 *	rng:	a character map to match or not match.
281 *	str:	a simple string or a string where every two bytes
282 *		represents the character to match, in lower/upper
283 *		case sequence.
284 *  The rep field is not used for strings, strings are broken in the
285 * last character in this case. That is, strings are just a concatenation
286 * of several character matches.
287 */
288struct _rec_pat {
289    rec_pat_t type;
290    rec_pat *next, *prev;	/* Linked list information */
291    union {
292	unsigned char chr;
293	rec_cse cse;
294	rec_rng *rng;
295	rec_grp *grp;
296	unsigned char *str;
297	rec_stl *stl;
298    } data;
299    rec_rep *rep;		/* Pattern repetition information */
300};
301
302
303/*  A rec_rep is used only during compilation, and can be viewed as:
304 *
305 *	? or * or + or {<e>} or {<m>,} or {,<M>} or {<m>,<M>}
306 *
307 * where <e> is "exact", <m> is "minimum" and <M> is "maximum".
308 *  In the compiled step it can also be just a NULL pointer, that
309 * is actually equivalent to {1}.
310 */
311struct _rec_rep {
312    rec_rep_t type;
313    short mine;			/* minimum or exact number of matches */
314    short maxc;			/* maximum number of matches */
315};
316
317
318/*  A rec_alt is used only during compilation, and can be viewed as:
319 *
320 *	<re>|<re>
321 *
322 * where <re> is any regular expression. The expressions are nested
323 * using the grp field of the rec_pat structure.
324 */
325struct _rec_alt {
326    rec_alt *next, *prev;	/* Linked list information */
327    rec_pat *pat;
328};
329
330
331/*  A rec_grp is a place holder for expressions enclosed in parenthesis
332 * and is linked to the compilation data by an rec_pat structure. */
333struct _rec_grp {
334    rec_pat *parent;		/* Reference to parent pattern */
335    rec_alt *alt;		/* The pattern information */
336    rec_alt *palt;		/* Parent alternative */
337    rec_grp *pgrp;		/* Nested groups */
338    int comp;			/* (comp)lex repetition pattern inside group */
339};
340
341
342/* Optimization compilation types definition */
343	/* (r)egular (e)xpression (c)ompile (st)ring (l)ist (t)ype */
344typedef enum {
345    Resl_StringList		= Re_StringList,
346    Resl_CaseStringList		= Re_CaseStringList
347} rec_stl_t;
348
349struct _rec_stl {
350    rec_stl_t type;
351    int nstrs;			/* Number of strings in list */
352    int tlen;			/* Total length of all strings */
353    unsigned char *lens;	/* Vector of string lengths */
354    unsigned char **strs;	/* The strings */
355};
356
357
358/*
359 * Prototypes
360 */
361	/* rep.c */
362rec_alt *irec_comp(const char*, const char*, int, int*);
363void irec_free_alt(rec_alt*);
364
365	/* reo.c */
366int orec_comp(rec_alt*, int);
367void orec_free_stl(rec_stl*);
368
369#endif /* _rep_h */
370