regexp.c revision 1.2 1 /*
2 * Copyright (c) 1980 The Regents of the University of California.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. All advertising materials mentioning features or use of this software
14 * must display the following acknowledgement:
15 * This product includes software developed by the University of
16 * California, Berkeley and its contributors.
17 * 4. Neither the name of the University nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 */
33
34 #ifndef lint
35 /*static char sccsid[] = "from: @(#)regexp.c 5.3 (Berkeley) 6/1/90";*/
36 static char rcsid[] = "$Id: regexp.c,v 1.2 1993/08/01 18:03:29 mycroft Exp $";
37 #endif /* not lint */
38
39 #include <ctype.h>
40
41 typedef int boolean;
42 #define TRUE 1
43 #define FALSE 0
44 #define NIL 0
45
46 boolean l_onecase; /* true if upper and lower equivalent */
47
48 #define makelower(c) (isupper((c)) ? tolower((c)) : (c))
49
50 /* STRNCMP - like strncmp except that we convert the
51 * first string to lower case before comparing
52 * if l_onecase is set.
53 */
54
55 STRNCMP(s1, s2, len)
56 register char *s1,*s2;
57 register int len;
58 {
59 if (l_onecase) {
60 do
61 if (*s2 - makelower(*s1))
62 return (*s2 - makelower(*s1));
63 else {
64 s2++;
65 s1++;
66 }
67 while (--len);
68 } else {
69 do
70 if (*s2 - *s1)
71 return (*s2 - *s1);
72 else {
73 s2++;
74 s1++;
75 }
76 while (--len);
77 }
78 return(0);
79 }
80
81 /* The following routine converts an irregular expression to
82 * internal format.
83 *
84 * Either meta symbols (\a \d or \p) or character strings or
85 * operations ( alternation or perenthesizing ) can be
86 * specified. Each starts with a descriptor byte. The descriptor
87 * byte has STR set for strings, META set for meta symbols
88 * and OPER set for operations.
89 * The descriptor byte can also have the OPT bit set if the object
90 * defined is optional. Also ALT can be set to indicate an alternation.
91 *
92 * For metasymbols the byte following the descriptor byte identities
93 * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For
94 * strings the byte after the descriptor is a character count for
95 * the string:
96 *
97 * meta symbols := descriptor
98 * symbol
99 *
100 * strings := descriptor
101 * character count
102 * the string
103 *
104 * operatins := descriptor
105 * symbol
106 * character count
107 */
108
109 /*
110 * handy macros for accessing parts of match blocks
111 */
112 #define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */
113 #define MNEXT(A) (A+2) /* character following a metasymbol block */
114
115 #define OSYM(A) (*(A+1)) /* symbol in an operation block */
116 #define OCNT(A) (*(A+2)) /* character count */
117 #define ONEXT(A) (A+3) /* next character after the operation */
118 #define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */
119
120 #define SCNT(A) (*(A+1)) /* byte count of a string */
121 #define SSTR(A) (A+2) /* address of the string */
122 #define SNEXT(A) (A+2+*(A+1)) /* character following the string */
123
124 /*
125 * bit flags in the descriptor
126 */
127 #define OPT 1
128 #define STR 2
129 #define META 4
130 #define ALT 8
131 #define OPER 16
132
133 char *ure; /* pointer current position in unconverted exp */
134 char *ccre; /* pointer to current position in converted exp*/
135 char *malloc();
136
137 char *
138 convexp(re)
139 char *re; /* unconverted irregular expression */
140 {
141 register char *cre; /* pointer to converted regular expression */
142
143 /* allocate room for the converted expression */
144 if (re == NIL)
145 return (NIL);
146 if (*re == '\0')
147 return (NIL);
148 cre = malloc (4 * strlen(re) + 3);
149 ccre = cre;
150 ure = re;
151
152 /* start the conversion with a \a */
153 *cre = META | OPT;
154 MSYM(cre) = 'a';
155 ccre = MNEXT(cre);
156
157 /* start the conversion (its recursive) */
158 expconv ();
159 *ccre = 0;
160 return (cre);
161 }
162
163 expconv()
164 {
165 register char *cs; /* pointer to current symbol in converted exp */
166 register char c; /* character being processed */
167 register char *acs; /* pinter to last alternate */
168 register int temp;
169
170 /* let the conversion begin */
171 acs = NIL;
172 cs = NIL;
173 while (*ure != NIL) {
174 switch (c = *ure++) {
175
176 case '\\':
177 switch (c = *ure++) {
178
179 /* escaped characters are just characters */
180 default:
181 if (cs == NIL || (*cs & STR) == 0) {
182 cs = ccre;
183 *cs = STR;
184 SCNT(cs) = 1;
185 ccre += 2;
186 } else
187 SCNT(cs)++;
188 *ccre++ = c;
189 break;
190
191 /* normal(?) metacharacters */
192 case 'a':
193 case 'd':
194 case 'e':
195 case 'p':
196 if (acs != NIL && acs != cs) {
197 do {
198 temp = OCNT(acs);
199 OCNT(acs) = ccre - acs;
200 acs -= temp;
201 } while (temp != 0);
202 acs = NIL;
203 }
204 cs = ccre;
205 *cs = META;
206 MSYM(cs) = c;
207 ccre = MNEXT(cs);
208 break;
209 }
210 break;
211
212 /* just put the symbol in */
213 case '^':
214 case '$':
215 if (acs != NIL && acs != cs) {
216 do {
217 temp = OCNT(acs);
218 OCNT(acs) = ccre - acs;
219 acs -= temp;
220 } while (temp != 0);
221 acs = NIL;
222 }
223 cs = ccre;
224 *cs = META;
225 MSYM(cs) = c;
226 ccre = MNEXT(cs);
227 break;
228
229 /* mark the last match sequence as optional */
230 case '?':
231 if (cs)
232 *cs = *cs | OPT;
233 break;
234
235 /* recurse and define a subexpression */
236 case '(':
237 if (acs != NIL && acs != cs) {
238 do {
239 temp = OCNT(acs);
240 OCNT(acs) = ccre - acs;
241 acs -= temp;
242 } while (temp != 0);
243 acs = NIL;
244 }
245 cs = ccre;
246 *cs = OPER;
247 OSYM(cs) = '(';
248 ccre = ONEXT(cs);
249 expconv ();
250 OCNT(cs) = ccre - cs; /* offset to next symbol */
251 break;
252
253 /* return from a recursion */
254 case ')':
255 if (acs != NIL) {
256 do {
257 temp = OCNT(acs);
258 OCNT(acs) = ccre - acs;
259 acs -= temp;
260 } while (temp != 0);
261 acs = NIL;
262 }
263 cs = ccre;
264 *cs = META;
265 MSYM(cs) = c;
266 ccre = MNEXT(cs);
267 return;
268
269 /* mark the last match sequence as having an alternate */
270 /* the third byte will contain an offset to jump over the */
271 /* alternate match in case the first did not fail */
272 case '|':
273 if (acs != NIL && acs != cs)
274 OCNT(ccre) = ccre - acs; /* make a back pointer */
275 else
276 OCNT(ccre) = 0;
277 *cs |= ALT;
278 cs = ccre;
279 *cs = OPER;
280 OSYM(cs) = '|';
281 ccre = ONEXT(cs);
282 acs = cs; /* remember that the pointer is to be filles */
283 break;
284
285 /* if its not a metasymbol just build a scharacter string */
286 default:
287 if (cs == NIL || (*cs & STR) == 0) {
288 cs = ccre;
289 *cs = STR;
290 SCNT(cs) = 1;
291 ccre = SSTR(cs);
292 } else
293 SCNT(cs)++;
294 *ccre++ = c;
295 break;
296 }
297 }
298 if (acs != NIL) {
299 do {
300 temp = OCNT(acs);
301 OCNT(acs) = ccre - acs;
302 acs -= temp;
303 } while (temp != 0);
304 acs = NIL;
305 }
306 return;
307 }
308 /* end of convertre */
309
310
311 /*
312 * The following routine recognises an irregular expresion
313 * with the following special characters:
314 *
315 * \? - means last match was optional
316 * \a - matches any number of characters
317 * \d - matches any number of spaces and tabs
318 * \p - matches any number of alphanumeric
319 * characters. The
320 * characters matched will be copied into
321 * the area pointed to by 'name'.
322 * \| - alternation
323 * \( \) - grouping used mostly for alternation and
324 * optionality
325 *
326 * The irregular expression must be translated to internal form
327 * prior to calling this routine
328 *
329 * The value returned is the pointer to the first non \a
330 * character matched.
331 */
332
333 boolean _escaped; /* true if we are currently _escaped */
334 char *_start; /* start of string */
335
336 char *
337 expmatch (s, re, mstring)
338 register char *s; /* string to check for a match in */
339 register char *re; /* a converted irregular expression */
340 register char *mstring; /* where to put whatever matches a \p */
341 {
342 register char *cs; /* the current symbol */
343 register char *ptr,*s1; /* temporary pointer */
344 boolean matched; /* a temporary boolean */
345
346 /* initial conditions */
347 if (re == NIL)
348 return (NIL);
349 cs = re;
350 matched = FALSE;
351
352 /* loop till expression string is exhausted (or at least pretty tired) */
353 while (*cs) {
354 switch (*cs & (OPER | STR | META)) {
355
356 /* try to match a string */
357 case STR:
358 matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
359 if (matched) {
360
361 /* hoorah it matches */
362 s += SCNT(cs);
363 cs = SNEXT(cs);
364 } else if (*cs & ALT) {
365
366 /* alternation, skip to next expression */
367 cs = SNEXT(cs);
368 } else if (*cs & OPT) {
369
370 /* the match is optional */
371 cs = SNEXT(cs);
372 matched = 1; /* indicate a successful match */
373 } else {
374
375 /* no match, error return */
376 return (NIL);
377 }
378 break;
379
380 /* an operator, do something fancy */
381 case OPER:
382 switch (OSYM(cs)) {
383
384 /* this is an alternation */
385 case '|':
386 if (matched)
387
388 /* last thing in the alternation was a match, skip ahead */
389 cs = OPTR(cs);
390 else
391
392 /* no match, keep trying */
393 cs = ONEXT(cs);
394 break;
395
396 /* this is a grouping, recurse */
397 case '(':
398 ptr = expmatch (s, ONEXT(cs), mstring);
399 if (ptr != NIL) {
400
401 /* the subexpression matched */
402 matched = 1;
403 s = ptr;
404 } else if (*cs & ALT) {
405
406 /* alternation, skip to next expression */
407 matched = 0;
408 } else if (*cs & OPT) {
409
410 /* the match is optional */
411 matched = 1; /* indicate a successful match */
412 } else {
413
414 /* no match, error return */
415 return (NIL);
416 }
417 cs = OPTR(cs);
418 break;
419 }
420 break;
421
422 /* try to match a metasymbol */
423 case META:
424 switch (MSYM(cs)) {
425
426 /* try to match anything and remember what was matched */
427 case 'p':
428 /*
429 * This is really the same as trying the match the
430 * remaining parts of the expression to any subset
431 * of the string.
432 */
433 s1 = s;
434 do {
435 ptr = expmatch (s1, MNEXT(cs), mstring);
436 if (ptr != NIL && s1 != s) {
437
438 /* we have a match, remember the match */
439 strncpy (mstring, s, s1 - s);
440 mstring[s1 - s] = '\0';
441 return (ptr);
442 } else if (ptr != NIL && (*cs & OPT)) {
443
444 /* it was aoptional so no match is ok */
445 return (ptr);
446 } else if (ptr != NIL) {
447
448 /* not optional and we still matched */
449 return (NIL);
450 }
451 if (!isalnum(*s1) && *s1 != '_')
452 return (NIL);
453 if (*s1 == '\\')
454 _escaped = _escaped ? FALSE : TRUE;
455 else
456 _escaped = FALSE;
457 } while (*s1++);
458 return (NIL);
459
460 /* try to match anything */
461 case 'a':
462 /*
463 * This is really the same as trying the match the
464 * remaining parts of the expression to any subset
465 * of the string.
466 */
467 s1 = s;
468 do {
469 ptr = expmatch (s1, MNEXT(cs), mstring);
470 if (ptr != NIL && s1 != s) {
471
472 /* we have a match */
473 return (ptr);
474 } else if (ptr != NIL && (*cs & OPT)) {
475
476 /* it was aoptional so no match is ok */
477 return (ptr);
478 } else if (ptr != NIL) {
479
480 /* not optional and we still matched */
481 return (NIL);
482 }
483 if (*s1 == '\\')
484 _escaped = _escaped ? FALSE : TRUE;
485 else
486 _escaped = FALSE;
487 } while (*s1++);
488 return (NIL);
489
490 /* fail if we are currently _escaped */
491 case 'e':
492 if (_escaped)
493 return(NIL);
494 cs = MNEXT(cs);
495 break;
496
497 /* match any number of tabs and spaces */
498 case 'd':
499 ptr = s;
500 while (*s == ' ' || *s == '\t')
501 s++;
502 if (s != ptr || s == _start) {
503
504 /* match, be happy */
505 matched = 1;
506 cs = MNEXT(cs);
507 } else if (*s == '\n' || *s == '\0') {
508
509 /* match, be happy */
510 matched = 1;
511 cs = MNEXT(cs);
512 } else if (*cs & ALT) {
513
514 /* try the next part */
515 matched = 0;
516 cs = MNEXT(cs);
517 } else if (*cs & OPT) {
518
519 /* doesn't matter */
520 matched = 1;
521 cs = MNEXT(cs);
522 } else
523
524 /* no match, error return */
525 return (NIL);
526 break;
527
528 /* check for end of line */
529 case '$':
530 if (*s == '\0' || *s == '\n') {
531
532 /* match, be happy */
533 s++;
534 matched = 1;
535 cs = MNEXT(cs);
536 } else if (*cs & ALT) {
537
538 /* try the next part */
539 matched = 0;
540 cs = MNEXT(cs);
541 } else if (*cs & OPT) {
542
543 /* doesn't matter */
544 matched = 1;
545 cs = MNEXT(cs);
546 } else
547
548 /* no match, error return */
549 return (NIL);
550 break;
551
552 /* check for start of line */
553 case '^':
554 if (s == _start) {
555
556 /* match, be happy */
557 matched = 1;
558 cs = MNEXT(cs);
559 } else if (*cs & ALT) {
560
561 /* try the next part */
562 matched = 0;
563 cs = MNEXT(cs);
564 } else if (*cs & OPT) {
565
566 /* doesn't matter */
567 matched = 1;
568 cs = MNEXT(cs);
569 } else
570
571 /* no match, error return */
572 return (NIL);
573 break;
574
575 /* end of a subexpression, return success */
576 case ')':
577 return (s);
578 }
579 break;
580 }
581 }
582 return (s);
583 }
584