regexp.c revision 1.1.1.1 1 /*
2 * Copyright (c) 1980, 1993
3 * The Regents of the University of California. All rights reserved.
4 *
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * 3. All advertising materials mentioning features or use of this software
15 * must display the following acknowledgement:
16 * This product includes software developed by the University of
17 * California, Berkeley and its contributors.
18 * 4. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 */
34
35 #ifndef lint
36 static char copyright[] =
37 "@(#) Copyright (c) 1980, 1993\n\
38 The Regents of the University of California. All rights reserved.\n";
39 #endif /* not lint */
40
41 #ifndef lint
42 static char sccsid[] = "@(#)regexp.c 8.1 (Berkeley) 6/6/93";
43 #endif /* not lint */
44
45 #include <ctype.h>
46 #include <stdlib.h>
47 #include <string.h>
48 #include "extern.h"
49
50 #define FALSE 0
51 #define TRUE !(FALSE)
52 #define NIL 0
53
54 static void expconv __P((void));
55
56 boolean _escaped; /* true if we are currently _escaped */
57 char *_start; /* start of string */
58 boolean l_onecase; /* true if upper and lower equivalent */
59
60 #define makelower(c) (isupper((c)) ? tolower((c)) : (c))
61
62 /* STRNCMP - like strncmp except that we convert the
63 * first string to lower case before comparing
64 * if l_onecase is set.
65 */
66
67 int
68 STRNCMP(s1, s2, len)
69 register char *s1,*s2;
70 register int len;
71 {
72 if (l_onecase) {
73 do
74 if (*s2 - makelower(*s1))
75 return (*s2 - makelower(*s1));
76 else {
77 s2++;
78 s1++;
79 }
80 while (--len);
81 } else {
82 do
83 if (*s2 - *s1)
84 return (*s2 - *s1);
85 else {
86 s2++;
87 s1++;
88 }
89 while (--len);
90 }
91 return(0);
92 }
93
94 /* The following routine converts an irregular expression to
95 * internal format.
96 *
97 * Either meta symbols (\a \d or \p) or character strings or
98 * operations ( alternation or perenthesizing ) can be
99 * specified. Each starts with a descriptor byte. The descriptor
100 * byte has STR set for strings, META set for meta symbols
101 * and OPER set for operations.
102 * The descriptor byte can also have the OPT bit set if the object
103 * defined is optional. Also ALT can be set to indicate an alternation.
104 *
105 * For metasymbols the byte following the descriptor byte identities
106 * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For
107 * strings the byte after the descriptor is a character count for
108 * the string:
109 *
110 * meta symbols := descriptor
111 * symbol
112 *
113 * strings := descriptor
114 * character count
115 * the string
116 *
117 * operatins := descriptor
118 * symbol
119 * character count
120 */
121
122 /*
123 * handy macros for accessing parts of match blocks
124 */
125 #define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */
126 #define MNEXT(A) (A+2) /* character following a metasymbol block */
127
128 #define OSYM(A) (*(A+1)) /* symbol in an operation block */
129 #define OCNT(A) (*(A+2)) /* character count */
130 #define ONEXT(A) (A+3) /* next character after the operation */
131 #define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */
132
133 #define SCNT(A) (*(A+1)) /* byte count of a string */
134 #define SSTR(A) (A+2) /* address of the string */
135 #define SNEXT(A) (A+2+*(A+1)) /* character following the string */
136
137 /*
138 * bit flags in the descriptor
139 */
140 #define OPT 1
141 #define STR 2
142 #define META 4
143 #define ALT 8
144 #define OPER 16
145
146 static char *ccre; /* pointer to current position in converted exp*/
147 static char *ure; /* pointer current position in unconverted exp */
148
149 char *
150 convexp(re)
151 char *re; /* unconverted irregular expression */
152 {
153 register char *cre; /* pointer to converted regular expression */
154
155 /* allocate room for the converted expression */
156 if (re == NIL)
157 return (NIL);
158 if (*re == '\0')
159 return (NIL);
160 cre = malloc (4 * strlen(re) + 3);
161 ccre = cre;
162 ure = re;
163
164 /* start the conversion with a \a */
165 *cre = META | OPT;
166 MSYM(cre) = 'a';
167 ccre = MNEXT(cre);
168
169 /* start the conversion (its recursive) */
170 expconv ();
171 *ccre = 0;
172 return (cre);
173 }
174
175 static void
176 expconv()
177 {
178 register char *cs; /* pointer to current symbol in converted exp */
179 register char c; /* character being processed */
180 register char *acs; /* pinter to last alternate */
181 register int temp;
182
183 /* let the conversion begin */
184 acs = NIL;
185 cs = NIL;
186 while (*ure != NIL) {
187 switch (c = *ure++) {
188
189 case '\\':
190 switch (c = *ure++) {
191
192 /* escaped characters are just characters */
193 default:
194 if (cs == NIL || (*cs & STR) == 0) {
195 cs = ccre;
196 *cs = STR;
197 SCNT(cs) = 1;
198 ccre += 2;
199 } else
200 SCNT(cs)++;
201 *ccre++ = c;
202 break;
203
204 /* normal(?) metacharacters */
205 case 'a':
206 case 'd':
207 case 'e':
208 case 'p':
209 if (acs != NIL && acs != cs) {
210 do {
211 temp = OCNT(acs);
212 OCNT(acs) = ccre - acs;
213 acs -= temp;
214 } while (temp != 0);
215 acs = NIL;
216 }
217 cs = ccre;
218 *cs = META;
219 MSYM(cs) = c;
220 ccre = MNEXT(cs);
221 break;
222 }
223 break;
224
225 /* just put the symbol in */
226 case '^':
227 case '$':
228 if (acs != NIL && acs != cs) {
229 do {
230 temp = OCNT(acs);
231 OCNT(acs) = ccre - acs;
232 acs -= temp;
233 } while (temp != 0);
234 acs = NIL;
235 }
236 cs = ccre;
237 *cs = META;
238 MSYM(cs) = c;
239 ccre = MNEXT(cs);
240 break;
241
242 /* mark the last match sequence as optional */
243 case '?':
244 if (cs)
245 *cs = *cs | OPT;
246 break;
247
248 /* recurse and define a subexpression */
249 case '(':
250 if (acs != NIL && acs != cs) {
251 do {
252 temp = OCNT(acs);
253 OCNT(acs) = ccre - acs;
254 acs -= temp;
255 } while (temp != 0);
256 acs = NIL;
257 }
258 cs = ccre;
259 *cs = OPER;
260 OSYM(cs) = '(';
261 ccre = ONEXT(cs);
262 expconv ();
263 OCNT(cs) = ccre - cs; /* offset to next symbol */
264 break;
265
266 /* reurn from a recursion */
267 case ')':
268 if (acs != NIL) {
269 do {
270 temp = OCNT(acs);
271 OCNT(acs) = ccre - acs;
272 acs -= temp;
273 } while (temp != 0);
274 acs = NIL;
275 }
276 cs = ccre;
277 *cs = META;
278 MSYM(cs) = c;
279 ccre = MNEXT(cs);
280 return;
281
282 /* mark the last match sequence as having an alternate */
283 /* the third byte will contain an offset to jump over the */
284 /* alternate match in case the first did not fail */
285 case '|':
286 if (acs != NIL && acs != cs)
287 OCNT(ccre) = ccre - acs; /* make a back pointer */
288 else
289 OCNT(ccre) = 0;
290 *cs |= ALT;
291 cs = ccre;
292 *cs = OPER;
293 OSYM(cs) = '|';
294 ccre = ONEXT(cs);
295 acs = cs; /* remember that the pointer is to be filles */
296 break;
297
298 /* if its not a metasymbol just build a scharacter string */
299 default:
300 if (cs == NIL || (*cs & STR) == 0) {
301 cs = ccre;
302 *cs = STR;
303 SCNT(cs) = 1;
304 ccre = SSTR(cs);
305 } else
306 SCNT(cs)++;
307 *ccre++ = c;
308 break;
309 }
310 }
311 if (acs != NIL) {
312 do {
313 temp = OCNT(acs);
314 OCNT(acs) = ccre - acs;
315 acs -= temp;
316 } while (temp != 0);
317 acs = NIL;
318 }
319 return;
320 }
321 /* end of convertre */
322
323
324 /*
325 * The following routine recognises an irregular expresion
326 * with the following special characters:
327 *
328 * \? - means last match was optional
329 * \a - matches any number of characters
330 * \d - matches any number of spaces and tabs
331 * \p - matches any number of alphanumeric
332 * characters. The
333 * characters matched will be copied into
334 * the area pointed to by 'name'.
335 * \| - alternation
336 * \( \) - grouping used mostly for alternation and
337 * optionality
338 *
339 * The irregular expression must be translated to internal form
340 * prior to calling this routine
341 *
342 * The value returned is the pointer to the first non \a
343 * character matched.
344 */
345
346 char *
347 expmatch (s, re, mstring)
348 register char *s; /* string to check for a match in */
349 register char *re; /* a converted irregular expression */
350 register char *mstring; /* where to put whatever matches a \p */
351 {
352 register char *cs; /* the current symbol */
353 register char *ptr,*s1; /* temporary pointer */
354 boolean matched; /* a temporary boolean */
355
356 /* initial conditions */
357 if (re == NIL)
358 return (NIL);
359 cs = re;
360 matched = FALSE;
361
362 /* loop till expression string is exhausted (or at least pretty tired) */
363 while (*cs) {
364 switch (*cs & (OPER | STR | META)) {
365
366 /* try to match a string */
367 case STR:
368 matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
369 if (matched) {
370
371 /* hoorah it matches */
372 s += SCNT(cs);
373 cs = SNEXT(cs);
374 } else if (*cs & ALT) {
375
376 /* alternation, skip to next expression */
377 cs = SNEXT(cs);
378 } else if (*cs & OPT) {
379
380 /* the match is optional */
381 cs = SNEXT(cs);
382 matched = 1; /* indicate a successful match */
383 } else {
384
385 /* no match, error return */
386 return (NIL);
387 }
388 break;
389
390 /* an operator, do something fancy */
391 case OPER:
392 switch (OSYM(cs)) {
393
394 /* this is an alternation */
395 case '|':
396 if (matched)
397
398 /* last thing in the alternation was a match, skip ahead */
399 cs = OPTR(cs);
400 else
401
402 /* no match, keep trying */
403 cs = ONEXT(cs);
404 break;
405
406 /* this is a grouping, recurse */
407 case '(':
408 ptr = expmatch (s, ONEXT(cs), mstring);
409 if (ptr != NIL) {
410
411 /* the subexpression matched */
412 matched = 1;
413 s = ptr;
414 } else if (*cs & ALT) {
415
416 /* alternation, skip to next expression */
417 matched = 0;
418 } else if (*cs & OPT) {
419
420 /* the match is optional */
421 matched = 1; /* indicate a successful match */
422 } else {
423
424 /* no match, error return */
425 return (NIL);
426 }
427 cs = OPTR(cs);
428 break;
429 }
430 break;
431
432 /* try to match a metasymbol */
433 case META:
434 switch (MSYM(cs)) {
435
436 /* try to match anything and remember what was matched */
437 case 'p':
438 /*
439 * This is really the same as trying the match the
440 * remaining parts of the expression to any subset
441 * of the string.
442 */
443 s1 = s;
444 do {
445 ptr = expmatch (s1, MNEXT(cs), mstring);
446 if (ptr != NIL && s1 != s) {
447
448 /* we have a match, remember the match */
449 strncpy (mstring, s, s1 - s);
450 mstring[s1 - s] = '\0';
451 return (ptr);
452 } else if (ptr != NIL && (*cs & OPT)) {
453
454 /* it was aoptional so no match is ok */
455 return (ptr);
456 } else if (ptr != NIL) {
457
458 /* not optional and we still matched */
459 return (NIL);
460 }
461 if (!isalnum(*s1) && *s1 != '_')
462 return (NIL);
463 if (*s1 == '\\')
464 _escaped = _escaped ? FALSE : TRUE;
465 else
466 _escaped = FALSE;
467 } while (*s1++);
468 return (NIL);
469
470 /* try to match anything */
471 case 'a':
472 /*
473 * This is really the same as trying the match the
474 * remaining parts of the expression to any subset
475 * of the string.
476 */
477 s1 = s;
478 do {
479 ptr = expmatch (s1, MNEXT(cs), mstring);
480 if (ptr != NIL && s1 != s) {
481
482 /* we have a match */
483 return (ptr);
484 } else if (ptr != NIL && (*cs & OPT)) {
485
486 /* it was aoptional so no match is ok */
487 return (ptr);
488 } else if (ptr != NIL) {
489
490 /* not optional and we still matched */
491 return (NIL);
492 }
493 if (*s1 == '\\')
494 _escaped = _escaped ? FALSE : TRUE;
495 else
496 _escaped = FALSE;
497 } while (*s1++);
498 return (NIL);
499
500 /* fail if we are currently _escaped */
501 case 'e':
502 if (_escaped)
503 return(NIL);
504 cs = MNEXT(cs);
505 break;
506
507 /* match any number of tabs and spaces */
508 case 'd':
509 ptr = s;
510 while (*s == ' ' || *s == '\t')
511 s++;
512 if (s != ptr || s == _start) {
513
514 /* match, be happy */
515 matched = 1;
516 cs = MNEXT(cs);
517 } else if (*s == '\n' || *s == '\0') {
518
519 /* match, be happy */
520 matched = 1;
521 cs = MNEXT(cs);
522 } else if (*cs & ALT) {
523
524 /* try the next part */
525 matched = 0;
526 cs = MNEXT(cs);
527 } else if (*cs & OPT) {
528
529 /* doesn't matter */
530 matched = 1;
531 cs = MNEXT(cs);
532 } else
533
534 /* no match, error return */
535 return (NIL);
536 break;
537
538 /* check for end of line */
539 case '$':
540 if (*s == '\0' || *s == '\n') {
541
542 /* match, be happy */
543 s++;
544 matched = 1;
545 cs = MNEXT(cs);
546 } else if (*cs & ALT) {
547
548 /* try the next part */
549 matched = 0;
550 cs = MNEXT(cs);
551 } else if (*cs & OPT) {
552
553 /* doesn't matter */
554 matched = 1;
555 cs = MNEXT(cs);
556 } else
557
558 /* no match, error return */
559 return (NIL);
560 break;
561
562 /* check for start of line */
563 case '^':
564 if (s == _start) {
565
566 /* match, be happy */
567 matched = 1;
568 cs = MNEXT(cs);
569 } else if (*cs & ALT) {
570
571 /* try the next part */
572 matched = 0;
573 cs = MNEXT(cs);
574 } else if (*cs & OPT) {
575
576 /* doesn't matter */
577 matched = 1;
578 cs = MNEXT(cs);
579 } else
580
581 /* no match, error return */
582 return (NIL);
583 break;
584
585 /* end of a subexpression, return success */
586 case ')':
587 return (s);
588 }
589 break;
590 }
591 }
592 return (s);
593 }
594