regexp.c revision 1.7 1 /* $NetBSD: regexp.c,v 1.7 2003/08/07 11:17:01 agc Exp $ */
2
3 /*
4 * Copyright (c) 1980, 1993
5 * The Regents of the University of California. All rights reserved.
6 *
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. Neither the name of the University nor the names of its contributors
17 * may be used to endorse or promote products derived from this software
18 * without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 */
32
33 #include <sys/cdefs.h>
34 #ifndef lint
35 __COPYRIGHT("@(#) Copyright (c) 1980, 1993\n\
36 The Regents of the University of California. All rights reserved.\n");
37 #endif /* not lint */
38
39 #ifndef lint
40 #if 0
41 static char sccsid[] = "@(#)regexp.c 8.1 (Berkeley) 6/6/93";
42 #endif
43 __RCSID("$NetBSD: regexp.c,v 1.7 2003/08/07 11:17:01 agc Exp $");
44 #endif /* not lint */
45
46 #include <ctype.h>
47 #include <stdlib.h>
48 #include <string.h>
49 #include "extern.h"
50
51 #define FALSE 0
52 #define TRUE !(FALSE)
53 #define NIL 0
54
55 static void expconv __P((void));
56
57 boolean x_escaped; /* true if we are currently x_escaped */
58 char *x_start; /* start of string */
59 boolean l_onecase; /* true if upper and lower equivalent */
60
61 #define makelower(c) (isupper((unsigned char)(c)) ? tolower((c)) : (c))
62
63 /* STRNCMP - like strncmp except that we convert the
64 * first string to lower case before comparing
65 * if l_onecase is set.
66 */
67
68 int
69 STRNCMP(s1, s2, len)
70 char *s1,*s2;
71 int len;
72 {
73 if (l_onecase) {
74 do
75 if (*s2 - makelower(*s1))
76 return (*s2 - makelower(*s1));
77 else {
78 s2++;
79 s1++;
80 }
81 while (--len);
82 } else {
83 do
84 if (*s2 - *s1)
85 return (*s2 - *s1);
86 else {
87 s2++;
88 s1++;
89 }
90 while (--len);
91 }
92 return(0);
93 }
94
95 /* The following routine converts an irregular expression to
96 * internal format.
97 *
98 * Either meta symbols (\a \d or \p) or character strings or
99 * operations ( alternation or perenthesizing ) can be
100 * specified. Each starts with a descriptor byte. The descriptor
101 * byte has STR set for strings, META set for meta symbols
102 * and OPER set for operations.
103 * The descriptor byte can also have the OPT bit set if the object
104 * defined is optional. Also ALT can be set to indicate an alternation.
105 *
106 * For metasymbols the byte following the descriptor byte identities
107 * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For
108 * strings the byte after the descriptor is a character count for
109 * the string:
110 *
111 * meta symbols := descriptor
112 * symbol
113 *
114 * strings := descriptor
115 * character count
116 * the string
117 *
118 * operatins := descriptor
119 * symbol
120 * character count
121 */
122
123 /*
124 * handy macros for accessing parts of match blocks
125 */
126 #define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */
127 #define MNEXT(A) (A+2) /* character following a metasymbol block */
128
129 #define OSYM(A) (*(A+1)) /* symbol in an operation block */
130 #define OCNT(A) (*(A+2)) /* character count */
131 #define ONEXT(A) (A+3) /* next character after the operation */
132 #define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */
133
134 #define SCNT(A) (*(A+1)) /* byte count of a string */
135 #define SSTR(A) (A+2) /* address of the string */
136 #define SNEXT(A) (A+2+*(A+1)) /* character following the string */
137
138 /*
139 * bit flags in the descriptor
140 */
141 #define OPT 1
142 #define STR 2
143 #define META 4
144 #define ALT 8
145 #define OPER 16
146
147 static char *ccre; /* pointer to current position in converted exp*/
148 static char *ure; /* pointer current position in unconverted exp */
149
150 char *
151 convexp(re)
152 char *re; /* unconverted irregular expression */
153 {
154 char *cre; /* pointer to converted regular expression */
155
156 /* allocate room for the converted expression */
157 if (re == NIL)
158 return (NIL);
159 if (*re == '\0')
160 return (NIL);
161 cre = malloc(4 * strlen(re) + 3);
162 ccre = cre;
163 ure = re;
164
165 /* start the conversion with a \a */
166 *cre = META | OPT;
167 MSYM(cre) = 'a';
168 ccre = MNEXT(cre);
169
170 /* start the conversion (its recursive) */
171 expconv();
172 *ccre = 0;
173 return (cre);
174 }
175
176 static void
177 expconv()
178 {
179 char *cs; /* pointer to current symbol in converted exp */
180 char c; /* character being processed */
181 char *acs; /* pinter to last alternate */
182 int temp;
183
184 /* let the conversion begin */
185 acs = NIL;
186 cs = NIL;
187 while (*ure != NIL) {
188 switch (c = *ure++) {
189
190 case '\\':
191 switch (c = *ure++) {
192
193 /* escaped characters are just characters */
194 default:
195 if (cs == NIL || (*cs & STR) == 0) {
196 cs = ccre;
197 *cs = STR;
198 SCNT(cs) = 1;
199 ccre += 2;
200 } else
201 SCNT(cs)++;
202 *ccre++ = c;
203 break;
204
205 /* normal(?) metacharacters */
206 case 'a':
207 case 'd':
208 case 'e':
209 case 'p':
210 if (acs != NIL && acs != cs) {
211 do {
212 temp = OCNT(acs);
213 OCNT(acs) = ccre - acs;
214 acs -= temp;
215 } while (temp != 0);
216 acs = NIL;
217 }
218 cs = ccre;
219 *cs = META;
220 MSYM(cs) = c;
221 ccre = MNEXT(cs);
222 break;
223 }
224 break;
225
226 /* just put the symbol in */
227 case '^':
228 case '$':
229 if (acs != NIL && acs != cs) {
230 do {
231 temp = OCNT(acs);
232 OCNT(acs) = ccre - acs;
233 acs -= temp;
234 } while (temp != 0);
235 acs = NIL;
236 }
237 cs = ccre;
238 *cs = META;
239 MSYM(cs) = c;
240 ccre = MNEXT(cs);
241 break;
242
243 /* mark the last match sequence as optional */
244 case '?':
245 if (cs)
246 *cs = *cs | OPT;
247 break;
248
249 /* recurse and define a subexpression */
250 case '(':
251 if (acs != NIL && acs != cs) {
252 do {
253 temp = OCNT(acs);
254 OCNT(acs) = ccre - acs;
255 acs -= temp;
256 } while (temp != 0);
257 acs = NIL;
258 }
259 cs = ccre;
260 *cs = OPER;
261 OSYM(cs) = '(';
262 ccre = ONEXT(cs);
263 expconv();
264 OCNT(cs) = ccre - cs; /* offset to next symbol */
265 break;
266
267 /* reurn from a recursion */
268 case ')':
269 if (acs != NIL) {
270 do {
271 temp = OCNT(acs);
272 OCNT(acs) = ccre - acs;
273 acs -= temp;
274 } while (temp != 0);
275 acs = NIL;
276 }
277 cs = ccre;
278 *cs = META;
279 MSYM(cs) = c;
280 ccre = MNEXT(cs);
281 return;
282
283 /* mark the last match sequence as having an alternate */
284 /* the third byte will contain an offset to jump over the */
285 /* alternate match in case the first did not fail */
286 case '|':
287 if (acs != NIL && acs != cs)
288 OCNT(ccre) = ccre - acs; /* make a back pointer */
289 else
290 OCNT(ccre) = 0;
291 *cs |= ALT;
292 cs = ccre;
293 *cs = OPER;
294 OSYM(cs) = '|';
295 ccre = ONEXT(cs);
296 acs = cs; /* remember that the pointer is to be filles */
297 break;
298
299 /* if its not a metasymbol just build a scharacter string */
300 default:
301 if (cs == NIL || (*cs & STR) == 0) {
302 cs = ccre;
303 *cs = STR;
304 SCNT(cs) = 1;
305 ccre = SSTR(cs);
306 } else
307 SCNT(cs)++;
308 *ccre++ = c;
309 break;
310 }
311 }
312 if (acs != NIL) {
313 do {
314 temp = OCNT(acs);
315 OCNT(acs) = ccre - acs;
316 acs -= temp;
317 } while (temp != 0);
318 acs = NIL;
319 }
320 return;
321 }
322 /* end of convertre */
323
324
325 /*
326 * The following routine recognises an irregular expresion
327 * with the following special characters:
328 *
329 * \? - means last match was optional
330 * \a - matches any number of characters
331 * \d - matches any number of spaces and tabs
332 * \p - matches any number of alphanumeric
333 * characters. The
334 * characters matched will be copied into
335 * the area pointed to by 'name'.
336 * \| - alternation
337 * \( \) - grouping used mostly for alternation and
338 * optionality
339 *
340 * The irregular expression must be translated to internal form
341 * prior to calling this routine
342 *
343 * The value returned is the pointer to the first non \a
344 * character matched.
345 */
346
347 char *
348 expmatch(s, re, mstring)
349 char *s; /* string to check for a match in */
350 char *re; /* a converted irregular expression */
351 char *mstring; /* where to put whatever matches a \p */
352 {
353 char *cs; /* the current symbol */
354 char *ptr,*s1; /* temporary pointer */
355 boolean matched; /* a temporary boolean */
356
357 /* initial conditions */
358 if (re == NIL)
359 return (NIL);
360 cs = re;
361 matched = FALSE;
362
363 /* loop till expression string is exhausted (or at least pretty tired) */
364 while (*cs) {
365 switch (*cs & (OPER | STR | META)) {
366
367 /* try to match a string */
368 case STR:
369 matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
370 if (matched) {
371
372 /* hoorah it matches */
373 s += SCNT(cs);
374 cs = SNEXT(cs);
375 } else if (*cs & ALT) {
376
377 /* alternation, skip to next expression */
378 cs = SNEXT(cs);
379 } else if (*cs & OPT) {
380
381 /* the match is optional */
382 cs = SNEXT(cs);
383 matched = 1; /* indicate a successful match */
384 } else {
385
386 /* no match, error return */
387 return (NIL);
388 }
389 break;
390
391 /* an operator, do something fancy */
392 case OPER:
393 switch (OSYM(cs)) {
394
395 /* this is an alternation */
396 case '|':
397 if (matched)
398
399 /* last thing in the alternation was a match, skip ahead */
400 cs = OPTR(cs);
401 else
402
403 /* no match, keep trying */
404 cs = ONEXT(cs);
405 break;
406
407 /* this is a grouping, recurse */
408 case '(':
409 ptr = expmatch(s, ONEXT(cs), mstring);
410 if (ptr != NIL) {
411
412 /* the subexpression matched */
413 matched = 1;
414 s = ptr;
415 } else if (*cs & ALT) {
416
417 /* alternation, skip to next expression */
418 matched = 0;
419 } else if (*cs & OPT) {
420
421 /* the match is optional */
422 matched = 1; /* indicate a successful match */
423 } else {
424
425 /* no match, error return */
426 return (NIL);
427 }
428 cs = OPTR(cs);
429 break;
430 }
431 break;
432
433 /* try to match a metasymbol */
434 case META:
435 switch (MSYM(cs)) {
436
437 /* try to match anything and remember what was matched */
438 case 'p':
439 /*
440 * This is really the same as trying the match the
441 * remaining parts of the expression to any subset
442 * of the string.
443 */
444 s1 = s;
445 do {
446 ptr = expmatch(s1, MNEXT(cs), mstring);
447 if (ptr != NIL && s1 != s) {
448
449 /* we have a match, remember the match */
450 strncpy(mstring, s, s1 - s);
451 mstring[s1 - s] = '\0';
452 return (ptr);
453 } else if (ptr != NIL && (*cs & OPT)) {
454
455 /* it was aoptional so no match is ok */
456 return (ptr);
457 } else if (ptr != NIL) {
458
459 /* not optional and we still matched */
460 return (NIL);
461 }
462 if (!isalnum((unsigned char)*s1) && *s1 != '_')
463 return (NIL);
464 if (*s1 == '\\')
465 x_escaped = x_escaped ? FALSE : TRUE;
466 else
467 x_escaped = FALSE;
468 } while (*s1++);
469 return (NIL);
470
471 /* try to match anything */
472 case 'a':
473 /*
474 * This is really the same as trying the match the
475 * remaining parts of the expression to any subset
476 * of the string.
477 */
478 s1 = s;
479 do {
480 ptr = expmatch(s1, MNEXT(cs), mstring);
481 if (ptr != NIL && s1 != s) {
482
483 /* we have a match */
484 return (ptr);
485 } else if (ptr != NIL && (*cs & OPT)) {
486
487 /* it was aoptional so no match is ok */
488 return (ptr);
489 } else if (ptr != NIL) {
490
491 /* not optional and we still matched */
492 return (NIL);
493 }
494 if (*s1 == '\\')
495 x_escaped = x_escaped ? FALSE : TRUE;
496 else
497 x_escaped = FALSE;
498 } while (*s1++);
499 return (NIL);
500
501 /* fail if we are currently x_escaped */
502 case 'e':
503 if (x_escaped)
504 return(NIL);
505 cs = MNEXT(cs);
506 break;
507
508 /* match any number of tabs and spaces */
509 case 'd':
510 ptr = s;
511 while (*s == ' ' || *s == '\t')
512 s++;
513 if (s != ptr || s == x_start) {
514
515 /* match, be happy */
516 matched = 1;
517 cs = MNEXT(cs);
518 } else if (*s == '\n' || *s == '\0') {
519
520 /* match, be happy */
521 matched = 1;
522 cs = MNEXT(cs);
523 } else if (*cs & ALT) {
524
525 /* try the next part */
526 matched = 0;
527 cs = MNEXT(cs);
528 } else if (*cs & OPT) {
529
530 /* doesn't matter */
531 matched = 1;
532 cs = MNEXT(cs);
533 } else
534
535 /* no match, error return */
536 return (NIL);
537 break;
538
539 /* check for end of line */
540 case '$':
541 if (*s == '\0' || *s == '\n') {
542
543 /* match, be happy */
544 s++;
545 matched = 1;
546 cs = MNEXT(cs);
547 } else if (*cs & ALT) {
548
549 /* try the next part */
550 matched = 0;
551 cs = MNEXT(cs);
552 } else if (*cs & OPT) {
553
554 /* doesn't matter */
555 matched = 1;
556 cs = MNEXT(cs);
557 } else
558
559 /* no match, error return */
560 return (NIL);
561 break;
562
563 /* check for start of line */
564 case '^':
565 if (s == x_start) {
566
567 /* match, be happy */
568 matched = 1;
569 cs = MNEXT(cs);
570 } else if (*cs & ALT) {
571
572 /* try the next part */
573 matched = 0;
574 cs = MNEXT(cs);
575 } else if (*cs & OPT) {
576
577 /* doesn't matter */
578 matched = 1;
579 cs = MNEXT(cs);
580 } else
581
582 /* no match, error return */
583 return (NIL);
584 break;
585
586 /* end of a subexpression, return success */
587 case ')':
588 return (s);
589 }
590 break;
591 }
592 }
593 return (s);
594 }
595