regexp.c revision 1.9 1 /* $NetBSD: regexp.c,v 1.9 2006/05/01 05:13:58 christos Exp $ */
2
3 /*
4 * Copyright (c) 1980, 1993
5 * The Regents of the University of California. All rights reserved.
6 *
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. Neither the name of the University nor the names of its contributors
17 * may be used to endorse or promote products derived from this software
18 * without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 */
32
33 #include <sys/cdefs.h>
34 #ifndef lint
35 __COPYRIGHT("@(#) Copyright (c) 1980, 1993\n\
36 The Regents of the University of California. All rights reserved.\n");
37 #endif /* not lint */
38
39 #ifndef lint
40 #if 0
41 static char sccsid[] = "@(#)regexp.c 8.1 (Berkeley) 6/6/93";
42 #endif
43 __RCSID("$NetBSD: regexp.c,v 1.9 2006/05/01 05:13:58 christos Exp $");
44 #endif /* not lint */
45
46 #include <ctype.h>
47 #include <stdlib.h>
48 #include <string.h>
49 #include "extern.h"
50
51 #define FALSE 0
52 #define TRUE !(FALSE)
53 #define NIL 0
54
55 static void expconv __P((void));
56
57 boolean x_escaped; /* true if we are currently x_escaped */
58 char *x_start; /* start of string */
59 boolean l_onecase; /* true if upper and lower equivalent */
60
61 #define makelower(c) (isupper((unsigned char)(c)) ? tolower((unsigned char)(c)) : (c))
62
63 /* STRNCMP - like strncmp except that we convert the
64 * first string to lower case before comparing
65 * if l_onecase is set.
66 */
67
68 int
69 STRNCMP(s1, s2, len)
70 char *s1,*s2;
71 int len;
72 {
73 if (l_onecase) {
74 do
75 if (*s2 - makelower(*s1))
76 return (*s2 - makelower(*s1));
77 else {
78 s2++;
79 s1++;
80 }
81 while (--len);
82 } else {
83 do
84 if (*s2 - *s1)
85 return (*s2 - *s1);
86 else {
87 s2++;
88 s1++;
89 }
90 while (--len);
91 }
92 return(0);
93 }
94
95 /* The following routine converts an irregular expression to
96 * internal format.
97 *
98 * Either meta symbols (\a \d or \p) or character strings or
99 * operations ( alternation or perenthesizing ) can be
100 * specified. Each starts with a descriptor byte. The descriptor
101 * byte has STR set for strings, META set for meta symbols
102 * and OPER set for operations.
103 * The descriptor byte can also have the OPT bit set if the object
104 * defined is optional. Also ALT can be set to indicate an alternation.
105 *
106 * For metasymbols the byte following the descriptor byte identities
107 * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For
108 * strings the byte after the descriptor is a character count for
109 * the string:
110 *
111 * meta symbols := descriptor
112 * symbol
113 *
114 * strings := descriptor
115 * character count
116 * the string
117 *
118 * operatins := descriptor
119 * symbol
120 * character count
121 */
122
123 /*
124 * handy macros for accessing parts of match blocks
125 */
126 #define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */
127 #define MNEXT(A) (A+2) /* character following a metasymbol block */
128
129 #define OSYM(A) (*(A+1)) /* symbol in an operation block */
130 #define OCNT(A) (*(A+2)) /* character count */
131 #define ONEXT(A) (A+3) /* next character after the operation */
132 #define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */
133
134 #define SCNT(A) (*(A+1)) /* byte count of a string */
135 #define SSTR(A) (A+2) /* address of the string */
136 #define SNEXT(A) (A+2+*(A+1)) /* character following the string */
137
138 /*
139 * bit flags in the descriptor
140 */
141 #define OPT 1
142 #define STR 2
143 #define META 4
144 #define ALT 8
145 #define OPER 16
146
147 static char *ccre; /* pointer to current position in converted exp*/
148 static char *ure; /* pointer current position in unconverted exp */
149
150 char *
151 convexp(re)
152 char *re; /* unconverted irregular expression */
153 {
154 char *cre; /* pointer to converted regular expression */
155
156 /* allocate room for the converted expression */
157 if (re == NIL)
158 return (NIL);
159 if (*re == '\0')
160 return (NIL);
161 cre = malloc(4 * strlen(re) + 3);
162 ccre = cre;
163 ure = re;
164
165 /* start the conversion with a \a */
166 *cre = META | OPT;
167 MSYM(cre) = 'a';
168 ccre = MNEXT(cre);
169
170 /* start the conversion (its recursive) */
171 expconv();
172 *ccre = 0;
173 return (cre);
174 }
175
176 static void
177 expconv()
178 {
179 char *cs; /* pointer to current symbol in converted exp */
180 char c; /* character being processed */
181 char *acs; /* pinter to last alternate */
182 int temp;
183
184 /* let the conversion begin */
185 acs = NIL;
186 cs = NIL;
187 while (*ure != NIL) {
188 switch (c = *ure++) {
189
190 case '\\':
191 switch (c = *ure++) {
192
193 /* escaped characters are just characters */
194 default:
195 if (cs == NIL || (*cs & STR) == 0) {
196 cs = ccre;
197 *cs = STR;
198 SCNT(cs) = 1;
199 ccre += 2;
200 } else
201 SCNT(cs)++;
202 *ccre++ = c;
203 break;
204
205 /* normal(?) metacharacters */
206 case 'a':
207 case 'd':
208 case 'e':
209 case 'p':
210 if (acs != NIL && acs != cs) {
211 do {
212 temp = OCNT(acs);
213 OCNT(acs) = ccre - acs;
214 acs -= temp;
215 } while (temp != 0);
216 acs = NIL;
217 }
218 cs = ccre;
219 *cs = META;
220 MSYM(cs) = c;
221 ccre = MNEXT(cs);
222 break;
223 }
224 break;
225
226 /* just put the symbol in */
227 case '^':
228 case '$':
229 if (acs != NIL && acs != cs) {
230 do {
231 temp = OCNT(acs);
232 OCNT(acs) = ccre - acs;
233 acs -= temp;
234 } while (temp != 0);
235 acs = NIL;
236 }
237 cs = ccre;
238 *cs = META;
239 MSYM(cs) = c;
240 ccre = MNEXT(cs);
241 break;
242
243 /* mark the last match sequence as optional */
244 case '?':
245 if (cs)
246 *cs = *cs | OPT;
247 break;
248
249 /* recurse and define a subexpression */
250 case '(':
251 if (acs != NIL && acs != cs) {
252 do {
253 temp = OCNT(acs);
254 OCNT(acs) = ccre - acs;
255 acs -= temp;
256 } while (temp != 0);
257 acs = NIL;
258 }
259 cs = ccre;
260 *cs = OPER;
261 OSYM(cs) = '(';
262 ccre = ONEXT(cs);
263 expconv();
264 OCNT(cs) = ccre - cs; /* offset to next symbol */
265 break;
266
267 /* reurn from a recursion */
268 case ')':
269 if (acs != NIL) {
270 do {
271 temp = OCNT(acs);
272 OCNT(acs) = ccre - acs;
273 acs -= temp;
274 } while (temp != 0);
275 acs = NIL;
276 }
277 cs = ccre;
278 *cs = META;
279 MSYM(cs) = c;
280 ccre = MNEXT(cs);
281 return;
282
283 /* mark the last match sequence as having an alternate */
284 /* the third byte will contain an offset to jump over the */
285 /* alternate match in case the first did not fail */
286 case '|':
287 if (acs != NIL && acs != cs)
288 OCNT(ccre) = ccre - acs; /* make a back pointer */
289 else
290 OCNT(ccre) = 0;
291 assert(cs != NULL);
292 *cs |= ALT;
293 cs = ccre;
294 *cs = OPER;
295 OSYM(cs) = '|';
296 ccre = ONEXT(cs);
297 acs = cs; /* remember that the pointer is to be filles */
298 break;
299
300 /* if its not a metasymbol just build a scharacter string */
301 default:
302 if (cs == NIL || (*cs & STR) == 0) {
303 cs = ccre;
304 *cs = STR;
305 SCNT(cs) = 1;
306 ccre = SSTR(cs);
307 } else
308 SCNT(cs)++;
309 *ccre++ = c;
310 break;
311 }
312 }
313 if (acs != NIL) {
314 do {
315 temp = OCNT(acs);
316 OCNT(acs) = ccre - acs;
317 acs -= temp;
318 } while (temp != 0);
319 acs = NIL;
320 }
321 return;
322 }
323 /* end of convertre */
324
325
326 /*
327 * The following routine recognises an irregular expresion
328 * with the following special characters:
329 *
330 * \? - means last match was optional
331 * \a - matches any number of characters
332 * \d - matches any number of spaces and tabs
333 * \p - matches any number of alphanumeric
334 * characters. The
335 * characters matched will be copied into
336 * the area pointed to by 'name'.
337 * \| - alternation
338 * \( \) - grouping used mostly for alternation and
339 * optionality
340 *
341 * The irregular expression must be translated to internal form
342 * prior to calling this routine
343 *
344 * The value returned is the pointer to the first non \a
345 * character matched.
346 */
347
348 char *
349 expmatch(s, re, mstring)
350 char *s; /* string to check for a match in */
351 char *re; /* a converted irregular expression */
352 char *mstring; /* where to put whatever matches a \p */
353 {
354 char *cs; /* the current symbol */
355 char *ptr,*s1; /* temporary pointer */
356 boolean matched; /* a temporary boolean */
357
358 /* initial conditions */
359 if (re == NIL)
360 return (NIL);
361 cs = re;
362 matched = FALSE;
363
364 /* loop till expression string is exhausted (or at least pretty tired) */
365 while (*cs) {
366 switch (*cs & (OPER | STR | META)) {
367
368 /* try to match a string */
369 case STR:
370 matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
371 if (matched) {
372
373 /* hoorah it matches */
374 s += SCNT(cs);
375 cs = SNEXT(cs);
376 } else if (*cs & ALT) {
377
378 /* alternation, skip to next expression */
379 cs = SNEXT(cs);
380 } else if (*cs & OPT) {
381
382 /* the match is optional */
383 cs = SNEXT(cs);
384 matched = 1; /* indicate a successful match */
385 } else {
386
387 /* no match, error return */
388 return (NIL);
389 }
390 break;
391
392 /* an operator, do something fancy */
393 case OPER:
394 switch (OSYM(cs)) {
395
396 /* this is an alternation */
397 case '|':
398 if (matched)
399
400 /* last thing in the alternation was a match, skip ahead */
401 cs = OPTR(cs);
402 else
403
404 /* no match, keep trying */
405 cs = ONEXT(cs);
406 break;
407
408 /* this is a grouping, recurse */
409 case '(':
410 ptr = expmatch(s, ONEXT(cs), mstring);
411 if (ptr != NIL) {
412
413 /* the subexpression matched */
414 matched = 1;
415 s = ptr;
416 } else if (*cs & ALT) {
417
418 /* alternation, skip to next expression */
419 matched = 0;
420 } else if (*cs & OPT) {
421
422 /* the match is optional */
423 matched = 1; /* indicate a successful match */
424 } else {
425
426 /* no match, error return */
427 return (NIL);
428 }
429 cs = OPTR(cs);
430 break;
431 }
432 break;
433
434 /* try to match a metasymbol */
435 case META:
436 switch (MSYM(cs)) {
437
438 /* try to match anything and remember what was matched */
439 case 'p':
440 /*
441 * This is really the same as trying the match the
442 * remaining parts of the expression to any subset
443 * of the string.
444 */
445 s1 = s;
446 do {
447 ptr = expmatch(s1, MNEXT(cs), mstring);
448 if (ptr != NIL && s1 != s) {
449
450 /* we have a match, remember the match */
451 strncpy(mstring, s, s1 - s);
452 mstring[s1 - s] = '\0';
453 return (ptr);
454 } else if (ptr != NIL && (*cs & OPT)) {
455
456 /* it was aoptional so no match is ok */
457 return (ptr);
458 } else if (ptr != NIL) {
459
460 /* not optional and we still matched */
461 return (NIL);
462 }
463 if (!isalnum((unsigned char)*s1) && *s1 != '_')
464 return (NIL);
465 if (*s1 == '\\')
466 x_escaped = x_escaped ? FALSE : TRUE;
467 else
468 x_escaped = FALSE;
469 } while (*s1++);
470 return (NIL);
471
472 /* try to match anything */
473 case 'a':
474 /*
475 * This is really the same as trying the match the
476 * remaining parts of the expression to any subset
477 * of the string.
478 */
479 s1 = s;
480 do {
481 ptr = expmatch(s1, MNEXT(cs), mstring);
482 if (ptr != NIL && s1 != s) {
483
484 /* we have a match */
485 return (ptr);
486 } else if (ptr != NIL && (*cs & OPT)) {
487
488 /* it was aoptional so no match is ok */
489 return (ptr);
490 } else if (ptr != NIL) {
491
492 /* not optional and we still matched */
493 return (NIL);
494 }
495 if (*s1 == '\\')
496 x_escaped = x_escaped ? FALSE : TRUE;
497 else
498 x_escaped = FALSE;
499 } while (*s1++);
500 return (NIL);
501
502 /* fail if we are currently x_escaped */
503 case 'e':
504 if (x_escaped)
505 return(NIL);
506 cs = MNEXT(cs);
507 break;
508
509 /* match any number of tabs and spaces */
510 case 'd':
511 ptr = s;
512 while (*s == ' ' || *s == '\t')
513 s++;
514 if (s != ptr || s == x_start) {
515
516 /* match, be happy */
517 matched = 1;
518 cs = MNEXT(cs);
519 } else if (*s == '\n' || *s == '\0') {
520
521 /* match, be happy */
522 matched = 1;
523 cs = MNEXT(cs);
524 } else if (*cs & ALT) {
525
526 /* try the next part */
527 matched = 0;
528 cs = MNEXT(cs);
529 } else if (*cs & OPT) {
530
531 /* doesn't matter */
532 matched = 1;
533 cs = MNEXT(cs);
534 } else
535
536 /* no match, error return */
537 return (NIL);
538 break;
539
540 /* check for end of line */
541 case '$':
542 if (*s == '\0' || *s == '\n') {
543
544 /* match, be happy */
545 s++;
546 matched = 1;
547 cs = MNEXT(cs);
548 } else if (*cs & ALT) {
549
550 /* try the next part */
551 matched = 0;
552 cs = MNEXT(cs);
553 } else if (*cs & OPT) {
554
555 /* doesn't matter */
556 matched = 1;
557 cs = MNEXT(cs);
558 } else
559
560 /* no match, error return */
561 return (NIL);
562 break;
563
564 /* check for start of line */
565 case '^':
566 if (s == x_start) {
567
568 /* match, be happy */
569 matched = 1;
570 cs = MNEXT(cs);
571 } else if (*cs & ALT) {
572
573 /* try the next part */
574 matched = 0;
575 cs = MNEXT(cs);
576 } else if (*cs & OPT) {
577
578 /* doesn't matter */
579 matched = 1;
580 cs = MNEXT(cs);
581 } else
582
583 /* no match, error return */
584 return (NIL);
585 break;
586
587 /* end of a subexpression, return success */
588 case ')':
589 return (s);
590 }
591 break;
592 }
593 }
594 return (s);
595 }
596