regexp.c revision 1.6 1 /* $NetBSD: regexp.c,v 1.6 2003/07/12 13:37:15 itojun Exp $ */
2
3 /*
4 * Copyright (c) 1980, 1993
5 * The Regents of the University of California. All rights reserved.
6 *
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. All advertising materials mentioning features or use of this software
17 * must display the following acknowledgement:
18 * This product includes software developed by the University of
19 * California, Berkeley and its contributors.
20 * 4. Neither the name of the University nor the names of its contributors
21 * may be used to endorse or promote products derived from this software
22 * without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 */
36
37 #include <sys/cdefs.h>
38 #ifndef lint
39 __COPYRIGHT("@(#) Copyright (c) 1980, 1993\n\
40 The Regents of the University of California. All rights reserved.\n");
41 #endif /* not lint */
42
43 #ifndef lint
44 #if 0
45 static char sccsid[] = "@(#)regexp.c 8.1 (Berkeley) 6/6/93";
46 #endif
47 __RCSID("$NetBSD: regexp.c,v 1.6 2003/07/12 13:37:15 itojun Exp $");
48 #endif /* not lint */
49
50 #include <ctype.h>
51 #include <stdlib.h>
52 #include <string.h>
53 #include "extern.h"
54
55 #define FALSE 0
56 #define TRUE !(FALSE)
57 #define NIL 0
58
59 static void expconv __P((void));
60
61 boolean x_escaped; /* true if we are currently x_escaped */
62 char *x_start; /* start of string */
63 boolean l_onecase; /* true if upper and lower equivalent */
64
65 #define makelower(c) (isupper((unsigned char)(c)) ? tolower((c)) : (c))
66
67 /* STRNCMP - like strncmp except that we convert the
68 * first string to lower case before comparing
69 * if l_onecase is set.
70 */
71
72 int
73 STRNCMP(s1, s2, len)
74 char *s1,*s2;
75 int len;
76 {
77 if (l_onecase) {
78 do
79 if (*s2 - makelower(*s1))
80 return (*s2 - makelower(*s1));
81 else {
82 s2++;
83 s1++;
84 }
85 while (--len);
86 } else {
87 do
88 if (*s2 - *s1)
89 return (*s2 - *s1);
90 else {
91 s2++;
92 s1++;
93 }
94 while (--len);
95 }
96 return(0);
97 }
98
99 /* The following routine converts an irregular expression to
100 * internal format.
101 *
102 * Either meta symbols (\a \d or \p) or character strings or
103 * operations ( alternation or perenthesizing ) can be
104 * specified. Each starts with a descriptor byte. The descriptor
105 * byte has STR set for strings, META set for meta symbols
106 * and OPER set for operations.
107 * The descriptor byte can also have the OPT bit set if the object
108 * defined is optional. Also ALT can be set to indicate an alternation.
109 *
110 * For metasymbols the byte following the descriptor byte identities
111 * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For
112 * strings the byte after the descriptor is a character count for
113 * the string:
114 *
115 * meta symbols := descriptor
116 * symbol
117 *
118 * strings := descriptor
119 * character count
120 * the string
121 *
122 * operatins := descriptor
123 * symbol
124 * character count
125 */
126
127 /*
128 * handy macros for accessing parts of match blocks
129 */
130 #define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */
131 #define MNEXT(A) (A+2) /* character following a metasymbol block */
132
133 #define OSYM(A) (*(A+1)) /* symbol in an operation block */
134 #define OCNT(A) (*(A+2)) /* character count */
135 #define ONEXT(A) (A+3) /* next character after the operation */
136 #define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */
137
138 #define SCNT(A) (*(A+1)) /* byte count of a string */
139 #define SSTR(A) (A+2) /* address of the string */
140 #define SNEXT(A) (A+2+*(A+1)) /* character following the string */
141
142 /*
143 * bit flags in the descriptor
144 */
145 #define OPT 1
146 #define STR 2
147 #define META 4
148 #define ALT 8
149 #define OPER 16
150
151 static char *ccre; /* pointer to current position in converted exp*/
152 static char *ure; /* pointer current position in unconverted exp */
153
154 char *
155 convexp(re)
156 char *re; /* unconverted irregular expression */
157 {
158 char *cre; /* pointer to converted regular expression */
159
160 /* allocate room for the converted expression */
161 if (re == NIL)
162 return (NIL);
163 if (*re == '\0')
164 return (NIL);
165 cre = malloc(4 * strlen(re) + 3);
166 ccre = cre;
167 ure = re;
168
169 /* start the conversion with a \a */
170 *cre = META | OPT;
171 MSYM(cre) = 'a';
172 ccre = MNEXT(cre);
173
174 /* start the conversion (its recursive) */
175 expconv();
176 *ccre = 0;
177 return (cre);
178 }
179
180 static void
181 expconv()
182 {
183 char *cs; /* pointer to current symbol in converted exp */
184 char c; /* character being processed */
185 char *acs; /* pinter to last alternate */
186 int temp;
187
188 /* let the conversion begin */
189 acs = NIL;
190 cs = NIL;
191 while (*ure != NIL) {
192 switch (c = *ure++) {
193
194 case '\\':
195 switch (c = *ure++) {
196
197 /* escaped characters are just characters */
198 default:
199 if (cs == NIL || (*cs & STR) == 0) {
200 cs = ccre;
201 *cs = STR;
202 SCNT(cs) = 1;
203 ccre += 2;
204 } else
205 SCNT(cs)++;
206 *ccre++ = c;
207 break;
208
209 /* normal(?) metacharacters */
210 case 'a':
211 case 'd':
212 case 'e':
213 case 'p':
214 if (acs != NIL && acs != cs) {
215 do {
216 temp = OCNT(acs);
217 OCNT(acs) = ccre - acs;
218 acs -= temp;
219 } while (temp != 0);
220 acs = NIL;
221 }
222 cs = ccre;
223 *cs = META;
224 MSYM(cs) = c;
225 ccre = MNEXT(cs);
226 break;
227 }
228 break;
229
230 /* just put the symbol in */
231 case '^':
232 case '$':
233 if (acs != NIL && acs != cs) {
234 do {
235 temp = OCNT(acs);
236 OCNT(acs) = ccre - acs;
237 acs -= temp;
238 } while (temp != 0);
239 acs = NIL;
240 }
241 cs = ccre;
242 *cs = META;
243 MSYM(cs) = c;
244 ccre = MNEXT(cs);
245 break;
246
247 /* mark the last match sequence as optional */
248 case '?':
249 if (cs)
250 *cs = *cs | OPT;
251 break;
252
253 /* recurse and define a subexpression */
254 case '(':
255 if (acs != NIL && acs != cs) {
256 do {
257 temp = OCNT(acs);
258 OCNT(acs) = ccre - acs;
259 acs -= temp;
260 } while (temp != 0);
261 acs = NIL;
262 }
263 cs = ccre;
264 *cs = OPER;
265 OSYM(cs) = '(';
266 ccre = ONEXT(cs);
267 expconv();
268 OCNT(cs) = ccre - cs; /* offset to next symbol */
269 break;
270
271 /* reurn from a recursion */
272 case ')':
273 if (acs != NIL) {
274 do {
275 temp = OCNT(acs);
276 OCNT(acs) = ccre - acs;
277 acs -= temp;
278 } while (temp != 0);
279 acs = NIL;
280 }
281 cs = ccre;
282 *cs = META;
283 MSYM(cs) = c;
284 ccre = MNEXT(cs);
285 return;
286
287 /* mark the last match sequence as having an alternate */
288 /* the third byte will contain an offset to jump over the */
289 /* alternate match in case the first did not fail */
290 case '|':
291 if (acs != NIL && acs != cs)
292 OCNT(ccre) = ccre - acs; /* make a back pointer */
293 else
294 OCNT(ccre) = 0;
295 *cs |= ALT;
296 cs = ccre;
297 *cs = OPER;
298 OSYM(cs) = '|';
299 ccre = ONEXT(cs);
300 acs = cs; /* remember that the pointer is to be filles */
301 break;
302
303 /* if its not a metasymbol just build a scharacter string */
304 default:
305 if (cs == NIL || (*cs & STR) == 0) {
306 cs = ccre;
307 *cs = STR;
308 SCNT(cs) = 1;
309 ccre = SSTR(cs);
310 } else
311 SCNT(cs)++;
312 *ccre++ = c;
313 break;
314 }
315 }
316 if (acs != NIL) {
317 do {
318 temp = OCNT(acs);
319 OCNT(acs) = ccre - acs;
320 acs -= temp;
321 } while (temp != 0);
322 acs = NIL;
323 }
324 return;
325 }
326 /* end of convertre */
327
328
329 /*
330 * The following routine recognises an irregular expresion
331 * with the following special characters:
332 *
333 * \? - means last match was optional
334 * \a - matches any number of characters
335 * \d - matches any number of spaces and tabs
336 * \p - matches any number of alphanumeric
337 * characters. The
338 * characters matched will be copied into
339 * the area pointed to by 'name'.
340 * \| - alternation
341 * \( \) - grouping used mostly for alternation and
342 * optionality
343 *
344 * The irregular expression must be translated to internal form
345 * prior to calling this routine
346 *
347 * The value returned is the pointer to the first non \a
348 * character matched.
349 */
350
351 char *
352 expmatch(s, re, mstring)
353 char *s; /* string to check for a match in */
354 char *re; /* a converted irregular expression */
355 char *mstring; /* where to put whatever matches a \p */
356 {
357 char *cs; /* the current symbol */
358 char *ptr,*s1; /* temporary pointer */
359 boolean matched; /* a temporary boolean */
360
361 /* initial conditions */
362 if (re == NIL)
363 return (NIL);
364 cs = re;
365 matched = FALSE;
366
367 /* loop till expression string is exhausted (or at least pretty tired) */
368 while (*cs) {
369 switch (*cs & (OPER | STR | META)) {
370
371 /* try to match a string */
372 case STR:
373 matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
374 if (matched) {
375
376 /* hoorah it matches */
377 s += SCNT(cs);
378 cs = SNEXT(cs);
379 } else if (*cs & ALT) {
380
381 /* alternation, skip to next expression */
382 cs = SNEXT(cs);
383 } else if (*cs & OPT) {
384
385 /* the match is optional */
386 cs = SNEXT(cs);
387 matched = 1; /* indicate a successful match */
388 } else {
389
390 /* no match, error return */
391 return (NIL);
392 }
393 break;
394
395 /* an operator, do something fancy */
396 case OPER:
397 switch (OSYM(cs)) {
398
399 /* this is an alternation */
400 case '|':
401 if (matched)
402
403 /* last thing in the alternation was a match, skip ahead */
404 cs = OPTR(cs);
405 else
406
407 /* no match, keep trying */
408 cs = ONEXT(cs);
409 break;
410
411 /* this is a grouping, recurse */
412 case '(':
413 ptr = expmatch(s, ONEXT(cs), mstring);
414 if (ptr != NIL) {
415
416 /* the subexpression matched */
417 matched = 1;
418 s = ptr;
419 } else if (*cs & ALT) {
420
421 /* alternation, skip to next expression */
422 matched = 0;
423 } else if (*cs & OPT) {
424
425 /* the match is optional */
426 matched = 1; /* indicate a successful match */
427 } else {
428
429 /* no match, error return */
430 return (NIL);
431 }
432 cs = OPTR(cs);
433 break;
434 }
435 break;
436
437 /* try to match a metasymbol */
438 case META:
439 switch (MSYM(cs)) {
440
441 /* try to match anything and remember what was matched */
442 case 'p':
443 /*
444 * This is really the same as trying the match the
445 * remaining parts of the expression to any subset
446 * of the string.
447 */
448 s1 = s;
449 do {
450 ptr = expmatch(s1, MNEXT(cs), mstring);
451 if (ptr != NIL && s1 != s) {
452
453 /* we have a match, remember the match */
454 strncpy(mstring, s, s1 - s);
455 mstring[s1 - s] = '\0';
456 return (ptr);
457 } else if (ptr != NIL && (*cs & OPT)) {
458
459 /* it was aoptional so no match is ok */
460 return (ptr);
461 } else if (ptr != NIL) {
462
463 /* not optional and we still matched */
464 return (NIL);
465 }
466 if (!isalnum((unsigned char)*s1) && *s1 != '_')
467 return (NIL);
468 if (*s1 == '\\')
469 x_escaped = x_escaped ? FALSE : TRUE;
470 else
471 x_escaped = FALSE;
472 } while (*s1++);
473 return (NIL);
474
475 /* try to match anything */
476 case 'a':
477 /*
478 * This is really the same as trying the match the
479 * remaining parts of the expression to any subset
480 * of the string.
481 */
482 s1 = s;
483 do {
484 ptr = expmatch(s1, MNEXT(cs), mstring);
485 if (ptr != NIL && s1 != s) {
486
487 /* we have a match */
488 return (ptr);
489 } else if (ptr != NIL && (*cs & OPT)) {
490
491 /* it was aoptional so no match is ok */
492 return (ptr);
493 } else if (ptr != NIL) {
494
495 /* not optional and we still matched */
496 return (NIL);
497 }
498 if (*s1 == '\\')
499 x_escaped = x_escaped ? FALSE : TRUE;
500 else
501 x_escaped = FALSE;
502 } while (*s1++);
503 return (NIL);
504
505 /* fail if we are currently x_escaped */
506 case 'e':
507 if (x_escaped)
508 return(NIL);
509 cs = MNEXT(cs);
510 break;
511
512 /* match any number of tabs and spaces */
513 case 'd':
514 ptr = s;
515 while (*s == ' ' || *s == '\t')
516 s++;
517 if (s != ptr || s == x_start) {
518
519 /* match, be happy */
520 matched = 1;
521 cs = MNEXT(cs);
522 } else if (*s == '\n' || *s == '\0') {
523
524 /* match, be happy */
525 matched = 1;
526 cs = MNEXT(cs);
527 } else if (*cs & ALT) {
528
529 /* try the next part */
530 matched = 0;
531 cs = MNEXT(cs);
532 } else if (*cs & OPT) {
533
534 /* doesn't matter */
535 matched = 1;
536 cs = MNEXT(cs);
537 } else
538
539 /* no match, error return */
540 return (NIL);
541 break;
542
543 /* check for end of line */
544 case '$':
545 if (*s == '\0' || *s == '\n') {
546
547 /* match, be happy */
548 s++;
549 matched = 1;
550 cs = MNEXT(cs);
551 } else if (*cs & ALT) {
552
553 /* try the next part */
554 matched = 0;
555 cs = MNEXT(cs);
556 } else if (*cs & OPT) {
557
558 /* doesn't matter */
559 matched = 1;
560 cs = MNEXT(cs);
561 } else
562
563 /* no match, error return */
564 return (NIL);
565 break;
566
567 /* check for start of line */
568 case '^':
569 if (s == x_start) {
570
571 /* match, be happy */
572 matched = 1;
573 cs = MNEXT(cs);
574 } else if (*cs & ALT) {
575
576 /* try the next part */
577 matched = 0;
578 cs = MNEXT(cs);
579 } else if (*cs & OPT) {
580
581 /* doesn't matter */
582 matched = 1;
583 cs = MNEXT(cs);
584 } else
585
586 /* no match, error return */
587 return (NIL);
588 break;
589
590 /* end of a subexpression, return success */
591 case ')':
592 return (s);
593 }
594 break;
595 }
596 }
597 return (s);
598 }
599