regexp.c revision 1.16 1 /* $NetBSD: regexp.c,v 1.16 2024/12/08 07:53:18 andvar Exp $ */
2
3 /*
4 * Copyright (c) 1980, 1993
5 * The Regents of the University of California. All rights reserved.
6 *
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. Neither the name of the University nor the names of its contributors
17 * may be used to endorse or promote products derived from this software
18 * without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 */
32
33 #if HAVE_NBTOOL_CONFIG_H
34 #include "nbtool_config.h"
35 #endif
36
37 #include <sys/cdefs.h>
38 #ifndef lint
39 __COPYRIGHT("@(#) Copyright (c) 1980, 1993\
40 The Regents of the University of California. All rights reserved.");
41 #endif /* not lint */
42
43 #ifndef lint
44 #if 0
45 static char sccsid[] = "@(#)regexp.c 8.1 (Berkeley) 6/6/93";
46 #endif
47 __RCSID("$NetBSD: regexp.c,v 1.16 2024/12/08 07:53:18 andvar Exp $");
48 #endif /* not lint */
49
50 #include <assert.h>
51 #include <ctype.h>
52 #include <stdlib.h>
53 #include <stdbool.h>
54 #include <string.h>
55 #include "extern.h"
56
57 static void expconv (void);
58
59 bool x_escaped; /* true if we are currently x_escaped */
60 char *x_start; /* start of string */
61 bool l_onecase; /* true if upper and lower equivalent */
62
63 #define makelower(c) (isupper((unsigned char)(c)) ? tolower((unsigned char)(c)) : (c))
64
65 static char
66 tocc(ptrdiff_t x) {
67 if (x & ~0xff)
68 abort();
69 return (char)x;
70 }
71
72 /* STRNCMP - like strncmp except that we convert the
73 * first string to lower case before comparing
74 * if l_onecase is set.
75 */
76
77 int
78 STRNCMP(char *s1, char *s2, int len)
79 {
80 if (l_onecase) {
81 do
82 if (*s2 - makelower(*s1))
83 return *s2 - makelower(*s1);
84 else {
85 s2++;
86 s1++;
87 }
88 while (--len);
89 } else {
90 do
91 if (*s2 - *s1)
92 return *s2 - *s1;
93 else {
94 s2++;
95 s1++;
96 }
97 while (--len);
98 }
99 return(0);
100 }
101
102 /* The following routine converts an irregular expression to
103 * internal format.
104 *
105 * Either meta symbols (\a \d or \p) or character strings or
106 * operations ( alternation or perenthesizing ) can be
107 * specified. Each starts with a descriptor byte. The descriptor
108 * byte has STR set for strings, META set for meta symbols
109 * and OPER set for operations.
110 * The descriptor byte can also have the OPT bit set if the object
111 * defined is optional. Also ALT can be set to indicate an alternation.
112 *
113 * For metasymbols the byte following the descriptor byte identities
114 * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For
115 * strings the byte after the descriptor is a character count for
116 * the string:
117 *
118 * meta symbols := descriptor
119 * symbol
120 *
121 * strings := descriptor
122 * character count
123 * the string
124 *
125 * operations := descriptor
126 * symbol
127 * character count
128 */
129
130 /*
131 * handy macros for accessing parts of match blocks
132 */
133 #define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */
134 #define MNEXT(A) (A+2) /* character following a metasymbol block */
135
136 #define OSYM(A) (*(A+1)) /* symbol in an operation block */
137 #define OCNT(A) (*(A+2)) /* character count */
138 #define ONEXT(A) (A+3) /* next character after the operation */
139 #define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */
140
141 #define SCNT(A) (*(A+1)) /* byte count of a string */
142 #define SSTR(A) (A+2) /* address of the string */
143 #define SNEXT(A) (A+2+*(A+1)) /* character following the string */
144
145 /*
146 * bit flags in the descriptor
147 */
148 #define OPT 1
149 #define STR 2
150 #define META 4
151 #define ALT 8
152 #define OPER 16
153
154 static char *ccre; /* pointer to current position in converted exp*/
155 static char *ure; /* pointer current position in unconverted exp */
156
157 char *
158 convexp(char *re) /* unconverted irregular expression */
159 {
160 char *cre; /* pointer to converted regular expression */
161
162 /* allocate room for the converted expression */
163 if (re == NULL)
164 return NULL;
165 if (*re == '\0')
166 return NULL;
167 cre = malloc(4 * strlen(re) + 3);
168 ccre = cre;
169 ure = re;
170
171 /* start the conversion with a \a */
172 *cre = META | OPT;
173 MSYM(cre) = 'a';
174 ccre = MNEXT(cre);
175
176 /* start the conversion (its recursive) */
177 expconv();
178 *ccre = 0;
179 return cre;
180 }
181
182 static void
183 expconv(void)
184 {
185 char *cs; /* pointer to current symbol in converted exp */
186 char c; /* character being processed */
187 char *acs; /* pinter to last alternate */
188 int temp;
189
190 /* let the conversion begin */
191 acs = NULL;
192 cs = NULL;
193 while (*ure) {
194 switch (c = *ure++) {
195
196 case '\\':
197 switch (c = *ure++) {
198
199 /* escaped characters are just characters */
200 default:
201 if (cs == NULL || (*cs & STR) == 0) {
202 cs = ccre;
203 *cs = STR;
204 SCNT(cs) = 1;
205 ccre += 2;
206 } else
207 SCNT(cs)++;
208 *ccre++ = c;
209 break;
210
211 /* normal(?) metacharacters */
212 case 'a':
213 case 'd':
214 case 'e':
215 case 'p':
216 if (acs != NULL && acs != cs) {
217 do {
218 temp = OCNT(acs);
219 OCNT(acs) = tocc(ccre - acs);
220 acs -= temp;
221 } while (temp != 0);
222 acs = NULL;
223 }
224 cs = ccre;
225 *cs = META;
226 MSYM(cs) = c;
227 ccre = MNEXT(cs);
228 break;
229 }
230 break;
231
232 /* just put the symbol in */
233 case '^':
234 case '$':
235 if (acs != NULL && acs != cs) {
236 do {
237 temp = OCNT(acs);
238 OCNT(acs) = tocc(ccre - acs);
239 acs -= temp;
240 } while (temp != 0);
241 acs = NULL;
242 }
243 cs = ccre;
244 *cs = META;
245 MSYM(cs) = c;
246 ccre = MNEXT(cs);
247 break;
248
249 /* mark the last match sequence as optional */
250 case '?':
251 if (cs)
252 *cs = *cs | OPT;
253 break;
254
255 /* recurse and define a subexpression */
256 case '(':
257 if (acs != NULL && acs != cs) {
258 do {
259 temp = OCNT(acs);
260 OCNT(acs) = tocc(ccre - acs);
261 acs -= temp;
262 } while (temp != 0);
263 acs = NULL;
264 }
265 cs = ccre;
266 *cs = OPER;
267 OSYM(cs) = '(';
268 ccre = ONEXT(cs);
269 expconv();
270 OCNT(cs) = tocc(ccre - cs); /* offset to next symbol */
271 break;
272
273 /* return from a recursion */
274 case ')':
275 if (acs != NULL) {
276 do {
277 temp = OCNT(acs);
278 OCNT(acs) = tocc(ccre - acs);
279 acs -= temp;
280 } while (temp != 0);
281 acs = NULL;
282 }
283 cs = ccre;
284 *cs = META;
285 MSYM(cs) = c;
286 ccre = MNEXT(cs);
287 return;
288
289 /* mark the last match sequence as having an alternate */
290 /* the third byte will contain an offset to jump over the */
291 /* alternate match in case the first did not fail */
292 case '|':
293 if (acs != NULL && acs != cs)
294 OCNT(ccre) = tocc(ccre - acs); /* make a back pointer */
295 else
296 OCNT(ccre) = 0;
297 assert(cs != NULL);
298 *cs |= ALT;
299 cs = ccre;
300 *cs = OPER;
301 OSYM(cs) = '|';
302 ccre = ONEXT(cs);
303 acs = cs; /* remember that the pointer is to be filles */
304 break;
305
306 /* if its not a metasymbol just build a scharacter string */
307 default:
308 if (cs == NULL || (*cs & STR) == 0) {
309 cs = ccre;
310 *cs = STR;
311 SCNT(cs) = 1;
312 ccre = SSTR(cs);
313 } else
314 SCNT(cs)++;
315 *ccre++ = c;
316 break;
317 }
318 }
319 if (acs != NULL) {
320 do {
321 temp = OCNT(acs);
322 OCNT(acs) = tocc(ccre - acs);
323 acs -= temp;
324 } while (temp != 0);
325 acs = NULL;
326 }
327 return;
328 }
329 /* end of convertre */
330
331
332 /*
333 * The following routine recognises an irregular expression
334 * with the following special characters:
335 *
336 * \? - means last match was optional
337 * \a - matches any number of characters
338 * \d - matches any number of spaces and tabs
339 * \p - matches any number of alphanumeric
340 * characters. The
341 * characters matched will be copied into
342 * the area pointed to by 'name'.
343 * \| - alternation
344 * \( \) - grouping used mostly for alternation and
345 * optionality
346 *
347 * The irregular expression must be translated to internal form
348 * prior to calling this routine
349 *
350 * The value returned is the pointer to the first non \a
351 * character matched.
352 */
353
354 char *
355 expmatch(
356 char *s, /* string to check for a match in */
357 char *re, /* a converted irregular expression */
358 char *mstring) /* where to put whatever matches a \p */
359 {
360 char *cs; /* the current symbol */
361 char *ptr,*s1; /* temporary pointer */
362 bool matched; /* a temporary bool */
363
364 /* initial conditions */
365 if (re == NULL)
366 return NULL;
367 cs = re;
368 matched = false;
369
370 /* loop till expression string is exhausted (or at least pretty tired) */
371 while (*cs) {
372 switch (*cs & (OPER | STR | META)) {
373
374 /* try to match a string */
375 case STR:
376 matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
377 if (matched) {
378
379 /* hoorah it matches */
380 s += SCNT(cs);
381 cs = SNEXT(cs);
382 } else if (*cs & ALT) {
383
384 /* alternation, skip to next expression */
385 cs = SNEXT(cs);
386 } else if (*cs & OPT) {
387
388 /* the match is optional */
389 cs = SNEXT(cs);
390 matched = 1; /* indicate a successful match */
391 } else {
392
393 /* no match, error return */
394 return NULL;
395 }
396 break;
397
398 /* an operator, do something fancy */
399 case OPER:
400 switch (OSYM(cs)) {
401
402 /* this is an alternation */
403 case '|':
404 if (matched)
405
406 /* last thing in the alternation was a match, skip ahead */
407 cs = OPTR(cs);
408 else
409
410 /* no match, keep trying */
411 cs = ONEXT(cs);
412 break;
413
414 /* this is a grouping, recurse */
415 case '(':
416 ptr = expmatch(s, ONEXT(cs), mstring);
417 if (ptr != NULL) {
418
419 /* the subexpression matched */
420 matched = 1;
421 s = ptr;
422 } else if (*cs & ALT) {
423
424 /* alternation, skip to next expression */
425 matched = 0;
426 } else if (*cs & OPT) {
427
428 /* the match is optional */
429 matched = 1; /* indicate a successful match */
430 } else {
431
432 /* no match, error return */
433 return NULL;
434 }
435 cs = OPTR(cs);
436 break;
437 }
438 break;
439
440 /* try to match a metasymbol */
441 case META:
442 switch (MSYM(cs)) {
443
444 /* try to match anything and remember what was matched */
445 case 'p':
446 /*
447 * This is really the same as trying the match the
448 * remaining parts of the expression to any subset
449 * of the string.
450 */
451 s1 = s;
452 do {
453 ptr = expmatch(s1, MNEXT(cs), mstring);
454 if (ptr != NULL && s1 != s) {
455
456 /* we have a match, remember the match */
457 strncpy(mstring, s, (size_t)(s1 - s));
458 mstring[s1 - s] = '\0';
459 return ptr;
460 } else if (ptr != NULL && (*cs & OPT)) {
461
462 /* it was aoptional so no match is ok */
463 return ptr;
464 } else if (ptr != NULL) {
465
466 /* not optional and we still matched */
467 return NULL;
468 }
469 if (!isalnum((unsigned char)*s1) && *s1 != '_')
470 return NULL;
471 if (*s1 == '\\')
472 x_escaped = x_escaped ? false : true;
473 else
474 x_escaped = false;
475 } while (*s1++);
476 return NULL;
477
478 /* try to match anything */
479 case 'a':
480 /*
481 * This is really the same as trying the match the
482 * remaining parts of the expression to any subset
483 * of the string.
484 */
485 s1 = s;
486 do {
487 ptr = expmatch(s1, MNEXT(cs), mstring);
488 if (ptr != NULL && s1 != s) {
489
490 /* we have a match */
491 return ptr;
492 } else if (ptr != NULL && (*cs & OPT)) {
493
494 /* it was aoptional so no match is ok */
495 return ptr;
496 } else if (ptr != NULL) {
497
498 /* not optional and we still matched */
499 return NULL;
500 }
501 if (*s1 == '\\')
502 x_escaped = x_escaped ? false : true;
503 else
504 x_escaped = false;
505 } while (*s1++);
506 return NULL;
507
508 /* fail if we are currently x_escaped */
509 case 'e':
510 if (x_escaped)
511 return NULL;
512 cs = MNEXT(cs);
513 break;
514
515 /* match any number of tabs and spaces */
516 case 'd':
517 ptr = s;
518 while (*s == ' ' || *s == '\t')
519 s++;
520 if (s != ptr || s == x_start) {
521
522 /* match, be happy */
523 matched = 1;
524 cs = MNEXT(cs);
525 } else if (*s == '\n' || *s == '\0') {
526
527 /* match, be happy */
528 matched = 1;
529 cs = MNEXT(cs);
530 } else if (*cs & ALT) {
531
532 /* try the next part */
533 matched = 0;
534 cs = MNEXT(cs);
535 } else if (*cs & OPT) {
536
537 /* doesn't matter */
538 matched = 1;
539 cs = MNEXT(cs);
540 } else
541
542 /* no match, error return */
543 return NULL;
544 break;
545
546 /* check for end of line */
547 case '$':
548 if (*s == '\0' || *s == '\n') {
549
550 /* match, be happy */
551 s++;
552 matched = 1;
553 cs = MNEXT(cs);
554 } else if (*cs & ALT) {
555
556 /* try the next part */
557 matched = 0;
558 cs = MNEXT(cs);
559 } else if (*cs & OPT) {
560
561 /* doesn't matter */
562 matched = 1;
563 cs = MNEXT(cs);
564 } else
565
566 /* no match, error return */
567 return NULL;
568 break;
569
570 /* check for start of line */
571 case '^':
572 if (s == x_start) {
573
574 /* match, be happy */
575 matched = 1;
576 cs = MNEXT(cs);
577 } else if (*cs & ALT) {
578
579 /* try the next part */
580 matched = 0;
581 cs = MNEXT(cs);
582 } else if (*cs & OPT) {
583
584 /* doesn't matter */
585 matched = 1;
586 cs = MNEXT(cs);
587 } else
588
589 /* no match, error return */
590 return NULL;
591 break;
592
593 /* end of a subexpression, return success */
594 case ')':
595 return s;
596 }
597 break;
598 }
599 }
600 return s;
601 }
602