Home | History | Annotate | Line # | Download | only in vgrind
      1 /*	$NetBSD: regexp.c,v 1.17 2025/08/10 19:49:39 andvar Exp $	*/
      2 
      3 /*
      4  * Copyright (c) 1980, 1993
      5  *	The Regents of the University of California.  All rights reserved.
      6  *
      7  *
      8  * Redistribution and use in source and binary forms, with or without
      9  * modification, are permitted provided that the following conditions
     10  * are met:
     11  * 1. Redistributions of source code must retain the above copyright
     12  *    notice, this list of conditions and the following disclaimer.
     13  * 2. Redistributions in binary form must reproduce the above copyright
     14  *    notice, this list of conditions and the following disclaimer in the
     15  *    documentation and/or other materials provided with the distribution.
     16  * 3. Neither the name of the University nor the names of its contributors
     17  *    may be used to endorse or promote products derived from this software
     18  *    without specific prior written permission.
     19  *
     20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     30  * SUCH DAMAGE.
     31  */
     32 
     33 #if HAVE_NBTOOL_CONFIG_H
     34 #include "nbtool_config.h"
     35 #endif
     36 
     37 #include <sys/cdefs.h>
     38 #ifndef lint
     39 __COPYRIGHT("@(#) Copyright (c) 1980, 1993\
     40  The Regents of the University of California.  All rights reserved.");
     41 #endif /* not lint */
     42 
     43 #ifndef lint
     44 #if 0
     45 static char sccsid[] = "@(#)regexp.c	8.1 (Berkeley) 6/6/93";
     46 #endif
     47 __RCSID("$NetBSD: regexp.c,v 1.17 2025/08/10 19:49:39 andvar Exp $");
     48 #endif /* not lint */
     49 
     50 #include <assert.h>
     51 #include <ctype.h>
     52 #include <stdlib.h>
     53 #include <stdbool.h>
     54 #include <string.h>
     55 #include "extern.h"
     56 
     57 static void	expconv (void);
     58 
     59 bool	 x_escaped;	/* true if we are currently x_escaped */
     60 char	*x_start;	/* start of string */
     61 bool	 l_onecase;	/* true if upper and lower equivalent */
     62 
     63 #define makelower(c) (isupper((unsigned char)(c)) ? tolower((unsigned char)(c)) : (c))
     64 
     65 static char
     66 tocc(ptrdiff_t x) {
     67 	if (x & ~0xff)
     68 		abort();
     69 	return (char)x;
     70 }
     71 
     72 /*  STRNCMP -	like strncmp except that we convert the
     73  *	 	first string to lower case before comparing
     74  *		if l_onecase is set.
     75  */
     76 
     77 int
     78 STRNCMP(char *s1, char *s2, int len)
     79 {
     80 	if (l_onecase) {
     81 	    do
     82 		if (*s2 - makelower(*s1))
     83 			return *s2 - makelower(*s1);
     84 		else {
     85 			s2++;
     86 			s1++;
     87 		}
     88 	    while (--len);
     89 	} else {
     90 	    do
     91 		if (*s2 - *s1)
     92 			return *s2 - *s1;
     93 		else {
     94 			s2++;
     95 			s1++;
     96 		}
     97 	    while (--len);
     98 	}
     99 	return(0);
    100 }
    101 
    102 /*	The following routine converts an irregular expression to
    103  *	internal format.
    104  *
    105  *	Either meta symbols (\a \d or \p) or character strings or
    106  *	operations ( alternation or perenthesizing ) can be
    107  *	specified.  Each starts with a descriptor byte.  The descriptor
    108  *	byte has STR set for strings, META set for meta symbols
    109  *	and OPER set for operations.
    110  *	The descriptor byte can also have the OPT bit set if the object
    111  *	defined is optional.  Also ALT can be set to indicate an alternation.
    112  *
    113  *	For metasymbols the byte following the descriptor byte identities
    114  *	the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '(').  For
    115  *	strings the byte after the descriptor is a character count for
    116  *	the string:
    117  *
    118  *		meta symbols := descriptor
    119  *				symbol
    120  *
    121  *		strings :=	descriptor
    122  *				character count
    123  *				the string
    124  *
    125  *		operations :=	descriptor
    126  *				symbol
    127  *				character count
    128  */
    129 
    130 /*
    131  *  handy macros for accessing parts of match blocks
    132  */
    133 #define MSYM(A) (*(A+1))	/* symbol in a meta symbol block */
    134 #define MNEXT(A) (A+2)		/* character following a metasymbol block */
    135 
    136 #define OSYM(A) (*(A+1))	/* symbol in an operation block */
    137 #define OCNT(A) (*(A+2))	/* character count */
    138 #define ONEXT(A) (A+3)		/* next character after the operation */
    139 #define OPTR(A) (A+*(A+2))	/* place pointed to by the operator */
    140 
    141 #define SCNT(A) (*(A+1))	/* byte count of a string */
    142 #define SSTR(A) (A+2)		/* address of the string */
    143 #define SNEXT(A) (A+2+*(A+1))	/* character following the string */
    144 
    145 /*
    146  *  bit flags in the descriptor
    147  */
    148 #define OPT 1
    149 #define STR 2
    150 #define META 4
    151 #define ALT 8
    152 #define OPER 16
    153 
    154 static char *ccre;	/* pointer to current position in converted exp*/
    155 static char *ure;	/* pointer current position in unconverted exp */
    156 
    157 char *
    158 convexp(char *re)	/* unconverted irregular expression */
    159 {
    160     char *cre;		/* pointer to converted regular expression */
    161 
    162     /* allocate room for the converted expression */
    163     if (re == NULL)
    164 	return NULL;
    165     if (*re == '\0')
    166 	return NULL;
    167     cre = malloc(4 * strlen(re) + 3);
    168     ccre = cre;
    169     ure = re;
    170 
    171     /* start the conversion with a \a */
    172     *cre = META | OPT;
    173     MSYM(cre) = 'a';
    174     ccre = MNEXT(cre);
    175 
    176     /* start the conversion (its recursive) */
    177     expconv();
    178     *ccre = 0;
    179     return cre;
    180 }
    181 
    182 static void
    183 expconv(void)
    184 {
    185     char *cs;		/* pointer to current symbol in converted exp */
    186     char c;		/* character being processed */
    187     char *acs;		/* pointer to last alternate */
    188     int temp;
    189 
    190     /* let the conversion begin */
    191     acs = NULL;
    192     cs = NULL;
    193     while (*ure) {
    194 	switch (c = *ure++) {
    195 
    196 	case '\\':
    197 	    switch (c = *ure++) {
    198 
    199 	    /* escaped characters are just characters */
    200 	    default:
    201 		if (cs == NULL || (*cs & STR) == 0) {
    202 		    cs = ccre;
    203 		    *cs = STR;
    204 		    SCNT(cs) = 1;
    205 		    ccre += 2;
    206 		} else
    207 		    SCNT(cs)++;
    208 		*ccre++ = c;
    209 		break;
    210 
    211 	    /* normal(?) metacharacters */
    212 	    case 'a':
    213 	    case 'd':
    214 	    case 'e':
    215 	    case 'p':
    216 		if (acs != NULL && acs != cs) {
    217 		    do {
    218 			temp = OCNT(acs);
    219 			OCNT(acs) = tocc(ccre - acs);
    220 			acs -= temp;
    221 		    } while (temp != 0);
    222 		    acs = NULL;
    223 		}
    224 		cs = ccre;
    225 		*cs = META;
    226 		MSYM(cs) = c;
    227 		ccre = MNEXT(cs);
    228 		break;
    229 	    }
    230 	    break;
    231 
    232 	/* just put the symbol in */
    233 	case '^':
    234 	case '$':
    235 	    if (acs != NULL && acs != cs) {
    236 		do {
    237 		    temp = OCNT(acs);
    238 		    OCNT(acs) = tocc(ccre - acs);
    239 		    acs -= temp;
    240 		} while (temp != 0);
    241 		acs = NULL;
    242 	    }
    243 	    cs = ccre;
    244 	    *cs = META;
    245 	    MSYM(cs) = c;
    246 	    ccre = MNEXT(cs);
    247 	    break;
    248 
    249 	/* mark the last match sequence as optional */
    250 	case '?':
    251 	    if (cs)
    252 	    	*cs = *cs | OPT;
    253 	    break;
    254 
    255 	/* recurse and define a subexpression */
    256 	case '(':
    257 	    if (acs != NULL && acs != cs) {
    258 		do {
    259 		    temp = OCNT(acs);
    260 		    OCNT(acs) = tocc(ccre - acs);
    261 		    acs -= temp;
    262 		} while (temp != 0);
    263 		acs = NULL;
    264 	    }
    265 	    cs = ccre;
    266 	    *cs = OPER;
    267 	    OSYM(cs) = '(';
    268 	    ccre = ONEXT(cs);
    269 	    expconv();
    270 	    OCNT(cs) = tocc(ccre - cs);		/* offset to next symbol */
    271 	    break;
    272 
    273 	/* return from a recursion */
    274 	case ')':
    275 	    if (acs != NULL) {
    276 		do {
    277 		    temp = OCNT(acs);
    278 		    OCNT(acs) = tocc(ccre - acs);
    279 		    acs -= temp;
    280 		} while (temp != 0);
    281 		acs = NULL;
    282 	    }
    283 	    cs = ccre;
    284 	    *cs = META;
    285 	    MSYM(cs) = c;
    286 	    ccre = MNEXT(cs);
    287 	    return;
    288 
    289 	/* mark the last match sequence as having an alternate */
    290 	/* the third byte will contain an offset to jump over the */
    291 	/* alternate match in case the first did not fail */
    292 	case '|':
    293 	    if (acs != NULL && acs != cs)
    294 		OCNT(ccre) = tocc(ccre - acs);	/* make a back pointer */
    295 	    else
    296 		OCNT(ccre) = 0;
    297 	    assert(cs != NULL);
    298 	    *cs |= ALT;
    299 	    cs = ccre;
    300 	    *cs = OPER;
    301 	    OSYM(cs) = '|';
    302 	    ccre = ONEXT(cs);
    303 	    acs = cs;	/* remember that the pointer is to be filles */
    304 	    break;
    305 
    306 	/* if its not a metasymbol just build a scharacter string */
    307 	default:
    308 	    if (cs == NULL || (*cs & STR) == 0) {
    309 		cs = ccre;
    310 		*cs = STR;
    311 		SCNT(cs) = 1;
    312 		ccre = SSTR(cs);
    313 	    } else
    314 		SCNT(cs)++;
    315 	    *ccre++ = c;
    316 	    break;
    317 	}
    318     }
    319     if (acs != NULL) {
    320 	do {
    321 	    temp = OCNT(acs);
    322 	    OCNT(acs) = tocc(ccre - acs);
    323 	    acs -= temp;
    324 	} while (temp != 0);
    325 	acs = NULL;
    326     }
    327     return;
    328 }
    329 /* end of convertre */
    330 
    331 
    332 /*
    333  *	The following routine recognises an irregular expression
    334  *	with the following special characters:
    335  *
    336  *		\?	-	means last match was optional
    337  *		\a	-	matches any number of characters
    338  *		\d	-	matches any number of spaces and tabs
    339  *		\p	-	matches any number of alphanumeric
    340  *				characters. The
    341  *				characters matched will be copied into
    342  *				the area pointed to by 'name'.
    343  *		\|	-	alternation
    344  *		\( \)	-	grouping used mostly for alternation and
    345  *				optionality
    346  *
    347  *	The irregular expression must be translated to internal form
    348  *	prior to calling this routine
    349  *
    350  *	The value returned is the pointer to the first non \a
    351  *	character matched.
    352  */
    353 
    354 char *
    355 expmatch(
    356     char *s,		/* string to check for a match in */
    357     char *re,		/* a converted irregular expression */
    358     char *mstring)	/* where to put whatever matches a \p */
    359 {
    360     char *cs;		/* the current symbol */
    361     char *ptr,*s1;	/* temporary pointer */
    362     bool matched;	/* a temporary bool */
    363 
    364     /* initial conditions */
    365     if (re == NULL)
    366 	return NULL;
    367     cs = re;
    368     matched = false;
    369 
    370     /* loop till expression string is exhausted (or at least pretty tired) */
    371     while (*cs) {
    372 	switch (*cs & (OPER | STR | META)) {
    373 
    374 	/* try to match a string */
    375 	case STR:
    376 	    matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
    377 	    if (matched) {
    378 
    379 		/* hoorah it matches */
    380 		s += SCNT(cs);
    381 		cs = SNEXT(cs);
    382 	    } else if (*cs & ALT) {
    383 
    384 		/* alternation, skip to next expression */
    385 		cs = SNEXT(cs);
    386 	    } else if (*cs & OPT) {
    387 
    388 		/* the match is optional */
    389 		cs = SNEXT(cs);
    390 		matched = 1;		/* indicate a successful match */
    391 	    } else {
    392 
    393 		/* no match, error return */
    394 		return NULL;
    395 	    }
    396 	    break;
    397 
    398 	/* an operator, do something fancy */
    399 	case OPER:
    400 	    switch (OSYM(cs)) {
    401 
    402 	    /* this is an alternation */
    403 	    case '|':
    404 		if (matched)
    405 
    406 		    /* last thing in the alternation was a match, skip ahead */
    407 		    cs = OPTR(cs);
    408 		else
    409 
    410 		    /* no match, keep trying */
    411 		    cs = ONEXT(cs);
    412 		break;
    413 
    414 	    /* this is a grouping, recurse */
    415 	    case '(':
    416 		ptr = expmatch(s, ONEXT(cs), mstring);
    417 		if (ptr != NULL) {
    418 
    419 		    /* the subexpression matched */
    420 		    matched = 1;
    421 		    s = ptr;
    422 		} else if (*cs & ALT) {
    423 
    424 		    /* alternation, skip to next expression */
    425 		    matched = 0;
    426 		} else if (*cs & OPT) {
    427 
    428 		    /* the match is optional */
    429 		    matched = 1;	/* indicate a successful match */
    430 		} else {
    431 
    432 		    /* no match, error return */
    433 		    return NULL;
    434 		}
    435 		cs = OPTR(cs);
    436 		break;
    437 	    }
    438 	    break;
    439 
    440 	/* try to match a metasymbol */
    441 	case META:
    442 	    switch (MSYM(cs)) {
    443 
    444 	    /* try to match anything and remember what was matched */
    445 	    case 'p':
    446 		/*
    447 		 *  This is really the same as trying the match the
    448 		 *  remaining parts of the expression to any subset
    449 		 *  of the string.
    450 		 */
    451 		s1 = s;
    452 		do {
    453 		    ptr = expmatch(s1, MNEXT(cs), mstring);
    454 		    if (ptr != NULL && s1 != s) {
    455 
    456 			/* we have a match, remember the match */
    457 			strncpy(mstring, s, (size_t)(s1 - s));
    458 			mstring[s1 - s] = '\0';
    459 			return ptr;
    460 		    } else if (ptr != NULL && (*cs & OPT)) {
    461 
    462 			/* it was aoptional so no match is ok */
    463 			return ptr;
    464 		    } else if (ptr != NULL) {
    465 
    466 			/* not optional and we still matched */
    467 			return NULL;
    468 		    }
    469 		    if (!isalnum((unsigned char)*s1) && *s1 != '_')
    470 			return NULL;
    471 		    if (*s1 == '\\')
    472 			x_escaped = x_escaped ? false : true;
    473 		    else
    474 			x_escaped = false;
    475 		} while (*s1++);
    476 		return NULL;
    477 
    478 	    /* try to match anything */
    479 	    case 'a':
    480 		/*
    481 		 *  This is really the same as trying the match the
    482 		 *  remaining parts of the expression to any subset
    483 		 *  of the string.
    484 		 */
    485 		s1 = s;
    486 		do {
    487 		    ptr = expmatch(s1, MNEXT(cs), mstring);
    488 		    if (ptr != NULL && s1 != s) {
    489 
    490 			/* we have a match */
    491 			return ptr;
    492 		    } else if (ptr != NULL && (*cs & OPT)) {
    493 
    494 			/* it was aoptional so no match is ok */
    495 			return ptr;
    496 		    } else if (ptr != NULL) {
    497 
    498 			/* not optional and we still matched */
    499 			return NULL;
    500 		    }
    501 		    if (*s1 == '\\')
    502 			x_escaped = x_escaped ? false : true;
    503 		    else
    504 			x_escaped = false;
    505 		} while (*s1++);
    506 		return NULL;
    507 
    508 	    /* fail if we are currently x_escaped */
    509 	    case 'e':
    510 		if (x_escaped)
    511 		    return NULL;
    512 		cs = MNEXT(cs);
    513 		break;
    514 
    515 	    /* match any number of tabs and spaces */
    516 	    case 'd':
    517 		ptr = s;
    518 		while (*s == ' ' || *s == '\t')
    519 		    s++;
    520 		if (s != ptr || s == x_start) {
    521 
    522 		    /* match, be happy */
    523 		    matched = 1;
    524 		    cs = MNEXT(cs);
    525 		} else if (*s == '\n' || *s == '\0') {
    526 
    527 		    /* match, be happy */
    528 		    matched = 1;
    529 		    cs = MNEXT(cs);
    530 		} else if (*cs & ALT) {
    531 
    532 		    /* try the next part */
    533 		    matched = 0;
    534 		    cs = MNEXT(cs);
    535 		} else if (*cs & OPT) {
    536 
    537 		    /* doesn't matter */
    538 		    matched = 1;
    539 		    cs = MNEXT(cs);
    540 		} else
    541 
    542 		    /* no match, error return */
    543 		    return NULL;
    544 		break;
    545 
    546 	    /* check for end of line */
    547 	    case '$':
    548 		if (*s == '\0' || *s == '\n') {
    549 
    550 		    /* match, be happy */
    551 		    s++;
    552 		    matched = 1;
    553 		    cs = MNEXT(cs);
    554 		} else if (*cs & ALT) {
    555 
    556 		    /* try the next part */
    557 		    matched = 0;
    558 		    cs = MNEXT(cs);
    559 		} else if (*cs & OPT) {
    560 
    561 		    /* doesn't matter */
    562 		    matched = 1;
    563 		    cs = MNEXT(cs);
    564 		} else
    565 
    566 		    /* no match, error return */
    567 		    return NULL;
    568 		break;
    569 
    570 	    /* check for start of line */
    571 	    case '^':
    572 		if (s == x_start) {
    573 
    574 		    /* match, be happy */
    575 		    matched = 1;
    576 		    cs = MNEXT(cs);
    577 		} else if (*cs & ALT) {
    578 
    579 		    /* try the next part */
    580 		    matched = 0;
    581 		    cs = MNEXT(cs);
    582 		} else if (*cs & OPT) {
    583 
    584 		    /* doesn't matter */
    585 		    matched = 1;
    586 		    cs = MNEXT(cs);
    587 		} else
    588 
    589 		    /* no match, error return */
    590 		    return NULL;
    591 		break;
    592 
    593 	    /* end of a subexpression, return success */
    594 	    case ')':
    595 		return s;
    596 	    }
    597 	    break;
    598 	}
    599     }
    600     return s;
    601 }
    602