Home | History | Annotate | Line # | Download | only in vgrind
regexp.c revision 1.12
      1 /*	$NetBSD: regexp.c,v 1.12 2012/03/20 20:34:59 matt Exp $	*/
      2 
      3 /*
      4  * Copyright (c) 1980, 1993
      5  *	The Regents of the University of California.  All rights reserved.
      6  *
      7  *
      8  * Redistribution and use in source and binary forms, with or without
      9  * modification, are permitted provided that the following conditions
     10  * are met:
     11  * 1. Redistributions of source code must retain the above copyright
     12  *    notice, this list of conditions and the following disclaimer.
     13  * 2. Redistributions in binary form must reproduce the above copyright
     14  *    notice, this list of conditions and the following disclaimer in the
     15  *    documentation and/or other materials provided with the distribution.
     16  * 3. Neither the name of the University nor the names of its contributors
     17  *    may be used to endorse or promote products derived from this software
     18  *    without specific prior written permission.
     19  *
     20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     30  * SUCH DAMAGE.
     31  */
     32 
     33 #include <sys/cdefs.h>
     34 #ifndef lint
     35 __COPYRIGHT("@(#) Copyright (c) 1980, 1993\
     36  The Regents of the University of California.  All rights reserved.");
     37 #endif /* not lint */
     38 
     39 #ifndef lint
     40 #if 0
     41 static char sccsid[] = "@(#)regexp.c	8.1 (Berkeley) 6/6/93";
     42 #endif
     43 __RCSID("$NetBSD: regexp.c,v 1.12 2012/03/20 20:34:59 matt Exp $");
     44 #endif /* not lint */
     45 
     46 #include <assert.h>
     47 #include <ctype.h>
     48 #include <stdlib.h>
     49 #include <string.h>
     50 #include "extern.h"
     51 
     52 #define FALSE	0
     53 #define TRUE	!(FALSE)
     54 #define NIL	0
     55 
     56 static void	expconv __P((void));
     57 
     58 boolean	 x_escaped;	/* true if we are currently x_escaped */
     59 char	*x_start;	/* start of string */
     60 boolean	 l_onecase;	/* true if upper and lower equivalent */
     61 
     62 #define makelower(c) (isupper((unsigned char)(c)) ? tolower((unsigned char)(c)) : (c))
     63 
     64 /*  STRNCMP -	like strncmp except that we convert the
     65  *	 	first string to lower case before comparing
     66  *		if l_onecase is set.
     67  */
     68 
     69 int
     70 STRNCMP(char *s1, char *s2, int len)
     71 {
     72 	if (l_onecase) {
     73 	    do
     74 		if (*s2 - makelower(*s1))
     75 			return (*s2 - makelower(*s1));
     76 		else {
     77 			s2++;
     78 			s1++;
     79 		}
     80 	    while (--len);
     81 	} else {
     82 	    do
     83 		if (*s2 - *s1)
     84 			return (*s2 - *s1);
     85 		else {
     86 			s2++;
     87 			s1++;
     88 		}
     89 	    while (--len);
     90 	}
     91 	return(0);
     92 }
     93 
     94 /*	The following routine converts an irregular expression to
     95  *	internal format.
     96  *
     97  *	Either meta symbols (\a \d or \p) or character strings or
     98  *	operations ( alternation or perenthesizing ) can be
     99  *	specified.  Each starts with a descriptor byte.  The descriptor
    100  *	byte has STR set for strings, META set for meta symbols
    101  *	and OPER set for operations.
    102  *	The descriptor byte can also have the OPT bit set if the object
    103  *	defined is optional.  Also ALT can be set to indicate an alternation.
    104  *
    105  *	For metasymbols the byte following the descriptor byte identities
    106  *	the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '(').  For
    107  *	strings the byte after the descriptor is a character count for
    108  *	the string:
    109  *
    110  *		meta symbols := descriptor
    111  *				symbol
    112  *
    113  *		strings :=	descriptor
    114  *				character count
    115  *				the string
    116  *
    117  *		operatins :=	descriptor
    118  *				symbol
    119  *				character count
    120  */
    121 
    122 /*
    123  *  handy macros for accessing parts of match blocks
    124  */
    125 #define MSYM(A) (*(A+1))	/* symbol in a meta symbol block */
    126 #define MNEXT(A) (A+2)		/* character following a metasymbol block */
    127 
    128 #define OSYM(A) (*(A+1))	/* symbol in an operation block */
    129 #define OCNT(A) (*(A+2))	/* character count */
    130 #define ONEXT(A) (A+3)		/* next character after the operation */
    131 #define OPTR(A) (A+*(A+2))	/* place pointed to by the operator */
    132 
    133 #define SCNT(A) (*(A+1))	/* byte count of a string */
    134 #define SSTR(A) (A+2)		/* address of the string */
    135 #define SNEXT(A) (A+2+*(A+1))	/* character following the string */
    136 
    137 /*
    138  *  bit flags in the descriptor
    139  */
    140 #define OPT 1
    141 #define STR 2
    142 #define META 4
    143 #define ALT 8
    144 #define OPER 16
    145 
    146 static char *ccre;	/* pointer to current position in converted exp*/
    147 static char *ure;	/* pointer current position in unconverted exp */
    148 
    149 char *
    150 convexp(char *re)	/* unconverted irregular expression */
    151 {
    152     char *cre;		/* pointer to converted regular expression */
    153 
    154     /* allocate room for the converted expression */
    155     if (re == NIL)
    156 	return (NIL);
    157     if (*re == '\0')
    158 	return (NIL);
    159     cre = malloc(4 * strlen(re) + 3);
    160     ccre = cre;
    161     ure = re;
    162 
    163     /* start the conversion with a \a */
    164     *cre = META | OPT;
    165     MSYM(cre) = 'a';
    166     ccre = MNEXT(cre);
    167 
    168     /* start the conversion (its recursive) */
    169     expconv();
    170     *ccre = 0;
    171     return (cre);
    172 }
    173 
    174 static void
    175 expconv(void)
    176 {
    177     char *cs;		/* pointer to current symbol in converted exp */
    178     char c;		/* character being processed */
    179     char *acs;		/* pinter to last alternate */
    180     int temp;
    181 
    182     /* let the conversion begin */
    183     acs = NIL;
    184     cs = NIL;
    185     while (*ure != NIL) {
    186 	switch (c = *ure++) {
    187 
    188 	case '\\':
    189 	    switch (c = *ure++) {
    190 
    191 	    /* escaped characters are just characters */
    192 	    default:
    193 		if (cs == NIL || (*cs & STR) == 0) {
    194 		    cs = ccre;
    195 		    *cs = STR;
    196 		    SCNT(cs) = 1;
    197 		    ccre += 2;
    198 		} else
    199 		    SCNT(cs)++;
    200 		*ccre++ = c;
    201 		break;
    202 
    203 	    /* normal(?) metacharacters */
    204 	    case 'a':
    205 	    case 'd':
    206 	    case 'e':
    207 	    case 'p':
    208 		if (acs != NIL && acs != cs) {
    209 		    do {
    210 			temp = OCNT(acs);
    211 			OCNT(acs) = ccre - acs;
    212 			acs -= temp;
    213 		    } while (temp != 0);
    214 		    acs = NIL;
    215 		}
    216 		cs = ccre;
    217 		*cs = META;
    218 		MSYM(cs) = c;
    219 		ccre = MNEXT(cs);
    220 		break;
    221 	    }
    222 	    break;
    223 
    224 	/* just put the symbol in */
    225 	case '^':
    226 	case '$':
    227 	    if (acs != NIL && acs != cs) {
    228 		do {
    229 		    temp = OCNT(acs);
    230 		    OCNT(acs) = ccre - acs;
    231 		    acs -= temp;
    232 		} while (temp != 0);
    233 		acs = NIL;
    234 	    }
    235 	    cs = ccre;
    236 	    *cs = META;
    237 	    MSYM(cs) = c;
    238 	    ccre = MNEXT(cs);
    239 	    break;
    240 
    241 	/* mark the last match sequence as optional */
    242 	case '?':
    243 	    if (cs)
    244 	    	*cs = *cs | OPT;
    245 	    break;
    246 
    247 	/* recurse and define a subexpression */
    248 	case '(':
    249 	    if (acs != NIL && acs != cs) {
    250 		do {
    251 		    temp = OCNT(acs);
    252 		    OCNT(acs) = ccre - acs;
    253 		    acs -= temp;
    254 		} while (temp != 0);
    255 		acs = NIL;
    256 	    }
    257 	    cs = ccre;
    258 	    *cs = OPER;
    259 	    OSYM(cs) = '(';
    260 	    ccre = ONEXT(cs);
    261 	    expconv();
    262 	    OCNT(cs) = ccre - cs;		/* offset to next symbol */
    263 	    break;
    264 
    265 	/* reurn from a recursion */
    266 	case ')':
    267 	    if (acs != NIL) {
    268 		do {
    269 		    temp = OCNT(acs);
    270 		    OCNT(acs) = ccre - acs;
    271 		    acs -= temp;
    272 		} while (temp != 0);
    273 		acs = NIL;
    274 	    }
    275 	    cs = ccre;
    276 	    *cs = META;
    277 	    MSYM(cs) = c;
    278 	    ccre = MNEXT(cs);
    279 	    return;
    280 
    281 	/* mark the last match sequence as having an alternate */
    282 	/* the third byte will contain an offset to jump over the */
    283 	/* alternate match in case the first did not fail */
    284 	case '|':
    285 	    if (acs != NIL && acs != cs)
    286 		OCNT(ccre) = ccre - acs;	/* make a back pointer */
    287 	    else
    288 		OCNT(ccre) = 0;
    289 	    assert(cs != NULL);
    290 	    *cs |= ALT;
    291 	    cs = ccre;
    292 	    *cs = OPER;
    293 	    OSYM(cs) = '|';
    294 	    ccre = ONEXT(cs);
    295 	    acs = cs;	/* remember that the pointer is to be filles */
    296 	    break;
    297 
    298 	/* if its not a metasymbol just build a scharacter string */
    299 	default:
    300 	    if (cs == NIL || (*cs & STR) == 0) {
    301 		cs = ccre;
    302 		*cs = STR;
    303 		SCNT(cs) = 1;
    304 		ccre = SSTR(cs);
    305 	    } else
    306 		SCNT(cs)++;
    307 	    *ccre++ = c;
    308 	    break;
    309 	}
    310     }
    311     if (acs != NIL) {
    312 	do {
    313 	    temp = OCNT(acs);
    314 	    OCNT(acs) = ccre - acs;
    315 	    acs -= temp;
    316 	} while (temp != 0);
    317 	acs = NIL;
    318     }
    319     return;
    320 }
    321 /* end of convertre */
    322 
    323 
    324 /*
    325  *	The following routine recognises an irregular expresion
    326  *	with the following special characters:
    327  *
    328  *		\?	-	means last match was optional
    329  *		\a	-	matches any number of characters
    330  *		\d	-	matches any number of spaces and tabs
    331  *		\p	-	matches any number of alphanumeric
    332  *				characters. The
    333  *				characters matched will be copied into
    334  *				the area pointed to by 'name'.
    335  *		\|	-	alternation
    336  *		\( \)	-	grouping used mostly for alternation and
    337  *				optionality
    338  *
    339  *	The irregular expression must be translated to internal form
    340  *	prior to calling this routine
    341  *
    342  *	The value returned is the pointer to the first non \a
    343  *	character matched.
    344  */
    345 
    346 char *
    347 expmatch(
    348     char *s,		/* string to check for a match in */
    349     char *re,		/* a converted irregular expression */
    350     char *mstring)	/* where to put whatever matches a \p */
    351 {
    352     char *cs;		/* the current symbol */
    353     char *ptr,*s1;	/* temporary pointer */
    354     boolean matched;	/* a temporary boolean */
    355 
    356     /* initial conditions */
    357     if (re == NIL)
    358 	return (NIL);
    359     cs = re;
    360     matched = FALSE;
    361 
    362     /* loop till expression string is exhausted (or at least pretty tired) */
    363     while (*cs) {
    364 	switch (*cs & (OPER | STR | META)) {
    365 
    366 	/* try to match a string */
    367 	case STR:
    368 	    matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
    369 	    if (matched) {
    370 
    371 		/* hoorah it matches */
    372 		s += SCNT(cs);
    373 		cs = SNEXT(cs);
    374 	    } else if (*cs & ALT) {
    375 
    376 		/* alternation, skip to next expression */
    377 		cs = SNEXT(cs);
    378 	    } else if (*cs & OPT) {
    379 
    380 		/* the match is optional */
    381 		cs = SNEXT(cs);
    382 		matched = 1;		/* indicate a successful match */
    383 	    } else {
    384 
    385 		/* no match, error return */
    386 		return (NIL);
    387 	    }
    388 	    break;
    389 
    390 	/* an operator, do something fancy */
    391 	case OPER:
    392 	    switch (OSYM(cs)) {
    393 
    394 	    /* this is an alternation */
    395 	    case '|':
    396 		if (matched)
    397 
    398 		    /* last thing in the alternation was a match, skip ahead */
    399 		    cs = OPTR(cs);
    400 		else
    401 
    402 		    /* no match, keep trying */
    403 		    cs = ONEXT(cs);
    404 		break;
    405 
    406 	    /* this is a grouping, recurse */
    407 	    case '(':
    408 		ptr = expmatch(s, ONEXT(cs), mstring);
    409 		if (ptr != NIL) {
    410 
    411 		    /* the subexpression matched */
    412 		    matched = 1;
    413 		    s = ptr;
    414 		} else if (*cs & ALT) {
    415 
    416 		    /* alternation, skip to next expression */
    417 		    matched = 0;
    418 		} else if (*cs & OPT) {
    419 
    420 		    /* the match is optional */
    421 		    matched = 1;	/* indicate a successful match */
    422 		} else {
    423 
    424 		    /* no match, error return */
    425 		    return (NIL);
    426 		}
    427 		cs = OPTR(cs);
    428 		break;
    429 	    }
    430 	    break;
    431 
    432 	/* try to match a metasymbol */
    433 	case META:
    434 	    switch (MSYM(cs)) {
    435 
    436 	    /* try to match anything and remember what was matched */
    437 	    case 'p':
    438 		/*
    439 		 *  This is really the same as trying the match the
    440 		 *  remaining parts of the expression to any subset
    441 		 *  of the string.
    442 		 */
    443 		s1 = s;
    444 		do {
    445 		    ptr = expmatch(s1, MNEXT(cs), mstring);
    446 		    if (ptr != NIL && s1 != s) {
    447 
    448 			/* we have a match, remember the match */
    449 			strncpy(mstring, s, s1 - s);
    450 			mstring[s1 - s] = '\0';
    451 			return (ptr);
    452 		    } else if (ptr != NIL && (*cs & OPT)) {
    453 
    454 			/* it was aoptional so no match is ok */
    455 			return (ptr);
    456 		    } else if (ptr != NIL) {
    457 
    458 			/* not optional and we still matched */
    459 			return (NIL);
    460 		    }
    461 		    if (!isalnum((unsigned char)*s1) && *s1 != '_')
    462 			return (NIL);
    463 		    if (*s1 == '\\')
    464 			x_escaped = x_escaped ? FALSE : TRUE;
    465 		    else
    466 			x_escaped = FALSE;
    467 		} while (*s1++);
    468 		return (NIL);
    469 
    470 	    /* try to match anything */
    471 	    case 'a':
    472 		/*
    473 		 *  This is really the same as trying the match the
    474 		 *  remaining parts of the expression to any subset
    475 		 *  of the string.
    476 		 */
    477 		s1 = s;
    478 		do {
    479 		    ptr = expmatch(s1, MNEXT(cs), mstring);
    480 		    if (ptr != NIL && s1 != s) {
    481 
    482 			/* we have a match */
    483 			return (ptr);
    484 		    } else if (ptr != NIL && (*cs & OPT)) {
    485 
    486 			/* it was aoptional so no match is ok */
    487 			return (ptr);
    488 		    } else if (ptr != NIL) {
    489 
    490 			/* not optional and we still matched */
    491 			return (NIL);
    492 		    }
    493 		    if (*s1 == '\\')
    494 			x_escaped = x_escaped ? FALSE : TRUE;
    495 		    else
    496 			x_escaped = FALSE;
    497 		} while (*s1++);
    498 		return (NIL);
    499 
    500 	    /* fail if we are currently x_escaped */
    501 	    case 'e':
    502 		if (x_escaped)
    503 		    return(NIL);
    504 		cs = MNEXT(cs);
    505 		break;
    506 
    507 	    /* match any number of tabs and spaces */
    508 	    case 'd':
    509 		ptr = s;
    510 		while (*s == ' ' || *s == '\t')
    511 		    s++;
    512 		if (s != ptr || s == x_start) {
    513 
    514 		    /* match, be happy */
    515 		    matched = 1;
    516 		    cs = MNEXT(cs);
    517 		} else if (*s == '\n' || *s == '\0') {
    518 
    519 		    /* match, be happy */
    520 		    matched = 1;
    521 		    cs = MNEXT(cs);
    522 		} else if (*cs & ALT) {
    523 
    524 		    /* try the next part */
    525 		    matched = 0;
    526 		    cs = MNEXT(cs);
    527 		} else if (*cs & OPT) {
    528 
    529 		    /* doesn't matter */
    530 		    matched = 1;
    531 		    cs = MNEXT(cs);
    532 		} else
    533 
    534 		    /* no match, error return */
    535 		    return (NIL);
    536 		break;
    537 
    538 	    /* check for end of line */
    539 	    case '$':
    540 		if (*s == '\0' || *s == '\n') {
    541 
    542 		    /* match, be happy */
    543 		    s++;
    544 		    matched = 1;
    545 		    cs = MNEXT(cs);
    546 		} else if (*cs & ALT) {
    547 
    548 		    /* try the next part */
    549 		    matched = 0;
    550 		    cs = MNEXT(cs);
    551 		} else if (*cs & OPT) {
    552 
    553 		    /* doesn't matter */
    554 		    matched = 1;
    555 		    cs = MNEXT(cs);
    556 		} else
    557 
    558 		    /* no match, error return */
    559 		    return (NIL);
    560 		break;
    561 
    562 	    /* check for start of line */
    563 	    case '^':
    564 		if (s == x_start) {
    565 
    566 		    /* match, be happy */
    567 		    matched = 1;
    568 		    cs = MNEXT(cs);
    569 		} else if (*cs & ALT) {
    570 
    571 		    /* try the next part */
    572 		    matched = 0;
    573 		    cs = MNEXT(cs);
    574 		} else if (*cs & OPT) {
    575 
    576 		    /* doesn't matter */
    577 		    matched = 1;
    578 		    cs = MNEXT(cs);
    579 		} else
    580 
    581 		    /* no match, error return */
    582 		    return (NIL);
    583 		break;
    584 
    585 	    /* end of a subexpression, return success */
    586 	    case ')':
    587 		return (s);
    588 	    }
    589 	    break;
    590 	}
    591     }
    592     return (s);
    593 }
    594