Home | History | Annotate | Line # | Download | only in vgrind
regexp.c revision 1.10
      1 /*	$NetBSD: regexp.c,v 1.10 2006/05/01 05:14:22 christos Exp $	*/
      2 
      3 /*
      4  * Copyright (c) 1980, 1993
      5  *	The Regents of the University of California.  All rights reserved.
      6  *
      7  *
      8  * Redistribution and use in source and binary forms, with or without
      9  * modification, are permitted provided that the following conditions
     10  * are met:
     11  * 1. Redistributions of source code must retain the above copyright
     12  *    notice, this list of conditions and the following disclaimer.
     13  * 2. Redistributions in binary form must reproduce the above copyright
     14  *    notice, this list of conditions and the following disclaimer in the
     15  *    documentation and/or other materials provided with the distribution.
     16  * 3. Neither the name of the University nor the names of its contributors
     17  *    may be used to endorse or promote products derived from this software
     18  *    without specific prior written permission.
     19  *
     20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     30  * SUCH DAMAGE.
     31  */
     32 
     33 #include <sys/cdefs.h>
     34 #ifndef lint
     35 __COPYRIGHT("@(#) Copyright (c) 1980, 1993\n\
     36 	The Regents of the University of California.  All rights reserved.\n");
     37 #endif /* not lint */
     38 
     39 #ifndef lint
     40 #if 0
     41 static char sccsid[] = "@(#)regexp.c	8.1 (Berkeley) 6/6/93";
     42 #endif
     43 __RCSID("$NetBSD: regexp.c,v 1.10 2006/05/01 05:14:22 christos Exp $");
     44 #endif /* not lint */
     45 
     46 #include <assert.h>
     47 #include <ctype.h>
     48 #include <stdlib.h>
     49 #include <string.h>
     50 #include "extern.h"
     51 
     52 #define FALSE	0
     53 #define TRUE	!(FALSE)
     54 #define NIL	0
     55 
     56 static void	expconv __P((void));
     57 
     58 boolean	 x_escaped;	/* true if we are currently x_escaped */
     59 char	*x_start;	/* start of string */
     60 boolean	 l_onecase;	/* true if upper and lower equivalent */
     61 
     62 #define makelower(c) (isupper((unsigned char)(c)) ? tolower((unsigned char)(c)) : (c))
     63 
     64 /*  STRNCMP -	like strncmp except that we convert the
     65  *	 	first string to lower case before comparing
     66  *		if l_onecase is set.
     67  */
     68 
     69 int
     70 STRNCMP(s1, s2, len)
     71 	char *s1,*s2;
     72 	int len;
     73 {
     74 	if (l_onecase) {
     75 	    do
     76 		if (*s2 - makelower(*s1))
     77 			return (*s2 - makelower(*s1));
     78 		else {
     79 			s2++;
     80 			s1++;
     81 		}
     82 	    while (--len);
     83 	} else {
     84 	    do
     85 		if (*s2 - *s1)
     86 			return (*s2 - *s1);
     87 		else {
     88 			s2++;
     89 			s1++;
     90 		}
     91 	    while (--len);
     92 	}
     93 	return(0);
     94 }
     95 
     96 /*	The following routine converts an irregular expression to
     97  *	internal format.
     98  *
     99  *	Either meta symbols (\a \d or \p) or character strings or
    100  *	operations ( alternation or perenthesizing ) can be
    101  *	specified.  Each starts with a descriptor byte.  The descriptor
    102  *	byte has STR set for strings, META set for meta symbols
    103  *	and OPER set for operations.
    104  *	The descriptor byte can also have the OPT bit set if the object
    105  *	defined is optional.  Also ALT can be set to indicate an alternation.
    106  *
    107  *	For metasymbols the byte following the descriptor byte identities
    108  *	the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '(').  For
    109  *	strings the byte after the descriptor is a character count for
    110  *	the string:
    111  *
    112  *		meta symbols := descriptor
    113  *				symbol
    114  *
    115  *		strings :=	descriptor
    116  *				character count
    117  *				the string
    118  *
    119  *		operatins :=	descriptor
    120  *				symbol
    121  *				character count
    122  */
    123 
    124 /*
    125  *  handy macros for accessing parts of match blocks
    126  */
    127 #define MSYM(A) (*(A+1))	/* symbol in a meta symbol block */
    128 #define MNEXT(A) (A+2)		/* character following a metasymbol block */
    129 
    130 #define OSYM(A) (*(A+1))	/* symbol in an operation block */
    131 #define OCNT(A) (*(A+2))	/* character count */
    132 #define ONEXT(A) (A+3)		/* next character after the operation */
    133 #define OPTR(A) (A+*(A+2))	/* place pointed to by the operator */
    134 
    135 #define SCNT(A) (*(A+1))	/* byte count of a string */
    136 #define SSTR(A) (A+2)		/* address of the string */
    137 #define SNEXT(A) (A+2+*(A+1))	/* character following the string */
    138 
    139 /*
    140  *  bit flags in the descriptor
    141  */
    142 #define OPT 1
    143 #define STR 2
    144 #define META 4
    145 #define ALT 8
    146 #define OPER 16
    147 
    148 static char *ccre;	/* pointer to current position in converted exp*/
    149 static char *ure;	/* pointer current position in unconverted exp */
    150 
    151 char *
    152 convexp(re)
    153     char *re;		/* unconverted irregular expression */
    154 {
    155     char *cre;		/* pointer to converted regular expression */
    156 
    157     /* allocate room for the converted expression */
    158     if (re == NIL)
    159 	return (NIL);
    160     if (*re == '\0')
    161 	return (NIL);
    162     cre = malloc(4 * strlen(re) + 3);
    163     ccre = cre;
    164     ure = re;
    165 
    166     /* start the conversion with a \a */
    167     *cre = META | OPT;
    168     MSYM(cre) = 'a';
    169     ccre = MNEXT(cre);
    170 
    171     /* start the conversion (its recursive) */
    172     expconv();
    173     *ccre = 0;
    174     return (cre);
    175 }
    176 
    177 static void
    178 expconv()
    179 {
    180     char *cs;		/* pointer to current symbol in converted exp */
    181     char c;		/* character being processed */
    182     char *acs;		/* pinter to last alternate */
    183     int temp;
    184 
    185     /* let the conversion begin */
    186     acs = NIL;
    187     cs = NIL;
    188     while (*ure != NIL) {
    189 	switch (c = *ure++) {
    190 
    191 	case '\\':
    192 	    switch (c = *ure++) {
    193 
    194 	    /* escaped characters are just characters */
    195 	    default:
    196 		if (cs == NIL || (*cs & STR) == 0) {
    197 		    cs = ccre;
    198 		    *cs = STR;
    199 		    SCNT(cs) = 1;
    200 		    ccre += 2;
    201 		} else
    202 		    SCNT(cs)++;
    203 		*ccre++ = c;
    204 		break;
    205 
    206 	    /* normal(?) metacharacters */
    207 	    case 'a':
    208 	    case 'd':
    209 	    case 'e':
    210 	    case 'p':
    211 		if (acs != NIL && acs != cs) {
    212 		    do {
    213 			temp = OCNT(acs);
    214 			OCNT(acs) = ccre - acs;
    215 			acs -= temp;
    216 		    } while (temp != 0);
    217 		    acs = NIL;
    218 		}
    219 		cs = ccre;
    220 		*cs = META;
    221 		MSYM(cs) = c;
    222 		ccre = MNEXT(cs);
    223 		break;
    224 	    }
    225 	    break;
    226 
    227 	/* just put the symbol in */
    228 	case '^':
    229 	case '$':
    230 	    if (acs != NIL && acs != cs) {
    231 		do {
    232 		    temp = OCNT(acs);
    233 		    OCNT(acs) = ccre - acs;
    234 		    acs -= temp;
    235 		} while (temp != 0);
    236 		acs = NIL;
    237 	    }
    238 	    cs = ccre;
    239 	    *cs = META;
    240 	    MSYM(cs) = c;
    241 	    ccre = MNEXT(cs);
    242 	    break;
    243 
    244 	/* mark the last match sequence as optional */
    245 	case '?':
    246 	    if (cs)
    247 	    	*cs = *cs | OPT;
    248 	    break;
    249 
    250 	/* recurse and define a subexpression */
    251 	case '(':
    252 	    if (acs != NIL && acs != cs) {
    253 		do {
    254 		    temp = OCNT(acs);
    255 		    OCNT(acs) = ccre - acs;
    256 		    acs -= temp;
    257 		} while (temp != 0);
    258 		acs = NIL;
    259 	    }
    260 	    cs = ccre;
    261 	    *cs = OPER;
    262 	    OSYM(cs) = '(';
    263 	    ccre = ONEXT(cs);
    264 	    expconv();
    265 	    OCNT(cs) = ccre - cs;		/* offset to next symbol */
    266 	    break;
    267 
    268 	/* reurn from a recursion */
    269 	case ')':
    270 	    if (acs != NIL) {
    271 		do {
    272 		    temp = OCNT(acs);
    273 		    OCNT(acs) = ccre - acs;
    274 		    acs -= temp;
    275 		} while (temp != 0);
    276 		acs = NIL;
    277 	    }
    278 	    cs = ccre;
    279 	    *cs = META;
    280 	    MSYM(cs) = c;
    281 	    ccre = MNEXT(cs);
    282 	    return;
    283 
    284 	/* mark the last match sequence as having an alternate */
    285 	/* the third byte will contain an offset to jump over the */
    286 	/* alternate match in case the first did not fail */
    287 	case '|':
    288 	    if (acs != NIL && acs != cs)
    289 		OCNT(ccre) = ccre - acs;	/* make a back pointer */
    290 	    else
    291 		OCNT(ccre) = 0;
    292 	    assert(cs != NULL);
    293 	    *cs |= ALT;
    294 	    cs = ccre;
    295 	    *cs = OPER;
    296 	    OSYM(cs) = '|';
    297 	    ccre = ONEXT(cs);
    298 	    acs = cs;	/* remember that the pointer is to be filles */
    299 	    break;
    300 
    301 	/* if its not a metasymbol just build a scharacter string */
    302 	default:
    303 	    if (cs == NIL || (*cs & STR) == 0) {
    304 		cs = ccre;
    305 		*cs = STR;
    306 		SCNT(cs) = 1;
    307 		ccre = SSTR(cs);
    308 	    } else
    309 		SCNT(cs)++;
    310 	    *ccre++ = c;
    311 	    break;
    312 	}
    313     }
    314     if (acs != NIL) {
    315 	do {
    316 	    temp = OCNT(acs);
    317 	    OCNT(acs) = ccre - acs;
    318 	    acs -= temp;
    319 	} while (temp != 0);
    320 	acs = NIL;
    321     }
    322     return;
    323 }
    324 /* end of convertre */
    325 
    326 
    327 /*
    328  *	The following routine recognises an irregular expresion
    329  *	with the following special characters:
    330  *
    331  *		\?	-	means last match was optional
    332  *		\a	-	matches any number of characters
    333  *		\d	-	matches any number of spaces and tabs
    334  *		\p	-	matches any number of alphanumeric
    335  *				characters. The
    336  *				characters matched will be copied into
    337  *				the area pointed to by 'name'.
    338  *		\|	-	alternation
    339  *		\( \)	-	grouping used mostly for alternation and
    340  *				optionality
    341  *
    342  *	The irregular expression must be translated to internal form
    343  *	prior to calling this routine
    344  *
    345  *	The value returned is the pointer to the first non \a
    346  *	character matched.
    347  */
    348 
    349 char *
    350 expmatch(s, re, mstring)
    351     char *s;		/* string to check for a match in */
    352     char *re;		/* a converted irregular expression */
    353     char *mstring;	/* where to put whatever matches a \p */
    354 {
    355     char *cs;		/* the current symbol */
    356     char *ptr,*s1;	/* temporary pointer */
    357     boolean matched;	/* a temporary boolean */
    358 
    359     /* initial conditions */
    360     if (re == NIL)
    361 	return (NIL);
    362     cs = re;
    363     matched = FALSE;
    364 
    365     /* loop till expression string is exhausted (or at least pretty tired) */
    366     while (*cs) {
    367 	switch (*cs & (OPER | STR | META)) {
    368 
    369 	/* try to match a string */
    370 	case STR:
    371 	    matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
    372 	    if (matched) {
    373 
    374 		/* hoorah it matches */
    375 		s += SCNT(cs);
    376 		cs = SNEXT(cs);
    377 	    } else if (*cs & ALT) {
    378 
    379 		/* alternation, skip to next expression */
    380 		cs = SNEXT(cs);
    381 	    } else if (*cs & OPT) {
    382 
    383 		/* the match is optional */
    384 		cs = SNEXT(cs);
    385 		matched = 1;		/* indicate a successful match */
    386 	    } else {
    387 
    388 		/* no match, error return */
    389 		return (NIL);
    390 	    }
    391 	    break;
    392 
    393 	/* an operator, do something fancy */
    394 	case OPER:
    395 	    switch (OSYM(cs)) {
    396 
    397 	    /* this is an alternation */
    398 	    case '|':
    399 		if (matched)
    400 
    401 		    /* last thing in the alternation was a match, skip ahead */
    402 		    cs = OPTR(cs);
    403 		else
    404 
    405 		    /* no match, keep trying */
    406 		    cs = ONEXT(cs);
    407 		break;
    408 
    409 	    /* this is a grouping, recurse */
    410 	    case '(':
    411 		ptr = expmatch(s, ONEXT(cs), mstring);
    412 		if (ptr != NIL) {
    413 
    414 		    /* the subexpression matched */
    415 		    matched = 1;
    416 		    s = ptr;
    417 		} else if (*cs & ALT) {
    418 
    419 		    /* alternation, skip to next expression */
    420 		    matched = 0;
    421 		} else if (*cs & OPT) {
    422 
    423 		    /* the match is optional */
    424 		    matched = 1;	/* indicate a successful match */
    425 		} else {
    426 
    427 		    /* no match, error return */
    428 		    return (NIL);
    429 		}
    430 		cs = OPTR(cs);
    431 		break;
    432 	    }
    433 	    break;
    434 
    435 	/* try to match a metasymbol */
    436 	case META:
    437 	    switch (MSYM(cs)) {
    438 
    439 	    /* try to match anything and remember what was matched */
    440 	    case 'p':
    441 		/*
    442 		 *  This is really the same as trying the match the
    443 		 *  remaining parts of the expression to any subset
    444 		 *  of the string.
    445 		 */
    446 		s1 = s;
    447 		do {
    448 		    ptr = expmatch(s1, MNEXT(cs), mstring);
    449 		    if (ptr != NIL && s1 != s) {
    450 
    451 			/* we have a match, remember the match */
    452 			strncpy(mstring, s, s1 - s);
    453 			mstring[s1 - s] = '\0';
    454 			return (ptr);
    455 		    } else if (ptr != NIL && (*cs & OPT)) {
    456 
    457 			/* it was aoptional so no match is ok */
    458 			return (ptr);
    459 		    } else if (ptr != NIL) {
    460 
    461 			/* not optional and we still matched */
    462 			return (NIL);
    463 		    }
    464 		    if (!isalnum((unsigned char)*s1) && *s1 != '_')
    465 			return (NIL);
    466 		    if (*s1 == '\\')
    467 			x_escaped = x_escaped ? FALSE : TRUE;
    468 		    else
    469 			x_escaped = FALSE;
    470 		} while (*s1++);
    471 		return (NIL);
    472 
    473 	    /* try to match anything */
    474 	    case 'a':
    475 		/*
    476 		 *  This is really the same as trying the match the
    477 		 *  remaining parts of the expression to any subset
    478 		 *  of the string.
    479 		 */
    480 		s1 = s;
    481 		do {
    482 		    ptr = expmatch(s1, MNEXT(cs), mstring);
    483 		    if (ptr != NIL && s1 != s) {
    484 
    485 			/* we have a match */
    486 			return (ptr);
    487 		    } else if (ptr != NIL && (*cs & OPT)) {
    488 
    489 			/* it was aoptional so no match is ok */
    490 			return (ptr);
    491 		    } else if (ptr != NIL) {
    492 
    493 			/* not optional and we still matched */
    494 			return (NIL);
    495 		    }
    496 		    if (*s1 == '\\')
    497 			x_escaped = x_escaped ? FALSE : TRUE;
    498 		    else
    499 			x_escaped = FALSE;
    500 		} while (*s1++);
    501 		return (NIL);
    502 
    503 	    /* fail if we are currently x_escaped */
    504 	    case 'e':
    505 		if (x_escaped)
    506 		    return(NIL);
    507 		cs = MNEXT(cs);
    508 		break;
    509 
    510 	    /* match any number of tabs and spaces */
    511 	    case 'd':
    512 		ptr = s;
    513 		while (*s == ' ' || *s == '\t')
    514 		    s++;
    515 		if (s != ptr || s == x_start) {
    516 
    517 		    /* match, be happy */
    518 		    matched = 1;
    519 		    cs = MNEXT(cs);
    520 		} else if (*s == '\n' || *s == '\0') {
    521 
    522 		    /* match, be happy */
    523 		    matched = 1;
    524 		    cs = MNEXT(cs);
    525 		} else if (*cs & ALT) {
    526 
    527 		    /* try the next part */
    528 		    matched = 0;
    529 		    cs = MNEXT(cs);
    530 		} else if (*cs & OPT) {
    531 
    532 		    /* doesn't matter */
    533 		    matched = 1;
    534 		    cs = MNEXT(cs);
    535 		} else
    536 
    537 		    /* no match, error return */
    538 		    return (NIL);
    539 		break;
    540 
    541 	    /* check for end of line */
    542 	    case '$':
    543 		if (*s == '\0' || *s == '\n') {
    544 
    545 		    /* match, be happy */
    546 		    s++;
    547 		    matched = 1;
    548 		    cs = MNEXT(cs);
    549 		} else if (*cs & ALT) {
    550 
    551 		    /* try the next part */
    552 		    matched = 0;
    553 		    cs = MNEXT(cs);
    554 		} else if (*cs & OPT) {
    555 
    556 		    /* doesn't matter */
    557 		    matched = 1;
    558 		    cs = MNEXT(cs);
    559 		} else
    560 
    561 		    /* no match, error return */
    562 		    return (NIL);
    563 		break;
    564 
    565 	    /* check for start of line */
    566 	    case '^':
    567 		if (s == x_start) {
    568 
    569 		    /* match, be happy */
    570 		    matched = 1;
    571 		    cs = MNEXT(cs);
    572 		} else if (*cs & ALT) {
    573 
    574 		    /* try the next part */
    575 		    matched = 0;
    576 		    cs = MNEXT(cs);
    577 		} else if (*cs & OPT) {
    578 
    579 		    /* doesn't matter */
    580 		    matched = 1;
    581 		    cs = MNEXT(cs);
    582 		} else
    583 
    584 		    /* no match, error return */
    585 		    return (NIL);
    586 		break;
    587 
    588 	    /* end of a subexpression, return success */
    589 	    case ')':
    590 		return (s);
    591 	    }
    592 	    break;
    593 	}
    594     }
    595     return (s);
    596 }
    597