Home | History | Annotate | Line # | Download | only in vgrind
regexp.c revision 1.7
      1 /*	$NetBSD: regexp.c,v 1.7 2003/08/07 11:17:01 agc Exp $	*/
      2 
      3 /*
      4  * Copyright (c) 1980, 1993
      5  *	The Regents of the University of California.  All rights reserved.
      6  *
      7  *
      8  * Redistribution and use in source and binary forms, with or without
      9  * modification, are permitted provided that the following conditions
     10  * are met:
     11  * 1. Redistributions of source code must retain the above copyright
     12  *    notice, this list of conditions and the following disclaimer.
     13  * 2. Redistributions in binary form must reproduce the above copyright
     14  *    notice, this list of conditions and the following disclaimer in the
     15  *    documentation and/or other materials provided with the distribution.
     16  * 3. Neither the name of the University nor the names of its contributors
     17  *    may be used to endorse or promote products derived from this software
     18  *    without specific prior written permission.
     19  *
     20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     30  * SUCH DAMAGE.
     31  */
     32 
     33 #include <sys/cdefs.h>
     34 #ifndef lint
     35 __COPYRIGHT("@(#) Copyright (c) 1980, 1993\n\
     36 	The Regents of the University of California.  All rights reserved.\n");
     37 #endif /* not lint */
     38 
     39 #ifndef lint
     40 #if 0
     41 static char sccsid[] = "@(#)regexp.c	8.1 (Berkeley) 6/6/93";
     42 #endif
     43 __RCSID("$NetBSD: regexp.c,v 1.7 2003/08/07 11:17:01 agc Exp $");
     44 #endif /* not lint */
     45 
     46 #include <ctype.h>
     47 #include <stdlib.h>
     48 #include <string.h>
     49 #include "extern.h"
     50 
     51 #define FALSE	0
     52 #define TRUE	!(FALSE)
     53 #define NIL	0
     54 
     55 static void	expconv __P((void));
     56 
     57 boolean	 x_escaped;	/* true if we are currently x_escaped */
     58 char	*x_start;	/* start of string */
     59 boolean	 l_onecase;	/* true if upper and lower equivalent */
     60 
     61 #define makelower(c) (isupper((unsigned char)(c)) ? tolower((c)) : (c))
     62 
     63 /*  STRNCMP -	like strncmp except that we convert the
     64  *	 	first string to lower case before comparing
     65  *		if l_onecase is set.
     66  */
     67 
     68 int
     69 STRNCMP(s1, s2, len)
     70 	char *s1,*s2;
     71 	int len;
     72 {
     73 	if (l_onecase) {
     74 	    do
     75 		if (*s2 - makelower(*s1))
     76 			return (*s2 - makelower(*s1));
     77 		else {
     78 			s2++;
     79 			s1++;
     80 		}
     81 	    while (--len);
     82 	} else {
     83 	    do
     84 		if (*s2 - *s1)
     85 			return (*s2 - *s1);
     86 		else {
     87 			s2++;
     88 			s1++;
     89 		}
     90 	    while (--len);
     91 	}
     92 	return(0);
     93 }
     94 
     95 /*	The following routine converts an irregular expression to
     96  *	internal format.
     97  *
     98  *	Either meta symbols (\a \d or \p) or character strings or
     99  *	operations ( alternation or perenthesizing ) can be
    100  *	specified.  Each starts with a descriptor byte.  The descriptor
    101  *	byte has STR set for strings, META set for meta symbols
    102  *	and OPER set for operations.
    103  *	The descriptor byte can also have the OPT bit set if the object
    104  *	defined is optional.  Also ALT can be set to indicate an alternation.
    105  *
    106  *	For metasymbols the byte following the descriptor byte identities
    107  *	the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '(').  For
    108  *	strings the byte after the descriptor is a character count for
    109  *	the string:
    110  *
    111  *		meta symbols := descriptor
    112  *				symbol
    113  *
    114  *		strings :=	descriptor
    115  *				character count
    116  *				the string
    117  *
    118  *		operatins :=	descriptor
    119  *				symbol
    120  *				character count
    121  */
    122 
    123 /*
    124  *  handy macros for accessing parts of match blocks
    125  */
    126 #define MSYM(A) (*(A+1))	/* symbol in a meta symbol block */
    127 #define MNEXT(A) (A+2)		/* character following a metasymbol block */
    128 
    129 #define OSYM(A) (*(A+1))	/* symbol in an operation block */
    130 #define OCNT(A) (*(A+2))	/* character count */
    131 #define ONEXT(A) (A+3)		/* next character after the operation */
    132 #define OPTR(A) (A+*(A+2))	/* place pointed to by the operator */
    133 
    134 #define SCNT(A) (*(A+1))	/* byte count of a string */
    135 #define SSTR(A) (A+2)		/* address of the string */
    136 #define SNEXT(A) (A+2+*(A+1))	/* character following the string */
    137 
    138 /*
    139  *  bit flags in the descriptor
    140  */
    141 #define OPT 1
    142 #define STR 2
    143 #define META 4
    144 #define ALT 8
    145 #define OPER 16
    146 
    147 static char *ccre;	/* pointer to current position in converted exp*/
    148 static char *ure;	/* pointer current position in unconverted exp */
    149 
    150 char *
    151 convexp(re)
    152     char *re;		/* unconverted irregular expression */
    153 {
    154     char *cre;		/* pointer to converted regular expression */
    155 
    156     /* allocate room for the converted expression */
    157     if (re == NIL)
    158 	return (NIL);
    159     if (*re == '\0')
    160 	return (NIL);
    161     cre = malloc(4 * strlen(re) + 3);
    162     ccre = cre;
    163     ure = re;
    164 
    165     /* start the conversion with a \a */
    166     *cre = META | OPT;
    167     MSYM(cre) = 'a';
    168     ccre = MNEXT(cre);
    169 
    170     /* start the conversion (its recursive) */
    171     expconv();
    172     *ccre = 0;
    173     return (cre);
    174 }
    175 
    176 static void
    177 expconv()
    178 {
    179     char *cs;		/* pointer to current symbol in converted exp */
    180     char c;		/* character being processed */
    181     char *acs;		/* pinter to last alternate */
    182     int temp;
    183 
    184     /* let the conversion begin */
    185     acs = NIL;
    186     cs = NIL;
    187     while (*ure != NIL) {
    188 	switch (c = *ure++) {
    189 
    190 	case '\\':
    191 	    switch (c = *ure++) {
    192 
    193 	    /* escaped characters are just characters */
    194 	    default:
    195 		if (cs == NIL || (*cs & STR) == 0) {
    196 		    cs = ccre;
    197 		    *cs = STR;
    198 		    SCNT(cs) = 1;
    199 		    ccre += 2;
    200 		} else
    201 		    SCNT(cs)++;
    202 		*ccre++ = c;
    203 		break;
    204 
    205 	    /* normal(?) metacharacters */
    206 	    case 'a':
    207 	    case 'd':
    208 	    case 'e':
    209 	    case 'p':
    210 		if (acs != NIL && acs != cs) {
    211 		    do {
    212 			temp = OCNT(acs);
    213 			OCNT(acs) = ccre - acs;
    214 			acs -= temp;
    215 		    } while (temp != 0);
    216 		    acs = NIL;
    217 		}
    218 		cs = ccre;
    219 		*cs = META;
    220 		MSYM(cs) = c;
    221 		ccre = MNEXT(cs);
    222 		break;
    223 	    }
    224 	    break;
    225 
    226 	/* just put the symbol in */
    227 	case '^':
    228 	case '$':
    229 	    if (acs != NIL && acs != cs) {
    230 		do {
    231 		    temp = OCNT(acs);
    232 		    OCNT(acs) = ccre - acs;
    233 		    acs -= temp;
    234 		} while (temp != 0);
    235 		acs = NIL;
    236 	    }
    237 	    cs = ccre;
    238 	    *cs = META;
    239 	    MSYM(cs) = c;
    240 	    ccre = MNEXT(cs);
    241 	    break;
    242 
    243 	/* mark the last match sequence as optional */
    244 	case '?':
    245 	    if (cs)
    246 	    	*cs = *cs | OPT;
    247 	    break;
    248 
    249 	/* recurse and define a subexpression */
    250 	case '(':
    251 	    if (acs != NIL && acs != cs) {
    252 		do {
    253 		    temp = OCNT(acs);
    254 		    OCNT(acs) = ccre - acs;
    255 		    acs -= temp;
    256 		} while (temp != 0);
    257 		acs = NIL;
    258 	    }
    259 	    cs = ccre;
    260 	    *cs = OPER;
    261 	    OSYM(cs) = '(';
    262 	    ccre = ONEXT(cs);
    263 	    expconv();
    264 	    OCNT(cs) = ccre - cs;		/* offset to next symbol */
    265 	    break;
    266 
    267 	/* reurn from a recursion */
    268 	case ')':
    269 	    if (acs != NIL) {
    270 		do {
    271 		    temp = OCNT(acs);
    272 		    OCNT(acs) = ccre - acs;
    273 		    acs -= temp;
    274 		} while (temp != 0);
    275 		acs = NIL;
    276 	    }
    277 	    cs = ccre;
    278 	    *cs = META;
    279 	    MSYM(cs) = c;
    280 	    ccre = MNEXT(cs);
    281 	    return;
    282 
    283 	/* mark the last match sequence as having an alternate */
    284 	/* the third byte will contain an offset to jump over the */
    285 	/* alternate match in case the first did not fail */
    286 	case '|':
    287 	    if (acs != NIL && acs != cs)
    288 		OCNT(ccre) = ccre - acs;	/* make a back pointer */
    289 	    else
    290 		OCNT(ccre) = 0;
    291 	    *cs |= ALT;
    292 	    cs = ccre;
    293 	    *cs = OPER;
    294 	    OSYM(cs) = '|';
    295 	    ccre = ONEXT(cs);
    296 	    acs = cs;	/* remember that the pointer is to be filles */
    297 	    break;
    298 
    299 	/* if its not a metasymbol just build a scharacter string */
    300 	default:
    301 	    if (cs == NIL || (*cs & STR) == 0) {
    302 		cs = ccre;
    303 		*cs = STR;
    304 		SCNT(cs) = 1;
    305 		ccre = SSTR(cs);
    306 	    } else
    307 		SCNT(cs)++;
    308 	    *ccre++ = c;
    309 	    break;
    310 	}
    311     }
    312     if (acs != NIL) {
    313 	do {
    314 	    temp = OCNT(acs);
    315 	    OCNT(acs) = ccre - acs;
    316 	    acs -= temp;
    317 	} while (temp != 0);
    318 	acs = NIL;
    319     }
    320     return;
    321 }
    322 /* end of convertre */
    323 
    324 
    325 /*
    326  *	The following routine recognises an irregular expresion
    327  *	with the following special characters:
    328  *
    329  *		\?	-	means last match was optional
    330  *		\a	-	matches any number of characters
    331  *		\d	-	matches any number of spaces and tabs
    332  *		\p	-	matches any number of alphanumeric
    333  *				characters. The
    334  *				characters matched will be copied into
    335  *				the area pointed to by 'name'.
    336  *		\|	-	alternation
    337  *		\( \)	-	grouping used mostly for alternation and
    338  *				optionality
    339  *
    340  *	The irregular expression must be translated to internal form
    341  *	prior to calling this routine
    342  *
    343  *	The value returned is the pointer to the first non \a
    344  *	character matched.
    345  */
    346 
    347 char *
    348 expmatch(s, re, mstring)
    349     char *s;		/* string to check for a match in */
    350     char *re;		/* a converted irregular expression */
    351     char *mstring;	/* where to put whatever matches a \p */
    352 {
    353     char *cs;		/* the current symbol */
    354     char *ptr,*s1;	/* temporary pointer */
    355     boolean matched;	/* a temporary boolean */
    356 
    357     /* initial conditions */
    358     if (re == NIL)
    359 	return (NIL);
    360     cs = re;
    361     matched = FALSE;
    362 
    363     /* loop till expression string is exhausted (or at least pretty tired) */
    364     while (*cs) {
    365 	switch (*cs & (OPER | STR | META)) {
    366 
    367 	/* try to match a string */
    368 	case STR:
    369 	    matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
    370 	    if (matched) {
    371 
    372 		/* hoorah it matches */
    373 		s += SCNT(cs);
    374 		cs = SNEXT(cs);
    375 	    } else if (*cs & ALT) {
    376 
    377 		/* alternation, skip to next expression */
    378 		cs = SNEXT(cs);
    379 	    } else if (*cs & OPT) {
    380 
    381 		/* the match is optional */
    382 		cs = SNEXT(cs);
    383 		matched = 1;		/* indicate a successful match */
    384 	    } else {
    385 
    386 		/* no match, error return */
    387 		return (NIL);
    388 	    }
    389 	    break;
    390 
    391 	/* an operator, do something fancy */
    392 	case OPER:
    393 	    switch (OSYM(cs)) {
    394 
    395 	    /* this is an alternation */
    396 	    case '|':
    397 		if (matched)
    398 
    399 		    /* last thing in the alternation was a match, skip ahead */
    400 		    cs = OPTR(cs);
    401 		else
    402 
    403 		    /* no match, keep trying */
    404 		    cs = ONEXT(cs);
    405 		break;
    406 
    407 	    /* this is a grouping, recurse */
    408 	    case '(':
    409 		ptr = expmatch(s, ONEXT(cs), mstring);
    410 		if (ptr != NIL) {
    411 
    412 		    /* the subexpression matched */
    413 		    matched = 1;
    414 		    s = ptr;
    415 		} else if (*cs & ALT) {
    416 
    417 		    /* alternation, skip to next expression */
    418 		    matched = 0;
    419 		} else if (*cs & OPT) {
    420 
    421 		    /* the match is optional */
    422 		    matched = 1;	/* indicate a successful match */
    423 		} else {
    424 
    425 		    /* no match, error return */
    426 		    return (NIL);
    427 		}
    428 		cs = OPTR(cs);
    429 		break;
    430 	    }
    431 	    break;
    432 
    433 	/* try to match a metasymbol */
    434 	case META:
    435 	    switch (MSYM(cs)) {
    436 
    437 	    /* try to match anything and remember what was matched */
    438 	    case 'p':
    439 		/*
    440 		 *  This is really the same as trying the match the
    441 		 *  remaining parts of the expression to any subset
    442 		 *  of the string.
    443 		 */
    444 		s1 = s;
    445 		do {
    446 		    ptr = expmatch(s1, MNEXT(cs), mstring);
    447 		    if (ptr != NIL && s1 != s) {
    448 
    449 			/* we have a match, remember the match */
    450 			strncpy(mstring, s, s1 - s);
    451 			mstring[s1 - s] = '\0';
    452 			return (ptr);
    453 		    } else if (ptr != NIL && (*cs & OPT)) {
    454 
    455 			/* it was aoptional so no match is ok */
    456 			return (ptr);
    457 		    } else if (ptr != NIL) {
    458 
    459 			/* not optional and we still matched */
    460 			return (NIL);
    461 		    }
    462 		    if (!isalnum((unsigned char)*s1) && *s1 != '_')
    463 			return (NIL);
    464 		    if (*s1 == '\\')
    465 			x_escaped = x_escaped ? FALSE : TRUE;
    466 		    else
    467 			x_escaped = FALSE;
    468 		} while (*s1++);
    469 		return (NIL);
    470 
    471 	    /* try to match anything */
    472 	    case 'a':
    473 		/*
    474 		 *  This is really the same as trying the match the
    475 		 *  remaining parts of the expression to any subset
    476 		 *  of the string.
    477 		 */
    478 		s1 = s;
    479 		do {
    480 		    ptr = expmatch(s1, MNEXT(cs), mstring);
    481 		    if (ptr != NIL && s1 != s) {
    482 
    483 			/* we have a match */
    484 			return (ptr);
    485 		    } else if (ptr != NIL && (*cs & OPT)) {
    486 
    487 			/* it was aoptional so no match is ok */
    488 			return (ptr);
    489 		    } else if (ptr != NIL) {
    490 
    491 			/* not optional and we still matched */
    492 			return (NIL);
    493 		    }
    494 		    if (*s1 == '\\')
    495 			x_escaped = x_escaped ? FALSE : TRUE;
    496 		    else
    497 			x_escaped = FALSE;
    498 		} while (*s1++);
    499 		return (NIL);
    500 
    501 	    /* fail if we are currently x_escaped */
    502 	    case 'e':
    503 		if (x_escaped)
    504 		    return(NIL);
    505 		cs = MNEXT(cs);
    506 		break;
    507 
    508 	    /* match any number of tabs and spaces */
    509 	    case 'd':
    510 		ptr = s;
    511 		while (*s == ' ' || *s == '\t')
    512 		    s++;
    513 		if (s != ptr || s == x_start) {
    514 
    515 		    /* match, be happy */
    516 		    matched = 1;
    517 		    cs = MNEXT(cs);
    518 		} else if (*s == '\n' || *s == '\0') {
    519 
    520 		    /* match, be happy */
    521 		    matched = 1;
    522 		    cs = MNEXT(cs);
    523 		} else if (*cs & ALT) {
    524 
    525 		    /* try the next part */
    526 		    matched = 0;
    527 		    cs = MNEXT(cs);
    528 		} else if (*cs & OPT) {
    529 
    530 		    /* doesn't matter */
    531 		    matched = 1;
    532 		    cs = MNEXT(cs);
    533 		} else
    534 
    535 		    /* no match, error return */
    536 		    return (NIL);
    537 		break;
    538 
    539 	    /* check for end of line */
    540 	    case '$':
    541 		if (*s == '\0' || *s == '\n') {
    542 
    543 		    /* match, be happy */
    544 		    s++;
    545 		    matched = 1;
    546 		    cs = MNEXT(cs);
    547 		} else if (*cs & ALT) {
    548 
    549 		    /* try the next part */
    550 		    matched = 0;
    551 		    cs = MNEXT(cs);
    552 		} else if (*cs & OPT) {
    553 
    554 		    /* doesn't matter */
    555 		    matched = 1;
    556 		    cs = MNEXT(cs);
    557 		} else
    558 
    559 		    /* no match, error return */
    560 		    return (NIL);
    561 		break;
    562 
    563 	    /* check for start of line */
    564 	    case '^':
    565 		if (s == x_start) {
    566 
    567 		    /* match, be happy */
    568 		    matched = 1;
    569 		    cs = MNEXT(cs);
    570 		} else if (*cs & ALT) {
    571 
    572 		    /* try the next part */
    573 		    matched = 0;
    574 		    cs = MNEXT(cs);
    575 		} else if (*cs & OPT) {
    576 
    577 		    /* doesn't matter */
    578 		    matched = 1;
    579 		    cs = MNEXT(cs);
    580 		} else
    581 
    582 		    /* no match, error return */
    583 		    return (NIL);
    584 		break;
    585 
    586 	    /* end of a subexpression, return success */
    587 	    case ')':
    588 		return (s);
    589 	    }
    590 	    break;
    591 	}
    592     }
    593     return (s);
    594 }
    595