Home | History | Annotate | Line # | Download | only in vgrind
regexp.c revision 1.9
      1 /*	$NetBSD: regexp.c,v 1.9 2006/05/01 05:13:58 christos Exp $	*/
      2 
      3 /*
      4  * Copyright (c) 1980, 1993
      5  *	The Regents of the University of California.  All rights reserved.
      6  *
      7  *
      8  * Redistribution and use in source and binary forms, with or without
      9  * modification, are permitted provided that the following conditions
     10  * are met:
     11  * 1. Redistributions of source code must retain the above copyright
     12  *    notice, this list of conditions and the following disclaimer.
     13  * 2. Redistributions in binary form must reproduce the above copyright
     14  *    notice, this list of conditions and the following disclaimer in the
     15  *    documentation and/or other materials provided with the distribution.
     16  * 3. Neither the name of the University nor the names of its contributors
     17  *    may be used to endorse or promote products derived from this software
     18  *    without specific prior written permission.
     19  *
     20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     30  * SUCH DAMAGE.
     31  */
     32 
     33 #include <sys/cdefs.h>
     34 #ifndef lint
     35 __COPYRIGHT("@(#) Copyright (c) 1980, 1993\n\
     36 	The Regents of the University of California.  All rights reserved.\n");
     37 #endif /* not lint */
     38 
     39 #ifndef lint
     40 #if 0
     41 static char sccsid[] = "@(#)regexp.c	8.1 (Berkeley) 6/6/93";
     42 #endif
     43 __RCSID("$NetBSD: regexp.c,v 1.9 2006/05/01 05:13:58 christos Exp $");
     44 #endif /* not lint */
     45 
     46 #include <ctype.h>
     47 #include <stdlib.h>
     48 #include <string.h>
     49 #include "extern.h"
     50 
     51 #define FALSE	0
     52 #define TRUE	!(FALSE)
     53 #define NIL	0
     54 
     55 static void	expconv __P((void));
     56 
     57 boolean	 x_escaped;	/* true if we are currently x_escaped */
     58 char	*x_start;	/* start of string */
     59 boolean	 l_onecase;	/* true if upper and lower equivalent */
     60 
     61 #define makelower(c) (isupper((unsigned char)(c)) ? tolower((unsigned char)(c)) : (c))
     62 
     63 /*  STRNCMP -	like strncmp except that we convert the
     64  *	 	first string to lower case before comparing
     65  *		if l_onecase is set.
     66  */
     67 
     68 int
     69 STRNCMP(s1, s2, len)
     70 	char *s1,*s2;
     71 	int len;
     72 {
     73 	if (l_onecase) {
     74 	    do
     75 		if (*s2 - makelower(*s1))
     76 			return (*s2 - makelower(*s1));
     77 		else {
     78 			s2++;
     79 			s1++;
     80 		}
     81 	    while (--len);
     82 	} else {
     83 	    do
     84 		if (*s2 - *s1)
     85 			return (*s2 - *s1);
     86 		else {
     87 			s2++;
     88 			s1++;
     89 		}
     90 	    while (--len);
     91 	}
     92 	return(0);
     93 }
     94 
     95 /*	The following routine converts an irregular expression to
     96  *	internal format.
     97  *
     98  *	Either meta symbols (\a \d or \p) or character strings or
     99  *	operations ( alternation or perenthesizing ) can be
    100  *	specified.  Each starts with a descriptor byte.  The descriptor
    101  *	byte has STR set for strings, META set for meta symbols
    102  *	and OPER set for operations.
    103  *	The descriptor byte can also have the OPT bit set if the object
    104  *	defined is optional.  Also ALT can be set to indicate an alternation.
    105  *
    106  *	For metasymbols the byte following the descriptor byte identities
    107  *	the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '(').  For
    108  *	strings the byte after the descriptor is a character count for
    109  *	the string:
    110  *
    111  *		meta symbols := descriptor
    112  *				symbol
    113  *
    114  *		strings :=	descriptor
    115  *				character count
    116  *				the string
    117  *
    118  *		operatins :=	descriptor
    119  *				symbol
    120  *				character count
    121  */
    122 
    123 /*
    124  *  handy macros for accessing parts of match blocks
    125  */
    126 #define MSYM(A) (*(A+1))	/* symbol in a meta symbol block */
    127 #define MNEXT(A) (A+2)		/* character following a metasymbol block */
    128 
    129 #define OSYM(A) (*(A+1))	/* symbol in an operation block */
    130 #define OCNT(A) (*(A+2))	/* character count */
    131 #define ONEXT(A) (A+3)		/* next character after the operation */
    132 #define OPTR(A) (A+*(A+2))	/* place pointed to by the operator */
    133 
    134 #define SCNT(A) (*(A+1))	/* byte count of a string */
    135 #define SSTR(A) (A+2)		/* address of the string */
    136 #define SNEXT(A) (A+2+*(A+1))	/* character following the string */
    137 
    138 /*
    139  *  bit flags in the descriptor
    140  */
    141 #define OPT 1
    142 #define STR 2
    143 #define META 4
    144 #define ALT 8
    145 #define OPER 16
    146 
    147 static char *ccre;	/* pointer to current position in converted exp*/
    148 static char *ure;	/* pointer current position in unconverted exp */
    149 
    150 char *
    151 convexp(re)
    152     char *re;		/* unconverted irregular expression */
    153 {
    154     char *cre;		/* pointer to converted regular expression */
    155 
    156     /* allocate room for the converted expression */
    157     if (re == NIL)
    158 	return (NIL);
    159     if (*re == '\0')
    160 	return (NIL);
    161     cre = malloc(4 * strlen(re) + 3);
    162     ccre = cre;
    163     ure = re;
    164 
    165     /* start the conversion with a \a */
    166     *cre = META | OPT;
    167     MSYM(cre) = 'a';
    168     ccre = MNEXT(cre);
    169 
    170     /* start the conversion (its recursive) */
    171     expconv();
    172     *ccre = 0;
    173     return (cre);
    174 }
    175 
    176 static void
    177 expconv()
    178 {
    179     char *cs;		/* pointer to current symbol in converted exp */
    180     char c;		/* character being processed */
    181     char *acs;		/* pinter to last alternate */
    182     int temp;
    183 
    184     /* let the conversion begin */
    185     acs = NIL;
    186     cs = NIL;
    187     while (*ure != NIL) {
    188 	switch (c = *ure++) {
    189 
    190 	case '\\':
    191 	    switch (c = *ure++) {
    192 
    193 	    /* escaped characters are just characters */
    194 	    default:
    195 		if (cs == NIL || (*cs & STR) == 0) {
    196 		    cs = ccre;
    197 		    *cs = STR;
    198 		    SCNT(cs) = 1;
    199 		    ccre += 2;
    200 		} else
    201 		    SCNT(cs)++;
    202 		*ccre++ = c;
    203 		break;
    204 
    205 	    /* normal(?) metacharacters */
    206 	    case 'a':
    207 	    case 'd':
    208 	    case 'e':
    209 	    case 'p':
    210 		if (acs != NIL && acs != cs) {
    211 		    do {
    212 			temp = OCNT(acs);
    213 			OCNT(acs) = ccre - acs;
    214 			acs -= temp;
    215 		    } while (temp != 0);
    216 		    acs = NIL;
    217 		}
    218 		cs = ccre;
    219 		*cs = META;
    220 		MSYM(cs) = c;
    221 		ccre = MNEXT(cs);
    222 		break;
    223 	    }
    224 	    break;
    225 
    226 	/* just put the symbol in */
    227 	case '^':
    228 	case '$':
    229 	    if (acs != NIL && acs != cs) {
    230 		do {
    231 		    temp = OCNT(acs);
    232 		    OCNT(acs) = ccre - acs;
    233 		    acs -= temp;
    234 		} while (temp != 0);
    235 		acs = NIL;
    236 	    }
    237 	    cs = ccre;
    238 	    *cs = META;
    239 	    MSYM(cs) = c;
    240 	    ccre = MNEXT(cs);
    241 	    break;
    242 
    243 	/* mark the last match sequence as optional */
    244 	case '?':
    245 	    if (cs)
    246 	    	*cs = *cs | OPT;
    247 	    break;
    248 
    249 	/* recurse and define a subexpression */
    250 	case '(':
    251 	    if (acs != NIL && acs != cs) {
    252 		do {
    253 		    temp = OCNT(acs);
    254 		    OCNT(acs) = ccre - acs;
    255 		    acs -= temp;
    256 		} while (temp != 0);
    257 		acs = NIL;
    258 	    }
    259 	    cs = ccre;
    260 	    *cs = OPER;
    261 	    OSYM(cs) = '(';
    262 	    ccre = ONEXT(cs);
    263 	    expconv();
    264 	    OCNT(cs) = ccre - cs;		/* offset to next symbol */
    265 	    break;
    266 
    267 	/* reurn from a recursion */
    268 	case ')':
    269 	    if (acs != NIL) {
    270 		do {
    271 		    temp = OCNT(acs);
    272 		    OCNT(acs) = ccre - acs;
    273 		    acs -= temp;
    274 		} while (temp != 0);
    275 		acs = NIL;
    276 	    }
    277 	    cs = ccre;
    278 	    *cs = META;
    279 	    MSYM(cs) = c;
    280 	    ccre = MNEXT(cs);
    281 	    return;
    282 
    283 	/* mark the last match sequence as having an alternate */
    284 	/* the third byte will contain an offset to jump over the */
    285 	/* alternate match in case the first did not fail */
    286 	case '|':
    287 	    if (acs != NIL && acs != cs)
    288 		OCNT(ccre) = ccre - acs;	/* make a back pointer */
    289 	    else
    290 		OCNT(ccre) = 0;
    291 	    assert(cs != NULL);
    292 	    *cs |= ALT;
    293 	    cs = ccre;
    294 	    *cs = OPER;
    295 	    OSYM(cs) = '|';
    296 	    ccre = ONEXT(cs);
    297 	    acs = cs;	/* remember that the pointer is to be filles */
    298 	    break;
    299 
    300 	/* if its not a metasymbol just build a scharacter string */
    301 	default:
    302 	    if (cs == NIL || (*cs & STR) == 0) {
    303 		cs = ccre;
    304 		*cs = STR;
    305 		SCNT(cs) = 1;
    306 		ccre = SSTR(cs);
    307 	    } else
    308 		SCNT(cs)++;
    309 	    *ccre++ = c;
    310 	    break;
    311 	}
    312     }
    313     if (acs != NIL) {
    314 	do {
    315 	    temp = OCNT(acs);
    316 	    OCNT(acs) = ccre - acs;
    317 	    acs -= temp;
    318 	} while (temp != 0);
    319 	acs = NIL;
    320     }
    321     return;
    322 }
    323 /* end of convertre */
    324 
    325 
    326 /*
    327  *	The following routine recognises an irregular expresion
    328  *	with the following special characters:
    329  *
    330  *		\?	-	means last match was optional
    331  *		\a	-	matches any number of characters
    332  *		\d	-	matches any number of spaces and tabs
    333  *		\p	-	matches any number of alphanumeric
    334  *				characters. The
    335  *				characters matched will be copied into
    336  *				the area pointed to by 'name'.
    337  *		\|	-	alternation
    338  *		\( \)	-	grouping used mostly for alternation and
    339  *				optionality
    340  *
    341  *	The irregular expression must be translated to internal form
    342  *	prior to calling this routine
    343  *
    344  *	The value returned is the pointer to the first non \a
    345  *	character matched.
    346  */
    347 
    348 char *
    349 expmatch(s, re, mstring)
    350     char *s;		/* string to check for a match in */
    351     char *re;		/* a converted irregular expression */
    352     char *mstring;	/* where to put whatever matches a \p */
    353 {
    354     char *cs;		/* the current symbol */
    355     char *ptr,*s1;	/* temporary pointer */
    356     boolean matched;	/* a temporary boolean */
    357 
    358     /* initial conditions */
    359     if (re == NIL)
    360 	return (NIL);
    361     cs = re;
    362     matched = FALSE;
    363 
    364     /* loop till expression string is exhausted (or at least pretty tired) */
    365     while (*cs) {
    366 	switch (*cs & (OPER | STR | META)) {
    367 
    368 	/* try to match a string */
    369 	case STR:
    370 	    matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
    371 	    if (matched) {
    372 
    373 		/* hoorah it matches */
    374 		s += SCNT(cs);
    375 		cs = SNEXT(cs);
    376 	    } else if (*cs & ALT) {
    377 
    378 		/* alternation, skip to next expression */
    379 		cs = SNEXT(cs);
    380 	    } else if (*cs & OPT) {
    381 
    382 		/* the match is optional */
    383 		cs = SNEXT(cs);
    384 		matched = 1;		/* indicate a successful match */
    385 	    } else {
    386 
    387 		/* no match, error return */
    388 		return (NIL);
    389 	    }
    390 	    break;
    391 
    392 	/* an operator, do something fancy */
    393 	case OPER:
    394 	    switch (OSYM(cs)) {
    395 
    396 	    /* this is an alternation */
    397 	    case '|':
    398 		if (matched)
    399 
    400 		    /* last thing in the alternation was a match, skip ahead */
    401 		    cs = OPTR(cs);
    402 		else
    403 
    404 		    /* no match, keep trying */
    405 		    cs = ONEXT(cs);
    406 		break;
    407 
    408 	    /* this is a grouping, recurse */
    409 	    case '(':
    410 		ptr = expmatch(s, ONEXT(cs), mstring);
    411 		if (ptr != NIL) {
    412 
    413 		    /* the subexpression matched */
    414 		    matched = 1;
    415 		    s = ptr;
    416 		} else if (*cs & ALT) {
    417 
    418 		    /* alternation, skip to next expression */
    419 		    matched = 0;
    420 		} else if (*cs & OPT) {
    421 
    422 		    /* the match is optional */
    423 		    matched = 1;	/* indicate a successful match */
    424 		} else {
    425 
    426 		    /* no match, error return */
    427 		    return (NIL);
    428 		}
    429 		cs = OPTR(cs);
    430 		break;
    431 	    }
    432 	    break;
    433 
    434 	/* try to match a metasymbol */
    435 	case META:
    436 	    switch (MSYM(cs)) {
    437 
    438 	    /* try to match anything and remember what was matched */
    439 	    case 'p':
    440 		/*
    441 		 *  This is really the same as trying the match the
    442 		 *  remaining parts of the expression to any subset
    443 		 *  of the string.
    444 		 */
    445 		s1 = s;
    446 		do {
    447 		    ptr = expmatch(s1, MNEXT(cs), mstring);
    448 		    if (ptr != NIL && s1 != s) {
    449 
    450 			/* we have a match, remember the match */
    451 			strncpy(mstring, s, s1 - s);
    452 			mstring[s1 - s] = '\0';
    453 			return (ptr);
    454 		    } else if (ptr != NIL && (*cs & OPT)) {
    455 
    456 			/* it was aoptional so no match is ok */
    457 			return (ptr);
    458 		    } else if (ptr != NIL) {
    459 
    460 			/* not optional and we still matched */
    461 			return (NIL);
    462 		    }
    463 		    if (!isalnum((unsigned char)*s1) && *s1 != '_')
    464 			return (NIL);
    465 		    if (*s1 == '\\')
    466 			x_escaped = x_escaped ? FALSE : TRUE;
    467 		    else
    468 			x_escaped = FALSE;
    469 		} while (*s1++);
    470 		return (NIL);
    471 
    472 	    /* try to match anything */
    473 	    case 'a':
    474 		/*
    475 		 *  This is really the same as trying the match the
    476 		 *  remaining parts of the expression to any subset
    477 		 *  of the string.
    478 		 */
    479 		s1 = s;
    480 		do {
    481 		    ptr = expmatch(s1, MNEXT(cs), mstring);
    482 		    if (ptr != NIL && s1 != s) {
    483 
    484 			/* we have a match */
    485 			return (ptr);
    486 		    } else if (ptr != NIL && (*cs & OPT)) {
    487 
    488 			/* it was aoptional so no match is ok */
    489 			return (ptr);
    490 		    } else if (ptr != NIL) {
    491 
    492 			/* not optional and we still matched */
    493 			return (NIL);
    494 		    }
    495 		    if (*s1 == '\\')
    496 			x_escaped = x_escaped ? FALSE : TRUE;
    497 		    else
    498 			x_escaped = FALSE;
    499 		} while (*s1++);
    500 		return (NIL);
    501 
    502 	    /* fail if we are currently x_escaped */
    503 	    case 'e':
    504 		if (x_escaped)
    505 		    return(NIL);
    506 		cs = MNEXT(cs);
    507 		break;
    508 
    509 	    /* match any number of tabs and spaces */
    510 	    case 'd':
    511 		ptr = s;
    512 		while (*s == ' ' || *s == '\t')
    513 		    s++;
    514 		if (s != ptr || s == x_start) {
    515 
    516 		    /* match, be happy */
    517 		    matched = 1;
    518 		    cs = MNEXT(cs);
    519 		} else if (*s == '\n' || *s == '\0') {
    520 
    521 		    /* match, be happy */
    522 		    matched = 1;
    523 		    cs = MNEXT(cs);
    524 		} else if (*cs & ALT) {
    525 
    526 		    /* try the next part */
    527 		    matched = 0;
    528 		    cs = MNEXT(cs);
    529 		} else if (*cs & OPT) {
    530 
    531 		    /* doesn't matter */
    532 		    matched = 1;
    533 		    cs = MNEXT(cs);
    534 		} else
    535 
    536 		    /* no match, error return */
    537 		    return (NIL);
    538 		break;
    539 
    540 	    /* check for end of line */
    541 	    case '$':
    542 		if (*s == '\0' || *s == '\n') {
    543 
    544 		    /* match, be happy */
    545 		    s++;
    546 		    matched = 1;
    547 		    cs = MNEXT(cs);
    548 		} else if (*cs & ALT) {
    549 
    550 		    /* try the next part */
    551 		    matched = 0;
    552 		    cs = MNEXT(cs);
    553 		} else if (*cs & OPT) {
    554 
    555 		    /* doesn't matter */
    556 		    matched = 1;
    557 		    cs = MNEXT(cs);
    558 		} else
    559 
    560 		    /* no match, error return */
    561 		    return (NIL);
    562 		break;
    563 
    564 	    /* check for start of line */
    565 	    case '^':
    566 		if (s == x_start) {
    567 
    568 		    /* match, be happy */
    569 		    matched = 1;
    570 		    cs = MNEXT(cs);
    571 		} else if (*cs & ALT) {
    572 
    573 		    /* try the next part */
    574 		    matched = 0;
    575 		    cs = MNEXT(cs);
    576 		} else if (*cs & OPT) {
    577 
    578 		    /* doesn't matter */
    579 		    matched = 1;
    580 		    cs = MNEXT(cs);
    581 		} else
    582 
    583 		    /* no match, error return */
    584 		    return (NIL);
    585 		break;
    586 
    587 	    /* end of a subexpression, return success */
    588 	    case ')':
    589 		return (s);
    590 	    }
    591 	    break;
    592 	}
    593     }
    594     return (s);
    595 }
    596