Home | History | Annotate | Line # | Download | only in vgrind
regexp.c revision 1.6
      1 /*	$NetBSD: regexp.c,v 1.6 2003/07/12 13:37:15 itojun Exp $	*/
      2 
      3 /*
      4  * Copyright (c) 1980, 1993
      5  *	The Regents of the University of California.  All rights reserved.
      6  *
      7  *
      8  * Redistribution and use in source and binary forms, with or without
      9  * modification, are permitted provided that the following conditions
     10  * are met:
     11  * 1. Redistributions of source code must retain the above copyright
     12  *    notice, this list of conditions and the following disclaimer.
     13  * 2. Redistributions in binary form must reproduce the above copyright
     14  *    notice, this list of conditions and the following disclaimer in the
     15  *    documentation and/or other materials provided with the distribution.
     16  * 3. All advertising materials mentioning features or use of this software
     17  *    must display the following acknowledgement:
     18  *	This product includes software developed by the University of
     19  *	California, Berkeley and its contributors.
     20  * 4. Neither the name of the University nor the names of its contributors
     21  *    may be used to endorse or promote products derived from this software
     22  *    without specific prior written permission.
     23  *
     24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     34  * SUCH DAMAGE.
     35  */
     36 
     37 #include <sys/cdefs.h>
     38 #ifndef lint
     39 __COPYRIGHT("@(#) Copyright (c) 1980, 1993\n\
     40 	The Regents of the University of California.  All rights reserved.\n");
     41 #endif /* not lint */
     42 
     43 #ifndef lint
     44 #if 0
     45 static char sccsid[] = "@(#)regexp.c	8.1 (Berkeley) 6/6/93";
     46 #endif
     47 __RCSID("$NetBSD: regexp.c,v 1.6 2003/07/12 13:37:15 itojun Exp $");
     48 #endif /* not lint */
     49 
     50 #include <ctype.h>
     51 #include <stdlib.h>
     52 #include <string.h>
     53 #include "extern.h"
     54 
     55 #define FALSE	0
     56 #define TRUE	!(FALSE)
     57 #define NIL	0
     58 
     59 static void	expconv __P((void));
     60 
     61 boolean	 x_escaped;	/* true if we are currently x_escaped */
     62 char	*x_start;	/* start of string */
     63 boolean	 l_onecase;	/* true if upper and lower equivalent */
     64 
     65 #define makelower(c) (isupper((unsigned char)(c)) ? tolower((c)) : (c))
     66 
     67 /*  STRNCMP -	like strncmp except that we convert the
     68  *	 	first string to lower case before comparing
     69  *		if l_onecase is set.
     70  */
     71 
     72 int
     73 STRNCMP(s1, s2, len)
     74 	char *s1,*s2;
     75 	int len;
     76 {
     77 	if (l_onecase) {
     78 	    do
     79 		if (*s2 - makelower(*s1))
     80 			return (*s2 - makelower(*s1));
     81 		else {
     82 			s2++;
     83 			s1++;
     84 		}
     85 	    while (--len);
     86 	} else {
     87 	    do
     88 		if (*s2 - *s1)
     89 			return (*s2 - *s1);
     90 		else {
     91 			s2++;
     92 			s1++;
     93 		}
     94 	    while (--len);
     95 	}
     96 	return(0);
     97 }
     98 
     99 /*	The following routine converts an irregular expression to
    100  *	internal format.
    101  *
    102  *	Either meta symbols (\a \d or \p) or character strings or
    103  *	operations ( alternation or perenthesizing ) can be
    104  *	specified.  Each starts with a descriptor byte.  The descriptor
    105  *	byte has STR set for strings, META set for meta symbols
    106  *	and OPER set for operations.
    107  *	The descriptor byte can also have the OPT bit set if the object
    108  *	defined is optional.  Also ALT can be set to indicate an alternation.
    109  *
    110  *	For metasymbols the byte following the descriptor byte identities
    111  *	the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '(').  For
    112  *	strings the byte after the descriptor is a character count for
    113  *	the string:
    114  *
    115  *		meta symbols := descriptor
    116  *				symbol
    117  *
    118  *		strings :=	descriptor
    119  *				character count
    120  *				the string
    121  *
    122  *		operatins :=	descriptor
    123  *				symbol
    124  *				character count
    125  */
    126 
    127 /*
    128  *  handy macros for accessing parts of match blocks
    129  */
    130 #define MSYM(A) (*(A+1))	/* symbol in a meta symbol block */
    131 #define MNEXT(A) (A+2)		/* character following a metasymbol block */
    132 
    133 #define OSYM(A) (*(A+1))	/* symbol in an operation block */
    134 #define OCNT(A) (*(A+2))	/* character count */
    135 #define ONEXT(A) (A+3)		/* next character after the operation */
    136 #define OPTR(A) (A+*(A+2))	/* place pointed to by the operator */
    137 
    138 #define SCNT(A) (*(A+1))	/* byte count of a string */
    139 #define SSTR(A) (A+2)		/* address of the string */
    140 #define SNEXT(A) (A+2+*(A+1))	/* character following the string */
    141 
    142 /*
    143  *  bit flags in the descriptor
    144  */
    145 #define OPT 1
    146 #define STR 2
    147 #define META 4
    148 #define ALT 8
    149 #define OPER 16
    150 
    151 static char *ccre;	/* pointer to current position in converted exp*/
    152 static char *ure;	/* pointer current position in unconverted exp */
    153 
    154 char *
    155 convexp(re)
    156     char *re;		/* unconverted irregular expression */
    157 {
    158     char *cre;		/* pointer to converted regular expression */
    159 
    160     /* allocate room for the converted expression */
    161     if (re == NIL)
    162 	return (NIL);
    163     if (*re == '\0')
    164 	return (NIL);
    165     cre = malloc(4 * strlen(re) + 3);
    166     ccre = cre;
    167     ure = re;
    168 
    169     /* start the conversion with a \a */
    170     *cre = META | OPT;
    171     MSYM(cre) = 'a';
    172     ccre = MNEXT(cre);
    173 
    174     /* start the conversion (its recursive) */
    175     expconv();
    176     *ccre = 0;
    177     return (cre);
    178 }
    179 
    180 static void
    181 expconv()
    182 {
    183     char *cs;		/* pointer to current symbol in converted exp */
    184     char c;		/* character being processed */
    185     char *acs;		/* pinter to last alternate */
    186     int temp;
    187 
    188     /* let the conversion begin */
    189     acs = NIL;
    190     cs = NIL;
    191     while (*ure != NIL) {
    192 	switch (c = *ure++) {
    193 
    194 	case '\\':
    195 	    switch (c = *ure++) {
    196 
    197 	    /* escaped characters are just characters */
    198 	    default:
    199 		if (cs == NIL || (*cs & STR) == 0) {
    200 		    cs = ccre;
    201 		    *cs = STR;
    202 		    SCNT(cs) = 1;
    203 		    ccre += 2;
    204 		} else
    205 		    SCNT(cs)++;
    206 		*ccre++ = c;
    207 		break;
    208 
    209 	    /* normal(?) metacharacters */
    210 	    case 'a':
    211 	    case 'd':
    212 	    case 'e':
    213 	    case 'p':
    214 		if (acs != NIL && acs != cs) {
    215 		    do {
    216 			temp = OCNT(acs);
    217 			OCNT(acs) = ccre - acs;
    218 			acs -= temp;
    219 		    } while (temp != 0);
    220 		    acs = NIL;
    221 		}
    222 		cs = ccre;
    223 		*cs = META;
    224 		MSYM(cs) = c;
    225 		ccre = MNEXT(cs);
    226 		break;
    227 	    }
    228 	    break;
    229 
    230 	/* just put the symbol in */
    231 	case '^':
    232 	case '$':
    233 	    if (acs != NIL && acs != cs) {
    234 		do {
    235 		    temp = OCNT(acs);
    236 		    OCNT(acs) = ccre - acs;
    237 		    acs -= temp;
    238 		} while (temp != 0);
    239 		acs = NIL;
    240 	    }
    241 	    cs = ccre;
    242 	    *cs = META;
    243 	    MSYM(cs) = c;
    244 	    ccre = MNEXT(cs);
    245 	    break;
    246 
    247 	/* mark the last match sequence as optional */
    248 	case '?':
    249 	    if (cs)
    250 	    	*cs = *cs | OPT;
    251 	    break;
    252 
    253 	/* recurse and define a subexpression */
    254 	case '(':
    255 	    if (acs != NIL && acs != cs) {
    256 		do {
    257 		    temp = OCNT(acs);
    258 		    OCNT(acs) = ccre - acs;
    259 		    acs -= temp;
    260 		} while (temp != 0);
    261 		acs = NIL;
    262 	    }
    263 	    cs = ccre;
    264 	    *cs = OPER;
    265 	    OSYM(cs) = '(';
    266 	    ccre = ONEXT(cs);
    267 	    expconv();
    268 	    OCNT(cs) = ccre - cs;		/* offset to next symbol */
    269 	    break;
    270 
    271 	/* reurn from a recursion */
    272 	case ')':
    273 	    if (acs != NIL) {
    274 		do {
    275 		    temp = OCNT(acs);
    276 		    OCNT(acs) = ccre - acs;
    277 		    acs -= temp;
    278 		} while (temp != 0);
    279 		acs = NIL;
    280 	    }
    281 	    cs = ccre;
    282 	    *cs = META;
    283 	    MSYM(cs) = c;
    284 	    ccre = MNEXT(cs);
    285 	    return;
    286 
    287 	/* mark the last match sequence as having an alternate */
    288 	/* the third byte will contain an offset to jump over the */
    289 	/* alternate match in case the first did not fail */
    290 	case '|':
    291 	    if (acs != NIL && acs != cs)
    292 		OCNT(ccre) = ccre - acs;	/* make a back pointer */
    293 	    else
    294 		OCNT(ccre) = 0;
    295 	    *cs |= ALT;
    296 	    cs = ccre;
    297 	    *cs = OPER;
    298 	    OSYM(cs) = '|';
    299 	    ccre = ONEXT(cs);
    300 	    acs = cs;	/* remember that the pointer is to be filles */
    301 	    break;
    302 
    303 	/* if its not a metasymbol just build a scharacter string */
    304 	default:
    305 	    if (cs == NIL || (*cs & STR) == 0) {
    306 		cs = ccre;
    307 		*cs = STR;
    308 		SCNT(cs) = 1;
    309 		ccre = SSTR(cs);
    310 	    } else
    311 		SCNT(cs)++;
    312 	    *ccre++ = c;
    313 	    break;
    314 	}
    315     }
    316     if (acs != NIL) {
    317 	do {
    318 	    temp = OCNT(acs);
    319 	    OCNT(acs) = ccre - acs;
    320 	    acs -= temp;
    321 	} while (temp != 0);
    322 	acs = NIL;
    323     }
    324     return;
    325 }
    326 /* end of convertre */
    327 
    328 
    329 /*
    330  *	The following routine recognises an irregular expresion
    331  *	with the following special characters:
    332  *
    333  *		\?	-	means last match was optional
    334  *		\a	-	matches any number of characters
    335  *		\d	-	matches any number of spaces and tabs
    336  *		\p	-	matches any number of alphanumeric
    337  *				characters. The
    338  *				characters matched will be copied into
    339  *				the area pointed to by 'name'.
    340  *		\|	-	alternation
    341  *		\( \)	-	grouping used mostly for alternation and
    342  *				optionality
    343  *
    344  *	The irregular expression must be translated to internal form
    345  *	prior to calling this routine
    346  *
    347  *	The value returned is the pointer to the first non \a
    348  *	character matched.
    349  */
    350 
    351 char *
    352 expmatch(s, re, mstring)
    353     char *s;		/* string to check for a match in */
    354     char *re;		/* a converted irregular expression */
    355     char *mstring;	/* where to put whatever matches a \p */
    356 {
    357     char *cs;		/* the current symbol */
    358     char *ptr,*s1;	/* temporary pointer */
    359     boolean matched;	/* a temporary boolean */
    360 
    361     /* initial conditions */
    362     if (re == NIL)
    363 	return (NIL);
    364     cs = re;
    365     matched = FALSE;
    366 
    367     /* loop till expression string is exhausted (or at least pretty tired) */
    368     while (*cs) {
    369 	switch (*cs & (OPER | STR | META)) {
    370 
    371 	/* try to match a string */
    372 	case STR:
    373 	    matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
    374 	    if (matched) {
    375 
    376 		/* hoorah it matches */
    377 		s += SCNT(cs);
    378 		cs = SNEXT(cs);
    379 	    } else if (*cs & ALT) {
    380 
    381 		/* alternation, skip to next expression */
    382 		cs = SNEXT(cs);
    383 	    } else if (*cs & OPT) {
    384 
    385 		/* the match is optional */
    386 		cs = SNEXT(cs);
    387 		matched = 1;		/* indicate a successful match */
    388 	    } else {
    389 
    390 		/* no match, error return */
    391 		return (NIL);
    392 	    }
    393 	    break;
    394 
    395 	/* an operator, do something fancy */
    396 	case OPER:
    397 	    switch (OSYM(cs)) {
    398 
    399 	    /* this is an alternation */
    400 	    case '|':
    401 		if (matched)
    402 
    403 		    /* last thing in the alternation was a match, skip ahead */
    404 		    cs = OPTR(cs);
    405 		else
    406 
    407 		    /* no match, keep trying */
    408 		    cs = ONEXT(cs);
    409 		break;
    410 
    411 	    /* this is a grouping, recurse */
    412 	    case '(':
    413 		ptr = expmatch(s, ONEXT(cs), mstring);
    414 		if (ptr != NIL) {
    415 
    416 		    /* the subexpression matched */
    417 		    matched = 1;
    418 		    s = ptr;
    419 		} else if (*cs & ALT) {
    420 
    421 		    /* alternation, skip to next expression */
    422 		    matched = 0;
    423 		} else if (*cs & OPT) {
    424 
    425 		    /* the match is optional */
    426 		    matched = 1;	/* indicate a successful match */
    427 		} else {
    428 
    429 		    /* no match, error return */
    430 		    return (NIL);
    431 		}
    432 		cs = OPTR(cs);
    433 		break;
    434 	    }
    435 	    break;
    436 
    437 	/* try to match a metasymbol */
    438 	case META:
    439 	    switch (MSYM(cs)) {
    440 
    441 	    /* try to match anything and remember what was matched */
    442 	    case 'p':
    443 		/*
    444 		 *  This is really the same as trying the match the
    445 		 *  remaining parts of the expression to any subset
    446 		 *  of the string.
    447 		 */
    448 		s1 = s;
    449 		do {
    450 		    ptr = expmatch(s1, MNEXT(cs), mstring);
    451 		    if (ptr != NIL && s1 != s) {
    452 
    453 			/* we have a match, remember the match */
    454 			strncpy(mstring, s, s1 - s);
    455 			mstring[s1 - s] = '\0';
    456 			return (ptr);
    457 		    } else if (ptr != NIL && (*cs & OPT)) {
    458 
    459 			/* it was aoptional so no match is ok */
    460 			return (ptr);
    461 		    } else if (ptr != NIL) {
    462 
    463 			/* not optional and we still matched */
    464 			return (NIL);
    465 		    }
    466 		    if (!isalnum((unsigned char)*s1) && *s1 != '_')
    467 			return (NIL);
    468 		    if (*s1 == '\\')
    469 			x_escaped = x_escaped ? FALSE : TRUE;
    470 		    else
    471 			x_escaped = FALSE;
    472 		} while (*s1++);
    473 		return (NIL);
    474 
    475 	    /* try to match anything */
    476 	    case 'a':
    477 		/*
    478 		 *  This is really the same as trying the match the
    479 		 *  remaining parts of the expression to any subset
    480 		 *  of the string.
    481 		 */
    482 		s1 = s;
    483 		do {
    484 		    ptr = expmatch(s1, MNEXT(cs), mstring);
    485 		    if (ptr != NIL && s1 != s) {
    486 
    487 			/* we have a match */
    488 			return (ptr);
    489 		    } else if (ptr != NIL && (*cs & OPT)) {
    490 
    491 			/* it was aoptional so no match is ok */
    492 			return (ptr);
    493 		    } else if (ptr != NIL) {
    494 
    495 			/* not optional and we still matched */
    496 			return (NIL);
    497 		    }
    498 		    if (*s1 == '\\')
    499 			x_escaped = x_escaped ? FALSE : TRUE;
    500 		    else
    501 			x_escaped = FALSE;
    502 		} while (*s1++);
    503 		return (NIL);
    504 
    505 	    /* fail if we are currently x_escaped */
    506 	    case 'e':
    507 		if (x_escaped)
    508 		    return(NIL);
    509 		cs = MNEXT(cs);
    510 		break;
    511 
    512 	    /* match any number of tabs and spaces */
    513 	    case 'd':
    514 		ptr = s;
    515 		while (*s == ' ' || *s == '\t')
    516 		    s++;
    517 		if (s != ptr || s == x_start) {
    518 
    519 		    /* match, be happy */
    520 		    matched = 1;
    521 		    cs = MNEXT(cs);
    522 		} else if (*s == '\n' || *s == '\0') {
    523 
    524 		    /* match, be happy */
    525 		    matched = 1;
    526 		    cs = MNEXT(cs);
    527 		} else if (*cs & ALT) {
    528 
    529 		    /* try the next part */
    530 		    matched = 0;
    531 		    cs = MNEXT(cs);
    532 		} else if (*cs & OPT) {
    533 
    534 		    /* doesn't matter */
    535 		    matched = 1;
    536 		    cs = MNEXT(cs);
    537 		} else
    538 
    539 		    /* no match, error return */
    540 		    return (NIL);
    541 		break;
    542 
    543 	    /* check for end of line */
    544 	    case '$':
    545 		if (*s == '\0' || *s == '\n') {
    546 
    547 		    /* match, be happy */
    548 		    s++;
    549 		    matched = 1;
    550 		    cs = MNEXT(cs);
    551 		} else if (*cs & ALT) {
    552 
    553 		    /* try the next part */
    554 		    matched = 0;
    555 		    cs = MNEXT(cs);
    556 		} else if (*cs & OPT) {
    557 
    558 		    /* doesn't matter */
    559 		    matched = 1;
    560 		    cs = MNEXT(cs);
    561 		} else
    562 
    563 		    /* no match, error return */
    564 		    return (NIL);
    565 		break;
    566 
    567 	    /* check for start of line */
    568 	    case '^':
    569 		if (s == x_start) {
    570 
    571 		    /* match, be happy */
    572 		    matched = 1;
    573 		    cs = MNEXT(cs);
    574 		} else if (*cs & ALT) {
    575 
    576 		    /* try the next part */
    577 		    matched = 0;
    578 		    cs = MNEXT(cs);
    579 		} else if (*cs & OPT) {
    580 
    581 		    /* doesn't matter */
    582 		    matched = 1;
    583 		    cs = MNEXT(cs);
    584 		} else
    585 
    586 		    /* no match, error return */
    587 		    return (NIL);
    588 		break;
    589 
    590 	    /* end of a subexpression, return success */
    591 	    case ')':
    592 		return (s);
    593 	    }
    594 	    break;
    595 	}
    596     }
    597     return (s);
    598 }
    599