Home | History | Annotate | Line # | Download | only in vgrind
regexp.c revision 1.2
      1 /*
      2  * Copyright (c) 1980 The Regents of the University of California.
      3  * All rights reserved.
      4  *
      5  * Redistribution and use in source and binary forms, with or without
      6  * modification, are permitted provided that the following conditions
      7  * are met:
      8  * 1. Redistributions of source code must retain the above copyright
      9  *    notice, this list of conditions and the following disclaimer.
     10  * 2. Redistributions in binary form must reproduce the above copyright
     11  *    notice, this list of conditions and the following disclaimer in the
     12  *    documentation and/or other materials provided with the distribution.
     13  * 3. All advertising materials mentioning features or use of this software
     14  *    must display the following acknowledgement:
     15  *	This product includes software developed by the University of
     16  *	California, Berkeley and its contributors.
     17  * 4. Neither the name of the University nor the names of its contributors
     18  *    may be used to endorse or promote products derived from this software
     19  *    without specific prior written permission.
     20  *
     21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     31  * SUCH DAMAGE.
     32  */
     33 
     34 #ifndef lint
     35 /*static char sccsid[] = "from: @(#)regexp.c	5.3 (Berkeley) 6/1/90";*/
     36 static char rcsid[] = "$Id: regexp.c,v 1.2 1993/08/01 18:03:29 mycroft Exp $";
     37 #endif /* not lint */
     38 
     39 #include <ctype.h>
     40 
     41 typedef int	boolean;
     42 #define TRUE	1
     43 #define FALSE	0
     44 #define NIL	0
     45 
     46 boolean l_onecase;	/* true if upper and lower equivalent */
     47 
     48 #define makelower(c) (isupper((c)) ? tolower((c)) : (c))
     49 
     50 /*  STRNCMP -	like strncmp except that we convert the
     51  *	 	first string to lower case before comparing
     52  *		if l_onecase is set.
     53  */
     54 
     55 STRNCMP(s1, s2, len)
     56 	register char *s1,*s2;
     57 	register int len;
     58 {
     59 	if (l_onecase) {
     60 	    do
     61 		if (*s2 - makelower(*s1))
     62 			return (*s2 - makelower(*s1));
     63 		else {
     64 			s2++;
     65 			s1++;
     66 		}
     67 	    while (--len);
     68 	} else {
     69 	    do
     70 		if (*s2 - *s1)
     71 			return (*s2 - *s1);
     72 		else {
     73 			s2++;
     74 			s1++;
     75 		}
     76 	    while (--len);
     77 	}
     78 	return(0);
     79 }
     80 
     81 /*	The following routine converts an irregular expression to
     82  *	internal format.
     83  *
     84  *	Either meta symbols (\a \d or \p) or character strings or
     85  *	operations ( alternation or perenthesizing ) can be
     86  *	specified.  Each starts with a descriptor byte.  The descriptor
     87  *	byte has STR set for strings, META set for meta symbols
     88  *	and OPER set for operations.
     89  *	The descriptor byte can also have the OPT bit set if the object
     90  *	defined is optional.  Also ALT can be set to indicate an alternation.
     91  *
     92  *	For metasymbols the byte following the descriptor byte identities
     93  *	the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '(').  For
     94  *	strings the byte after the descriptor is a character count for
     95  *	the string:
     96  *
     97  *		meta symbols := descriptor
     98  *				symbol
     99  *
    100  *		strings :=	descriptor
    101  *				character count
    102  *				the string
    103  *
    104  *		operatins :=	descriptor
    105  *				symbol
    106  *				character count
    107  */
    108 
    109 /*
    110  *  handy macros for accessing parts of match blocks
    111  */
    112 #define MSYM(A) (*(A+1))	/* symbol in a meta symbol block */
    113 #define MNEXT(A) (A+2)		/* character following a metasymbol block */
    114 
    115 #define OSYM(A) (*(A+1))	/* symbol in an operation block */
    116 #define OCNT(A) (*(A+2))	/* character count */
    117 #define ONEXT(A) (A+3)		/* next character after the operation */
    118 #define OPTR(A) (A+*(A+2))	/* place pointed to by the operator */
    119 
    120 #define SCNT(A) (*(A+1))	/* byte count of a string */
    121 #define SSTR(A) (A+2)		/* address of the string */
    122 #define SNEXT(A) (A+2+*(A+1))	/* character following the string */
    123 
    124 /*
    125  *  bit flags in the descriptor
    126  */
    127 #define OPT 1
    128 #define STR 2
    129 #define META 4
    130 #define ALT 8
    131 #define OPER 16
    132 
    133 char *ure;		/* pointer current position in unconverted exp */
    134 char *ccre;		/* pointer to current position in converted exp*/
    135 char *malloc();
    136 
    137 char *
    138 convexp(re)
    139     char *re;		/* unconverted irregular expression */
    140 {
    141     register char *cre;		/* pointer to converted regular expression */
    142 
    143     /* allocate room for the converted expression */
    144     if (re == NIL)
    145 	return (NIL);
    146     if (*re == '\0')
    147 	return (NIL);
    148     cre = malloc (4 * strlen(re) + 3);
    149     ccre = cre;
    150     ure = re;
    151 
    152     /* start the conversion with a \a */
    153     *cre = META | OPT;
    154     MSYM(cre) = 'a';
    155     ccre = MNEXT(cre);
    156 
    157     /* start the conversion (its recursive) */
    158     expconv ();
    159     *ccre = 0;
    160     return (cre);
    161 }
    162 
    163 expconv()
    164 {
    165     register char *cs;		/* pointer to current symbol in converted exp */
    166     register char c;		/* character being processed */
    167     register char *acs;		/* pinter to last alternate */
    168     register int temp;
    169 
    170     /* let the conversion begin */
    171     acs = NIL;
    172     cs = NIL;
    173     while (*ure != NIL) {
    174 	switch (c = *ure++) {
    175 
    176 	case '\\':
    177 	    switch (c = *ure++) {
    178 
    179 	    /* escaped characters are just characters */
    180 	    default:
    181 		if (cs == NIL || (*cs & STR) == 0) {
    182 		    cs = ccre;
    183 		    *cs = STR;
    184 		    SCNT(cs) = 1;
    185 		    ccre += 2;
    186 		} else
    187 		    SCNT(cs)++;
    188 		*ccre++ = c;
    189 		break;
    190 
    191 	    /* normal(?) metacharacters */
    192 	    case 'a':
    193 	    case 'd':
    194 	    case 'e':
    195 	    case 'p':
    196 		if (acs != NIL && acs != cs) {
    197 		    do {
    198 			temp = OCNT(acs);
    199 			OCNT(acs) = ccre - acs;
    200 			acs -= temp;
    201 		    } while (temp != 0);
    202 		    acs = NIL;
    203 		}
    204 		cs = ccre;
    205 		*cs = META;
    206 		MSYM(cs) = c;
    207 		ccre = MNEXT(cs);
    208 		break;
    209 	    }
    210 	    break;
    211 
    212 	/* just put the symbol in */
    213 	case '^':
    214 	case '$':
    215 	    if (acs != NIL && acs != cs) {
    216 		do {
    217 		    temp = OCNT(acs);
    218 		    OCNT(acs) = ccre - acs;
    219 		    acs -= temp;
    220 		} while (temp != 0);
    221 		acs = NIL;
    222 	    }
    223 	    cs = ccre;
    224 	    *cs = META;
    225 	    MSYM(cs) = c;
    226 	    ccre = MNEXT(cs);
    227 	    break;
    228 
    229 	/* mark the last match sequence as optional */
    230 	case '?':
    231 	    if (cs)
    232 	    	*cs = *cs | OPT;
    233 	    break;
    234 
    235 	/* recurse and define a subexpression */
    236 	case '(':
    237 	    if (acs != NIL && acs != cs) {
    238 		do {
    239 		    temp = OCNT(acs);
    240 		    OCNT(acs) = ccre - acs;
    241 		    acs -= temp;
    242 		} while (temp != 0);
    243 		acs = NIL;
    244 	    }
    245 	    cs = ccre;
    246 	    *cs = OPER;
    247 	    OSYM(cs) = '(';
    248 	    ccre = ONEXT(cs);
    249 	    expconv ();
    250 	    OCNT(cs) = ccre - cs;		/* offset to next symbol */
    251 	    break;
    252 
    253 	/* return from a recursion */
    254 	case ')':
    255 	    if (acs != NIL) {
    256 		do {
    257 		    temp = OCNT(acs);
    258 		    OCNT(acs) = ccre - acs;
    259 		    acs -= temp;
    260 		} while (temp != 0);
    261 		acs = NIL;
    262 	    }
    263 	    cs = ccre;
    264 	    *cs = META;
    265 	    MSYM(cs) = c;
    266 	    ccre = MNEXT(cs);
    267 	    return;
    268 
    269 	/* mark the last match sequence as having an alternate */
    270 	/* the third byte will contain an offset to jump over the */
    271 	/* alternate match in case the first did not fail */
    272 	case '|':
    273 	    if (acs != NIL && acs != cs)
    274 		OCNT(ccre) = ccre - acs;	/* make a back pointer */
    275 	    else
    276 		OCNT(ccre) = 0;
    277 	    *cs |= ALT;
    278 	    cs = ccre;
    279 	    *cs = OPER;
    280 	    OSYM(cs) = '|';
    281 	    ccre = ONEXT(cs);
    282 	    acs = cs;	/* remember that the pointer is to be filles */
    283 	    break;
    284 
    285 	/* if its not a metasymbol just build a scharacter string */
    286 	default:
    287 	    if (cs == NIL || (*cs & STR) == 0) {
    288 		cs = ccre;
    289 		*cs = STR;
    290 		SCNT(cs) = 1;
    291 		ccre = SSTR(cs);
    292 	    } else
    293 		SCNT(cs)++;
    294 	    *ccre++ = c;
    295 	    break;
    296 	}
    297     }
    298     if (acs != NIL) {
    299 	do {
    300 	    temp = OCNT(acs);
    301 	    OCNT(acs) = ccre - acs;
    302 	    acs -= temp;
    303 	} while (temp != 0);
    304 	acs = NIL;
    305     }
    306     return;
    307 }
    308 /* end of convertre */
    309 
    310 
    311 /*
    312  *	The following routine recognises an irregular expresion
    313  *	with the following special characters:
    314  *
    315  *		\?	-	means last match was optional
    316  *		\a	-	matches any number of characters
    317  *		\d	-	matches any number of spaces and tabs
    318  *		\p	-	matches any number of alphanumeric
    319  *				characters. The
    320  *				characters matched will be copied into
    321  *				the area pointed to by 'name'.
    322  *		\|	-	alternation
    323  *		\( \)	-	grouping used mostly for alternation and
    324  *				optionality
    325  *
    326  *	The irregular expression must be translated to internal form
    327  *	prior to calling this routine
    328  *
    329  *	The value returned is the pointer to the first non \a
    330  *	character matched.
    331  */
    332 
    333 boolean _escaped;		/* true if we are currently _escaped */
    334 char *_start;			/* start of string */
    335 
    336 char *
    337 expmatch (s, re, mstring)
    338     register char *s;		/* string to check for a match in */
    339     register char *re;		/* a converted irregular expression */
    340     register char *mstring;	/* where to put whatever matches a \p */
    341 {
    342     register char *cs;		/* the current symbol */
    343     register char *ptr,*s1;	/* temporary pointer */
    344     boolean matched;		/* a temporary boolean */
    345 
    346     /* initial conditions */
    347     if (re == NIL)
    348 	return (NIL);
    349     cs = re;
    350     matched = FALSE;
    351 
    352     /* loop till expression string is exhausted (or at least pretty tired) */
    353     while (*cs) {
    354 	switch (*cs & (OPER | STR | META)) {
    355 
    356 	/* try to match a string */
    357 	case STR:
    358 	    matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
    359 	    if (matched) {
    360 
    361 		/* hoorah it matches */
    362 		s += SCNT(cs);
    363 		cs = SNEXT(cs);
    364 	    } else if (*cs & ALT) {
    365 
    366 		/* alternation, skip to next expression */
    367 		cs = SNEXT(cs);
    368 	    } else if (*cs & OPT) {
    369 
    370 		/* the match is optional */
    371 		cs = SNEXT(cs);
    372 		matched = 1;		/* indicate a successful match */
    373 	    } else {
    374 
    375 		/* no match, error return */
    376 		return (NIL);
    377 	    }
    378 	    break;
    379 
    380 	/* an operator, do something fancy */
    381 	case OPER:
    382 	    switch (OSYM(cs)) {
    383 
    384 	    /* this is an alternation */
    385 	    case '|':
    386 		if (matched)
    387 
    388 		    /* last thing in the alternation was a match, skip ahead */
    389 		    cs = OPTR(cs);
    390 		else
    391 
    392 		    /* no match, keep trying */
    393 		    cs = ONEXT(cs);
    394 		break;
    395 
    396 	    /* this is a grouping, recurse */
    397 	    case '(':
    398 		ptr = expmatch (s, ONEXT(cs), mstring);
    399 		if (ptr != NIL) {
    400 
    401 		    /* the subexpression matched */
    402 		    matched = 1;
    403 		    s = ptr;
    404 		} else if (*cs & ALT) {
    405 
    406 		    /* alternation, skip to next expression */
    407 		    matched = 0;
    408 		} else if (*cs & OPT) {
    409 
    410 		    /* the match is optional */
    411 		    matched = 1;	/* indicate a successful match */
    412 		} else {
    413 
    414 		    /* no match, error return */
    415 		    return (NIL);
    416 		}
    417 		cs = OPTR(cs);
    418 		break;
    419 	    }
    420 	    break;
    421 
    422 	/* try to match a metasymbol */
    423 	case META:
    424 	    switch (MSYM(cs)) {
    425 
    426 	    /* try to match anything and remember what was matched */
    427 	    case 'p':
    428 		/*
    429 		 *  This is really the same as trying the match the
    430 		 *  remaining parts of the expression to any subset
    431 		 *  of the string.
    432 		 */
    433 		s1 = s;
    434 		do {
    435 		    ptr = expmatch (s1, MNEXT(cs), mstring);
    436 		    if (ptr != NIL && s1 != s) {
    437 
    438 			/* we have a match, remember the match */
    439 			strncpy (mstring, s, s1 - s);
    440 			mstring[s1 - s] = '\0';
    441 			return (ptr);
    442 		    } else if (ptr != NIL && (*cs & OPT)) {
    443 
    444 			/* it was aoptional so no match is ok */
    445 			return (ptr);
    446 		    } else if (ptr != NIL) {
    447 
    448 			/* not optional and we still matched */
    449 			return (NIL);
    450 		    }
    451 		    if (!isalnum(*s1) && *s1 != '_')
    452 			return (NIL);
    453 		    if (*s1 == '\\')
    454 			_escaped = _escaped ? FALSE : TRUE;
    455 		    else
    456 			_escaped = FALSE;
    457 		} while (*s1++);
    458 		return (NIL);
    459 
    460 	    /* try to match anything */
    461 	    case 'a':
    462 		/*
    463 		 *  This is really the same as trying the match the
    464 		 *  remaining parts of the expression to any subset
    465 		 *  of the string.
    466 		 */
    467 		s1 = s;
    468 		do {
    469 		    ptr = expmatch (s1, MNEXT(cs), mstring);
    470 		    if (ptr != NIL && s1 != s) {
    471 
    472 			/* we have a match */
    473 			return (ptr);
    474 		    } else if (ptr != NIL && (*cs & OPT)) {
    475 
    476 			/* it was aoptional so no match is ok */
    477 			return (ptr);
    478 		    } else if (ptr != NIL) {
    479 
    480 			/* not optional and we still matched */
    481 			return (NIL);
    482 		    }
    483 		    if (*s1 == '\\')
    484 			_escaped = _escaped ? FALSE : TRUE;
    485 		    else
    486 			_escaped = FALSE;
    487 		} while (*s1++);
    488 		return (NIL);
    489 
    490 	    /* fail if we are currently _escaped */
    491 	    case 'e':
    492 		if (_escaped)
    493 		    return(NIL);
    494 		cs = MNEXT(cs);
    495 		break;
    496 
    497 	    /* match any number of tabs and spaces */
    498 	    case 'd':
    499 		ptr = s;
    500 		while (*s == ' ' || *s == '\t')
    501 		    s++;
    502 		if (s != ptr || s == _start) {
    503 
    504 		    /* match, be happy */
    505 		    matched = 1;
    506 		    cs = MNEXT(cs);
    507 		} else if (*s == '\n' || *s == '\0') {
    508 
    509 		    /* match, be happy */
    510 		    matched = 1;
    511 		    cs = MNEXT(cs);
    512 		} else if (*cs & ALT) {
    513 
    514 		    /* try the next part */
    515 		    matched = 0;
    516 		    cs = MNEXT(cs);
    517 		} else if (*cs & OPT) {
    518 
    519 		    /* doesn't matter */
    520 		    matched = 1;
    521 		    cs = MNEXT(cs);
    522 		} else
    523 
    524 		    /* no match, error return */
    525 		    return (NIL);
    526 		break;
    527 
    528 	    /* check for end of line */
    529 	    case '$':
    530 		if (*s == '\0' || *s == '\n') {
    531 
    532 		    /* match, be happy */
    533 		    s++;
    534 		    matched = 1;
    535 		    cs = MNEXT(cs);
    536 		} else if (*cs & ALT) {
    537 
    538 		    /* try the next part */
    539 		    matched = 0;
    540 		    cs = MNEXT(cs);
    541 		} else if (*cs & OPT) {
    542 
    543 		    /* doesn't matter */
    544 		    matched = 1;
    545 		    cs = MNEXT(cs);
    546 		} else
    547 
    548 		    /* no match, error return */
    549 		    return (NIL);
    550 		break;
    551 
    552 	    /* check for start of line */
    553 	    case '^':
    554 		if (s == _start) {
    555 
    556 		    /* match, be happy */
    557 		    matched = 1;
    558 		    cs = MNEXT(cs);
    559 		} else if (*cs & ALT) {
    560 
    561 		    /* try the next part */
    562 		    matched = 0;
    563 		    cs = MNEXT(cs);
    564 		} else if (*cs & OPT) {
    565 
    566 		    /* doesn't matter */
    567 		    matched = 1;
    568 		    cs = MNEXT(cs);
    569 		} else
    570 
    571 		    /* no match, error return */
    572 		    return (NIL);
    573 		break;
    574 
    575 	    /* end of a subexpression, return success */
    576 	    case ')':
    577 		return (s);
    578 	    }
    579 	    break;
    580 	}
    581     }
    582     return (s);
    583 }
    584