Home | History | Annotate | Line # | Download | only in vgrind
regexp.c revision 1.1
      1 /*
      2  * Copyright (c) 1980 The Regents of the University of California.
      3  * All rights reserved.
      4  *
      5  * Redistribution and use in source and binary forms, with or without
      6  * modification, are permitted provided that the following conditions
      7  * are met:
      8  * 1. Redistributions of source code must retain the above copyright
      9  *    notice, this list of conditions and the following disclaimer.
     10  * 2. Redistributions in binary form must reproduce the above copyright
     11  *    notice, this list of conditions and the following disclaimer in the
     12  *    documentation and/or other materials provided with the distribution.
     13  * 3. All advertising materials mentioning features or use of this software
     14  *    must display the following acknowledgement:
     15  *	This product includes software developed by the University of
     16  *	California, Berkeley and its contributors.
     17  * 4. Neither the name of the University nor the names of its contributors
     18  *    may be used to endorse or promote products derived from this software
     19  *    without specific prior written permission.
     20  *
     21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     31  * SUCH DAMAGE.
     32  */
     33 
     34 #ifndef lint
     35 static char sccsid[] = "@(#)regexp.c	5.3 (Berkeley) 6/1/90";
     36 #endif /* not lint */
     37 
     38 #include <ctype.h>
     39 
     40 typedef int	boolean;
     41 #define TRUE	1
     42 #define FALSE	0
     43 #define NIL	0
     44 
     45 boolean l_onecase;	/* true if upper and lower equivalent */
     46 
     47 #define makelower(c) (isupper((c)) ? tolower((c)) : (c))
     48 
     49 /*  STRNCMP -	like strncmp except that we convert the
     50  *	 	first string to lower case before comparing
     51  *		if l_onecase is set.
     52  */
     53 
     54 STRNCMP(s1, s2, len)
     55 	register char *s1,*s2;
     56 	register int len;
     57 {
     58 	if (l_onecase) {
     59 	    do
     60 		if (*s2 - makelower(*s1))
     61 			return (*s2 - makelower(*s1));
     62 		else {
     63 			s2++;
     64 			s1++;
     65 		}
     66 	    while (--len);
     67 	} else {
     68 	    do
     69 		if (*s2 - *s1)
     70 			return (*s2 - *s1);
     71 		else {
     72 			s2++;
     73 			s1++;
     74 		}
     75 	    while (--len);
     76 	}
     77 	return(0);
     78 }
     79 
     80 /*	The following routine converts an irregular expression to
     81  *	internal format.
     82  *
     83  *	Either meta symbols (\a \d or \p) or character strings or
     84  *	operations ( alternation or perenthesizing ) can be
     85  *	specified.  Each starts with a descriptor byte.  The descriptor
     86  *	byte has STR set for strings, META set for meta symbols
     87  *	and OPER set for operations.
     88  *	The descriptor byte can also have the OPT bit set if the object
     89  *	defined is optional.  Also ALT can be set to indicate an alternation.
     90  *
     91  *	For metasymbols the byte following the descriptor byte identities
     92  *	the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '(').  For
     93  *	strings the byte after the descriptor is a character count for
     94  *	the string:
     95  *
     96  *		meta symbols := descriptor
     97  *				symbol
     98  *
     99  *		strings :=	descriptor
    100  *				character count
    101  *				the string
    102  *
    103  *		operatins :=	descriptor
    104  *				symbol
    105  *				character count
    106  */
    107 
    108 /*
    109  *  handy macros for accessing parts of match blocks
    110  */
    111 #define MSYM(A) (*(A+1))	/* symbol in a meta symbol block */
    112 #define MNEXT(A) (A+2)		/* character following a metasymbol block */
    113 
    114 #define OSYM(A) (*(A+1))	/* symbol in an operation block */
    115 #define OCNT(A) (*(A+2))	/* character count */
    116 #define ONEXT(A) (A+3)		/* next character after the operation */
    117 #define OPTR(A) (A+*(A+2))	/* place pointed to by the operator */
    118 
    119 #define SCNT(A) (*(A+1))	/* byte count of a string */
    120 #define SSTR(A) (A+2)		/* address of the string */
    121 #define SNEXT(A) (A+2+*(A+1))	/* character following the string */
    122 
    123 /*
    124  *  bit flags in the descriptor
    125  */
    126 #define OPT 1
    127 #define STR 2
    128 #define META 4
    129 #define ALT 8
    130 #define OPER 16
    131 
    132 char *ure;		/* pointer current position in unconverted exp */
    133 char *ccre;		/* pointer to current position in converted exp*/
    134 char *malloc();
    135 
    136 char *
    137 convexp(re)
    138     char *re;		/* unconverted irregular expression */
    139 {
    140     register char *cre;		/* pointer to converted regular expression */
    141 
    142     /* allocate room for the converted expression */
    143     if (re == NIL)
    144 	return (NIL);
    145     if (*re == '\0')
    146 	return (NIL);
    147     cre = malloc (4 * strlen(re) + 3);
    148     ccre = cre;
    149     ure = re;
    150 
    151     /* start the conversion with a \a */
    152     *cre = META | OPT;
    153     MSYM(cre) = 'a';
    154     ccre = MNEXT(cre);
    155 
    156     /* start the conversion (its recursive) */
    157     expconv ();
    158     *ccre = 0;
    159     return (cre);
    160 }
    161 
    162 expconv()
    163 {
    164     register char *cs;		/* pointer to current symbol in converted exp */
    165     register char c;		/* character being processed */
    166     register char *acs;		/* pinter to last alternate */
    167     register int temp;
    168 
    169     /* let the conversion begin */
    170     acs = NIL;
    171     cs = NIL;
    172     while (*ure != NIL) {
    173 	switch (c = *ure++) {
    174 
    175 	case '\\':
    176 	    switch (c = *ure++) {
    177 
    178 	    /* escaped characters are just characters */
    179 	    default:
    180 		if (cs == NIL || (*cs & STR) == 0) {
    181 		    cs = ccre;
    182 		    *cs = STR;
    183 		    SCNT(cs) = 1;
    184 		    ccre += 2;
    185 		} else
    186 		    SCNT(cs)++;
    187 		*ccre++ = c;
    188 		break;
    189 
    190 	    /* normal(?) metacharacters */
    191 	    case 'a':
    192 	    case 'd':
    193 	    case 'e':
    194 	    case 'p':
    195 		if (acs != NIL && acs != cs) {
    196 		    do {
    197 			temp = OCNT(acs);
    198 			OCNT(acs) = ccre - acs;
    199 			acs -= temp;
    200 		    } while (temp != 0);
    201 		    acs = NIL;
    202 		}
    203 		cs = ccre;
    204 		*cs = META;
    205 		MSYM(cs) = c;
    206 		ccre = MNEXT(cs);
    207 		break;
    208 	    }
    209 	    break;
    210 
    211 	/* just put the symbol in */
    212 	case '^':
    213 	case '$':
    214 	    if (acs != NIL && acs != cs) {
    215 		do {
    216 		    temp = OCNT(acs);
    217 		    OCNT(acs) = ccre - acs;
    218 		    acs -= temp;
    219 		} while (temp != 0);
    220 		acs = NIL;
    221 	    }
    222 	    cs = ccre;
    223 	    *cs = META;
    224 	    MSYM(cs) = c;
    225 	    ccre = MNEXT(cs);
    226 	    break;
    227 
    228 	/* mark the last match sequence as optional */
    229 	case '?':
    230 	    if (cs)
    231 	    	*cs = *cs | OPT;
    232 	    break;
    233 
    234 	/* recurse and define a subexpression */
    235 	case '(':
    236 	    if (acs != NIL && acs != cs) {
    237 		do {
    238 		    temp = OCNT(acs);
    239 		    OCNT(acs) = ccre - acs;
    240 		    acs -= temp;
    241 		} while (temp != 0);
    242 		acs = NIL;
    243 	    }
    244 	    cs = ccre;
    245 	    *cs = OPER;
    246 	    OSYM(cs) = '(';
    247 	    ccre = ONEXT(cs);
    248 	    expconv ();
    249 	    OCNT(cs) = ccre - cs;		/* offset to next symbol */
    250 	    break;
    251 
    252 	/* return from a recursion */
    253 	case ')':
    254 	    if (acs != NIL) {
    255 		do {
    256 		    temp = OCNT(acs);
    257 		    OCNT(acs) = ccre - acs;
    258 		    acs -= temp;
    259 		} while (temp != 0);
    260 		acs = NIL;
    261 	    }
    262 	    cs = ccre;
    263 	    *cs = META;
    264 	    MSYM(cs) = c;
    265 	    ccre = MNEXT(cs);
    266 	    return;
    267 
    268 	/* mark the last match sequence as having an alternate */
    269 	/* the third byte will contain an offset to jump over the */
    270 	/* alternate match in case the first did not fail */
    271 	case '|':
    272 	    if (acs != NIL && acs != cs)
    273 		OCNT(ccre) = ccre - acs;	/* make a back pointer */
    274 	    else
    275 		OCNT(ccre) = 0;
    276 	    *cs |= ALT;
    277 	    cs = ccre;
    278 	    *cs = OPER;
    279 	    OSYM(cs) = '|';
    280 	    ccre = ONEXT(cs);
    281 	    acs = cs;	/* remember that the pointer is to be filles */
    282 	    break;
    283 
    284 	/* if its not a metasymbol just build a scharacter string */
    285 	default:
    286 	    if (cs == NIL || (*cs & STR) == 0) {
    287 		cs = ccre;
    288 		*cs = STR;
    289 		SCNT(cs) = 1;
    290 		ccre = SSTR(cs);
    291 	    } else
    292 		SCNT(cs)++;
    293 	    *ccre++ = c;
    294 	    break;
    295 	}
    296     }
    297     if (acs != NIL) {
    298 	do {
    299 	    temp = OCNT(acs);
    300 	    OCNT(acs) = ccre - acs;
    301 	    acs -= temp;
    302 	} while (temp != 0);
    303 	acs = NIL;
    304     }
    305     return;
    306 }
    307 /* end of convertre */
    308 
    309 
    310 /*
    311  *	The following routine recognises an irregular expresion
    312  *	with the following special characters:
    313  *
    314  *		\?	-	means last match was optional
    315  *		\a	-	matches any number of characters
    316  *		\d	-	matches any number of spaces and tabs
    317  *		\p	-	matches any number of alphanumeric
    318  *				characters. The
    319  *				characters matched will be copied into
    320  *				the area pointed to by 'name'.
    321  *		\|	-	alternation
    322  *		\( \)	-	grouping used mostly for alternation and
    323  *				optionality
    324  *
    325  *	The irregular expression must be translated to internal form
    326  *	prior to calling this routine
    327  *
    328  *	The value returned is the pointer to the first non \a
    329  *	character matched.
    330  */
    331 
    332 boolean _escaped;		/* true if we are currently _escaped */
    333 char *_start;			/* start of string */
    334 
    335 char *
    336 expmatch (s, re, mstring)
    337     register char *s;		/* string to check for a match in */
    338     register char *re;		/* a converted irregular expression */
    339     register char *mstring;	/* where to put whatever matches a \p */
    340 {
    341     register char *cs;		/* the current symbol */
    342     register char *ptr,*s1;	/* temporary pointer */
    343     boolean matched;		/* a temporary boolean */
    344 
    345     /* initial conditions */
    346     if (re == NIL)
    347 	return (NIL);
    348     cs = re;
    349     matched = FALSE;
    350 
    351     /* loop till expression string is exhausted (or at least pretty tired) */
    352     while (*cs) {
    353 	switch (*cs & (OPER | STR | META)) {
    354 
    355 	/* try to match a string */
    356 	case STR:
    357 	    matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
    358 	    if (matched) {
    359 
    360 		/* hoorah it matches */
    361 		s += SCNT(cs);
    362 		cs = SNEXT(cs);
    363 	    } else if (*cs & ALT) {
    364 
    365 		/* alternation, skip to next expression */
    366 		cs = SNEXT(cs);
    367 	    } else if (*cs & OPT) {
    368 
    369 		/* the match is optional */
    370 		cs = SNEXT(cs);
    371 		matched = 1;		/* indicate a successful match */
    372 	    } else {
    373 
    374 		/* no match, error return */
    375 		return (NIL);
    376 	    }
    377 	    break;
    378 
    379 	/* an operator, do something fancy */
    380 	case OPER:
    381 	    switch (OSYM(cs)) {
    382 
    383 	    /* this is an alternation */
    384 	    case '|':
    385 		if (matched)
    386 
    387 		    /* last thing in the alternation was a match, skip ahead */
    388 		    cs = OPTR(cs);
    389 		else
    390 
    391 		    /* no match, keep trying */
    392 		    cs = ONEXT(cs);
    393 		break;
    394 
    395 	    /* this is a grouping, recurse */
    396 	    case '(':
    397 		ptr = expmatch (s, ONEXT(cs), mstring);
    398 		if (ptr != NIL) {
    399 
    400 		    /* the subexpression matched */
    401 		    matched = 1;
    402 		    s = ptr;
    403 		} else if (*cs & ALT) {
    404 
    405 		    /* alternation, skip to next expression */
    406 		    matched = 0;
    407 		} else if (*cs & OPT) {
    408 
    409 		    /* the match is optional */
    410 		    matched = 1;	/* indicate a successful match */
    411 		} else {
    412 
    413 		    /* no match, error return */
    414 		    return (NIL);
    415 		}
    416 		cs = OPTR(cs);
    417 		break;
    418 	    }
    419 	    break;
    420 
    421 	/* try to match a metasymbol */
    422 	case META:
    423 	    switch (MSYM(cs)) {
    424 
    425 	    /* try to match anything and remember what was matched */
    426 	    case 'p':
    427 		/*
    428 		 *  This is really the same as trying the match the
    429 		 *  remaining parts of the expression to any subset
    430 		 *  of the string.
    431 		 */
    432 		s1 = s;
    433 		do {
    434 		    ptr = expmatch (s1, MNEXT(cs), mstring);
    435 		    if (ptr != NIL && s1 != s) {
    436 
    437 			/* we have a match, remember the match */
    438 			strncpy (mstring, s, s1 - s);
    439 			mstring[s1 - s] = '\0';
    440 			return (ptr);
    441 		    } else if (ptr != NIL && (*cs & OPT)) {
    442 
    443 			/* it was aoptional so no match is ok */
    444 			return (ptr);
    445 		    } else if (ptr != NIL) {
    446 
    447 			/* not optional and we still matched */
    448 			return (NIL);
    449 		    }
    450 		    if (!isalnum(*s1) && *s1 != '_')
    451 			return (NIL);
    452 		    if (*s1 == '\\')
    453 			_escaped = _escaped ? FALSE : TRUE;
    454 		    else
    455 			_escaped = FALSE;
    456 		} while (*s1++);
    457 		return (NIL);
    458 
    459 	    /* try to match anything */
    460 	    case 'a':
    461 		/*
    462 		 *  This is really the same as trying the match the
    463 		 *  remaining parts of the expression to any subset
    464 		 *  of the string.
    465 		 */
    466 		s1 = s;
    467 		do {
    468 		    ptr = expmatch (s1, MNEXT(cs), mstring);
    469 		    if (ptr != NIL && s1 != s) {
    470 
    471 			/* we have a match */
    472 			return (ptr);
    473 		    } else if (ptr != NIL && (*cs & OPT)) {
    474 
    475 			/* it was aoptional so no match is ok */
    476 			return (ptr);
    477 		    } else if (ptr != NIL) {
    478 
    479 			/* not optional and we still matched */
    480 			return (NIL);
    481 		    }
    482 		    if (*s1 == '\\')
    483 			_escaped = _escaped ? FALSE : TRUE;
    484 		    else
    485 			_escaped = FALSE;
    486 		} while (*s1++);
    487 		return (NIL);
    488 
    489 	    /* fail if we are currently _escaped */
    490 	    case 'e':
    491 		if (_escaped)
    492 		    return(NIL);
    493 		cs = MNEXT(cs);
    494 		break;
    495 
    496 	    /* match any number of tabs and spaces */
    497 	    case 'd':
    498 		ptr = s;
    499 		while (*s == ' ' || *s == '\t')
    500 		    s++;
    501 		if (s != ptr || s == _start) {
    502 
    503 		    /* match, be happy */
    504 		    matched = 1;
    505 		    cs = MNEXT(cs);
    506 		} else if (*s == '\n' || *s == '\0') {
    507 
    508 		    /* match, be happy */
    509 		    matched = 1;
    510 		    cs = MNEXT(cs);
    511 		} else if (*cs & ALT) {
    512 
    513 		    /* try the next part */
    514 		    matched = 0;
    515 		    cs = MNEXT(cs);
    516 		} else if (*cs & OPT) {
    517 
    518 		    /* doesn't matter */
    519 		    matched = 1;
    520 		    cs = MNEXT(cs);
    521 		} else
    522 
    523 		    /* no match, error return */
    524 		    return (NIL);
    525 		break;
    526 
    527 	    /* check for end of line */
    528 	    case '$':
    529 		if (*s == '\0' || *s == '\n') {
    530 
    531 		    /* match, be happy */
    532 		    s++;
    533 		    matched = 1;
    534 		    cs = MNEXT(cs);
    535 		} else if (*cs & ALT) {
    536 
    537 		    /* try the next part */
    538 		    matched = 0;
    539 		    cs = MNEXT(cs);
    540 		} else if (*cs & OPT) {
    541 
    542 		    /* doesn't matter */
    543 		    matched = 1;
    544 		    cs = MNEXT(cs);
    545 		} else
    546 
    547 		    /* no match, error return */
    548 		    return (NIL);
    549 		break;
    550 
    551 	    /* check for start of line */
    552 	    case '^':
    553 		if (s == _start) {
    554 
    555 		    /* match, be happy */
    556 		    matched = 1;
    557 		    cs = MNEXT(cs);
    558 		} else if (*cs & ALT) {
    559 
    560 		    /* try the next part */
    561 		    matched = 0;
    562 		    cs = MNEXT(cs);
    563 		} else if (*cs & OPT) {
    564 
    565 		    /* doesn't matter */
    566 		    matched = 1;
    567 		    cs = MNEXT(cs);
    568 		} else
    569 
    570 		    /* no match, error return */
    571 		    return (NIL);
    572 		break;
    573 
    574 	    /* end of a subexpression, return success */
    575 	    case ')':
    576 		return (s);
    577 	    }
    578 	    break;
    579 	}
    580     }
    581     return (s);
    582 }
    583