Home | History | Annotate | Line # | Download | only in tr
str.c revision 1.26
      1 /*	$NetBSD: str.c,v 1.26 2013/08/11 01:29:28 dholland Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1991, 1993
      5  *	The Regents of the University of California.  All rights reserved.
      6  *
      7  * Redistribution and use in source and binary forms, with or without
      8  * modification, are permitted provided that the following conditions
      9  * are met:
     10  * 1. Redistributions of source code must retain the above copyright
     11  *    notice, this list of conditions and the following disclaimer.
     12  * 2. Redistributions in binary form must reproduce the above copyright
     13  *    notice, this list of conditions and the following disclaimer in the
     14  *    documentation and/or other materials provided with the distribution.
     15  * 3. Neither the name of the University nor the names of its contributors
     16  *    may be used to endorse or promote products derived from this software
     17  *    without specific prior written permission.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     29  * SUCH DAMAGE.
     30  */
     31 
     32 #include <sys/cdefs.h>
     33 #ifndef lint
     34 #if 0
     35 static char sccsid[] = "@(#)str.c	8.2 (Berkeley) 4/28/95";
     36 #endif
     37 __RCSID("$NetBSD: str.c,v 1.26 2013/08/11 01:29:28 dholland Exp $");
     38 #endif /* not lint */
     39 
     40 #include <sys/types.h>
     41 
     42 #include <err.h>
     43 #include <errno.h>
     44 #include <stddef.h>
     45 #include <stdio.h>
     46 #include <stdlib.h>
     47 #include <string.h>
     48 #include <ctype.h>
     49 #include <assert.h>
     50 
     51 #include "extern.h"
     52 
     53 struct str {
     54 	enum { STRING1, STRING2 } which;
     55 	enum { EOS, INFINITE, NORMAL, RANGE, SEQUENCE, SET } state;
     56 	int	 cnt;			/* character count */
     57 	int	 lastch;		/* last character */
     58 	int	equiv[2];		/* equivalence set */
     59 	int	*set;			/* set of characters */
     60 	const char *str;		/* user's string */
     61 };
     62 
     63 static int	backslash(STR *);
     64 static int	bracket(STR *);
     65 static int	c_class(const void *, const void *);
     66 static void	genclass(STR *, const char *, size_t);
     67 static void	genequiv(STR *);
     68 static int	genrange(STR *);
     69 static void	genseq(STR *);
     70 
     71 STR *
     72 str_create(int whichstring, const char *txt)
     73 {
     74 	STR *s;
     75 
     76 	s = malloc(sizeof(*s));
     77 	if (s == NULL) {
     78 		err(1, "Out of memory");
     79 	}
     80 
     81 	s->which = whichstring == 2 ? STRING2 : STRING1;
     82 	s->state = NORMAL;
     83 	s->cnt = 0;
     84 	s->lastch = OOBCH;
     85 	s->equiv[0] = 0;
     86 	s->equiv[1] = OOBCH;
     87 	s->set = NULL;
     88 	s->str = txt;
     89 
     90 	return s;
     91 }
     92 
     93 void
     94 str_destroy(STR *s)
     95 {
     96 	if (s->set != NULL && s->set != s->equiv) {
     97 		free(s->set);
     98 	}
     99 	free(s);
    100 }
    101 
    102 int
    103 next(STR *s, int *ret)
    104 {
    105 	int ch;
    106 
    107 	switch (s->state) {
    108 	case EOS:
    109 		*ret = s->lastch;
    110 		return 0;
    111 	case INFINITE:
    112 		*ret = s->lastch;
    113 		return 1;
    114 	case NORMAL:
    115 		ch = (unsigned char)s->str[0];
    116 		switch (ch) {
    117 		case '\0':
    118 			s->state = EOS;
    119 			*ret = s->lastch;
    120 			return 0;
    121 		case '\\':
    122 			s->lastch = backslash(s);
    123 			break;
    124 		case '[':
    125 			if (bracket(s)) {
    126 				return next(s, ret);
    127 			}
    128 			/* FALLTHROUGH */
    129 		default:
    130 			++s->str;
    131 			s->lastch = ch;
    132 			break;
    133 		}
    134 
    135 		/* We can start a range at any time. */
    136 		if (s->str[0] == '-' && genrange(s)) {
    137 			return next(s, ret);
    138 		}
    139 		*ret = s->lastch;
    140 		return 1;
    141 	case RANGE:
    142 		if (s->cnt == 0) {
    143 			s->state = NORMAL;
    144 			return next(s, ret);
    145 		}
    146 		s->cnt--;
    147 		++s->lastch;
    148 		*ret = s->lastch;
    149 		return 1;
    150 	case SEQUENCE:
    151 		if (s->cnt == 0) {
    152 			s->state = NORMAL;
    153 			return next(s, ret);
    154 		}
    155 		s->cnt--;
    156 		*ret = s->lastch;
    157 		return 1;
    158 	case SET:
    159 		s->lastch = s->set[s->cnt++];
    160 		if (s->lastch == OOBCH) {
    161 			s->state = NORMAL;
    162 			if (s->set != s->equiv) {
    163 				free(s->set);
    164 			}
    165 			s->set = NULL;
    166 			return next(s, ret);
    167 		}
    168 		*ret = s->lastch;
    169 		return 1;
    170 	}
    171 	/* NOTREACHED */
    172 	assert(0);
    173 	*ret = s->lastch;
    174 	return 0;
    175 }
    176 
    177 static int
    178 bracket(STR *s)
    179 {
    180 	const char *p;
    181 
    182 	switch (s->str[1]) {
    183 	case ':':				/* "[:class:]" */
    184 		if ((p = strstr(s->str + 2, ":]")) == NULL)
    185 			return 0;
    186 		s->str += 2;
    187 		genclass(s, s->str, p - s->str);
    188 		s->str = p + 2;
    189 		return 1;
    190 	case '=':				/* "[=equiv=]" */
    191 		if ((p = strstr(s->str + 2, "=]")) == NULL)
    192 			return 0;
    193 		s->str += 2;
    194 		genequiv(s);
    195 		return 1;
    196 	default:				/* "[\###*n]" or "[#*n]" */
    197 		if ((p = strpbrk(s->str + 2, "*]")) == NULL)
    198 			return 0;
    199 		if (p[0] != '*' || strchr(p, ']') == NULL)
    200 			return 0;
    201 		s->str += 1;
    202 		genseq(s);
    203 		return 1;
    204 	}
    205 	/* NOTREACHED */
    206 }
    207 
    208 typedef struct {
    209 	const char *name;
    210 	int (*func)(int);
    211 } CLASS;
    212 
    213 static const CLASS classes[] = {
    214 	{ "alnum",  isalnum  },
    215 	{ "alpha",  isalpha  },
    216 	{ "blank",  isblank  },
    217 	{ "cntrl",  iscntrl  },
    218 	{ "digit",  isdigit  },
    219 	{ "graph",  isgraph  },
    220 	{ "lower",  islower  },
    221 	{ "print",  isprint  },
    222 	{ "punct",  ispunct  },
    223 	{ "space",  isspace  },
    224 	{ "upper",  isupper  },
    225 	{ "xdigit", isxdigit },
    226 };
    227 
    228 typedef struct {
    229 	const char *name;
    230 	size_t len;
    231 } CLASSKEY;
    232 
    233 static void
    234 genclass(STR *s, const char *class, size_t len)
    235 {
    236 	int ch;
    237 	const CLASS *cp;
    238 	CLASSKEY key;
    239 	int *p;
    240 	unsigned pos, num;
    241 
    242 	/* Find the class */
    243 	key.name = class;
    244 	key.len = len;
    245 	cp = bsearch(&key, classes, __arraycount(classes), sizeof(classes[0]),
    246 		     c_class);
    247 	if (cp == NULL) {
    248 		errx(1, "unknown class %.*s", (int)len, class);
    249 	}
    250 
    251 	/*
    252 	 * Figure out what characters are in the class
    253 	 */
    254 
    255 	num = NCHARS + 1;
    256 	p = malloc(num * sizeof(*p));
    257 	if (p == NULL) {
    258 		err(1, "malloc");
    259 	}
    260 
    261 	pos = 0;
    262 	for (ch = 0; ch < NCHARS; ch++) {
    263 		if (cp->func(ch)) {
    264 			p[pos++] = ch;
    265 		}
    266 	}
    267 
    268 	p[pos++] = OOBCH;
    269 	for (; pos < num; pos++) {
    270 		p[pos] = 0;
    271 	}
    272 
    273 	/* Update the state */
    274 
    275 	s->set = p;
    276 	s->cnt = 0;
    277 	s->state = SET;
    278 }
    279 
    280 static int
    281 c_class(const void *av, const void *bv)
    282 {
    283 	const CLASSKEY *a = av;
    284 	const CLASS *b = bv;
    285 	size_t blen;
    286 	int r;
    287 
    288 	blen = strlen(b->name);
    289 	r = strncmp(a->name, b->name, a->len);
    290 	if (r != 0) {
    291 		return r;
    292 	}
    293 	if (a->len < blen) {
    294 		/* someone gave us a prefix of the right name */
    295 		return -1;
    296 	}
    297 	assert(a-> len == blen);
    298 	return 0;
    299 }
    300 
    301 /*
    302  * English doesn't have any equivalence classes, so for now
    303  * we just syntax check and grab the character.
    304  */
    305 static void
    306 genequiv(STR *s)
    307 {
    308 	if (*s->str == '\\') {
    309 		s->equiv[0] = backslash(s);
    310 		if (*s->str != '=') {
    311 			errx(1, "Misplaced equivalence equals sign");
    312 		}
    313 	} else {
    314 		s->equiv[0] = (unsigned char)s->str[0];
    315 		if (s->str[1] != '=') {
    316 			errx(1, "Misplaced equivalence equals sign");
    317 		}
    318 	}
    319 	s->str += 2;
    320 	s->cnt = 0;
    321 	s->state = SET;
    322 	s->set = s->equiv;
    323 }
    324 
    325 static int
    326 genrange(STR *s)
    327 {
    328 	int stopval;
    329 	const char *savestart;
    330 
    331 	savestart = s->str++;
    332 	stopval = s->str[0] == '\\' ? backslash(s) : (unsigned char)*s->str++;
    333 	if (stopval < (unsigned char)s->lastch) {
    334 		s->str = savestart;
    335 		return 0;
    336 	}
    337 	s->cnt = stopval - s->lastch + 1;
    338 	s->state = RANGE;
    339 	--s->lastch;
    340 	return 1;
    341 }
    342 
    343 static void
    344 genseq(STR *s)
    345 {
    346 	char *ep;
    347 
    348 	if (s->which == STRING1) {
    349 		errx(1, "Sequences only valid in string2");
    350 	}
    351 
    352 	if (*s->str == '\\') {
    353 		s->lastch = backslash(s);
    354 	} else {
    355 		s->lastch = (unsigned char)*s->str++;
    356 	}
    357 	if (*s->str != '*') {
    358 		errx(1, "Misplaced sequence asterisk");
    359 	}
    360 
    361 	s->str++;
    362 	switch (s->str[0]) {
    363 	case '\\':
    364 		s->cnt = backslash(s);
    365 		break;
    366 	case ']':
    367 		s->cnt = 0;
    368 		++s->str;
    369 		break;
    370 	default:
    371 		if (isdigit((unsigned char)s->str[0])) {
    372 			s->cnt = strtol(s->str, &ep, 0);
    373 			if (*ep == ']') {
    374 				s->str = ep + 1;
    375 				break;
    376 			}
    377 		}
    378 		errx(1, "illegal sequence count");
    379 		/* NOTREACHED */
    380 	}
    381 
    382 	s->state = s->cnt ? SEQUENCE : INFINITE;
    383 }
    384 
    385 /*
    386  * Translate \??? into a character.  Up to 3 octal digits, if no digits either
    387  * an escape code or a literal character.
    388  */
    389 static int
    390 backslash(STR *s)
    391 {
    392 	int ch, cnt, val;
    393 
    394 	for (cnt = val = 0;;) {
    395 		s->str++;
    396 		ch = (unsigned char)s->str[0];
    397 		if (!isascii(ch) || !isdigit(ch)) {
    398 			break;
    399 		}
    400 		val = val * 8 + ch - '0';
    401 		if (++cnt == 3) {
    402 			++s->str;
    403 			break;
    404 		}
    405 	}
    406 	if (cnt) {
    407 		return val;
    408 	}
    409 	if (ch != '\0') {
    410 		++s->str;
    411 	}
    412 	switch (ch) {
    413 	case 'a':			/* escape characters */
    414 		return '\7';
    415 	case 'b':
    416 		return '\b';
    417 	case 'e':
    418 		return '\033';
    419 	case 'f':
    420 		return '\f';
    421 	case 'n':
    422 		return '\n';
    423 	case 'r':
    424 		return '\r';
    425 	case 't':
    426 		return '\t';
    427 	case 'v':
    428 		return '\13';
    429 	case '\0':			/*  \" -> \ */
    430 		s->state = EOS;
    431 		return '\\';
    432 	default:			/* \x" -> x */
    433 		return ch;
    434 	}
    435 }
    436