Home | History | Annotate | Line # | Download | only in tr
      1 /*	$NetBSD: str.c,v 1.30 2018/05/26 11:20:30 leot Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1991, 1993
      5  *	The Regents of the University of California.  All rights reserved.
      6  *
      7  * Redistribution and use in source and binary forms, with or without
      8  * modification, are permitted provided that the following conditions
      9  * are met:
     10  * 1. Redistributions of source code must retain the above copyright
     11  *    notice, this list of conditions and the following disclaimer.
     12  * 2. Redistributions in binary form must reproduce the above copyright
     13  *    notice, this list of conditions and the following disclaimer in the
     14  *    documentation and/or other materials provided with the distribution.
     15  * 3. Neither the name of the University nor the names of its contributors
     16  *    may be used to endorse or promote products derived from this software
     17  *    without specific prior written permission.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     29  * SUCH DAMAGE.
     30  */
     31 
     32 #include <sys/cdefs.h>
     33 #ifndef lint
     34 #if 0
     35 static char sccsid[] = "@(#)str.c	8.2 (Berkeley) 4/28/95";
     36 #endif
     37 __RCSID("$NetBSD: str.c,v 1.30 2018/05/26 11:20:30 leot Exp $");
     38 #endif /* not lint */
     39 
     40 #include <sys/types.h>
     41 
     42 #include <err.h>
     43 #include <errno.h>
     44 #include <stddef.h>
     45 #include <stdio.h>
     46 #include <stdlib.h>
     47 #include <string.h>
     48 #include <ctype.h>
     49 #include <assert.h>
     50 
     51 #include "extern.h"
     52 
     53 struct str {
     54 	enum { STRING1, STRING2 } which;
     55 	enum { EOS, INFINITE, NORMAL, RANGE, SEQUENCE, SET } state;
     56 	int cnt;			/* character count */
     57 	int lastch;			/* last character */
     58 	int equiv[2];			/* equivalence set */
     59 	int *set;			/* set of characters */
     60 	const char *str;		/* user's string */
     61 };
     62 
     63 static int backslash(STR *);
     64 static int bracket(STR *);
     65 static int c_class(const void *, const void *);
     66 static int *genclass(const char *, size_t);
     67 static void genequiv(STR *);
     68 static int genrange(STR *);
     69 static void genseq(STR *);
     70 
     71 STR *
     72 str_create(int whichstring, const char *txt)
     73 {
     74 	STR *s;
     75 
     76 	s = malloc(sizeof(*s));
     77 	if (s == NULL) {
     78 		err(1, "Out of memory");
     79 	}
     80 
     81 	s->which = whichstring == 2 ? STRING2 : STRING1;
     82 	s->state = NORMAL;
     83 	s->cnt = 0;
     84 	s->lastch = OOBCH;
     85 	s->equiv[0] = 0;
     86 	s->equiv[1] = OOBCH;
     87 	s->set = NULL;
     88 	s->str = txt;
     89 
     90 	return s;
     91 }
     92 
     93 void
     94 str_destroy(STR *s)
     95 {
     96 	if (s->set != NULL && s->set != s->equiv) {
     97 		free(s->set);
     98 	}
     99 	free(s);
    100 }
    101 
    102 int
    103 next(STR *s, int *ret)
    104 {
    105 	int ch;
    106 
    107 	switch (s->state) {
    108 	case EOS:
    109 		*ret = s->lastch;
    110 		return 0;
    111 	case INFINITE:
    112 		*ret = s->lastch;
    113 		return 1;
    114 	case NORMAL:
    115 		ch = (unsigned char)s->str[0];
    116 		switch (ch) {
    117 		case '\0':
    118 			s->state = EOS;
    119 			*ret = s->lastch;
    120 			return 0;
    121 		case '\\':
    122 			s->lastch = backslash(s);
    123 			break;
    124 		case '[':
    125 			if (bracket(s)) {
    126 				return next(s, ret);
    127 			}
    128 			/* FALLTHROUGH */
    129 		default:
    130 			++s->str;
    131 			s->lastch = ch;
    132 			break;
    133 		}
    134 
    135 		/* We can start a range at any time. */
    136 		if (s->str[0] == '-' && genrange(s)) {
    137 			return next(s, ret);
    138 		}
    139 		*ret = s->lastch;
    140 		return 1;
    141 	case RANGE:
    142 		if (s->cnt == 0) {
    143 			s->state = NORMAL;
    144 			return next(s, ret);
    145 		}
    146 		s->cnt--;
    147 		++s->lastch;
    148 		*ret = s->lastch;
    149 		return 1;
    150 	case SEQUENCE:
    151 		if (s->cnt == 0) {
    152 			s->state = NORMAL;
    153 			return next(s, ret);
    154 		}
    155 		s->cnt--;
    156 		*ret = s->lastch;
    157 		return 1;
    158 	case SET:
    159 		s->lastch = s->set[s->cnt++];
    160 		if (s->lastch == OOBCH) {
    161 			s->state = NORMAL;
    162 			if (s->set != s->equiv) {
    163 				free(s->set);
    164 			}
    165 			s->set = NULL;
    166 			return next(s, ret);
    167 		}
    168 		*ret = s->lastch;
    169 		return 1;
    170 	}
    171 	/* NOTREACHED */
    172 	assert(0);
    173 	*ret = s->lastch;
    174 	return 0;
    175 }
    176 
    177 static int
    178 bracket(STR *s)
    179 {
    180 	const char *p;
    181 	int *q;
    182 
    183 	switch (s->str[1]) {
    184 	case ':':				/* "[:class:]" */
    185 		if ((p = strstr(s->str + 2, ":]")) == NULL)
    186 			return 0;
    187 		s->str += 2;
    188 		q = genclass(s->str, p - s->str);
    189 		s->state = SET;
    190 		s->set = q;
    191 		s->cnt = 0;
    192 		s->str = p + 2;
    193 		return 1;
    194 	case '=':				/* "[=equiv=]" */
    195 		if ((p = strstr(s->str + 2, "=]")) == NULL)
    196 			return 0;
    197 		s->str += 2;
    198 		genequiv(s);
    199 		s->str = p + 2;
    200 		return 1;
    201 	default:				/* "[\###*n]" or "[#*n]" */
    202 		if ((p = strpbrk(s->str + 2, "*]")) == NULL)
    203 			return 0;
    204 		if (p[0] != '*' || strchr(p, ']') == NULL)
    205 			return 0;
    206 		s->str += 1;
    207 		genseq(s);
    208 		return 1;
    209 	}
    210 	/* NOTREACHED */
    211 }
    212 
    213 typedef struct {
    214 	const char *name;
    215 	int (*func)(int);
    216 } CLASS;
    217 
    218 static const CLASS classes[] = {
    219 	{ "alnum",  isalnum  },
    220 	{ "alpha",  isalpha  },
    221 	{ "blank",  isblank  },
    222 	{ "cntrl",  iscntrl  },
    223 	{ "digit",  isdigit  },
    224 	{ "graph",  isgraph  },
    225 	{ "lower",  islower  },
    226 	{ "print",  isprint  },
    227 	{ "punct",  ispunct  },
    228 	{ "space",  isspace  },
    229 	{ "upper",  isupper  },
    230 	{ "xdigit", isxdigit },
    231 };
    232 
    233 typedef struct {
    234 	const char *name;
    235 	size_t len;
    236 } CLASSKEY;
    237 
    238 static int *
    239 genclass(const char *class, size_t len)
    240 {
    241 	int ch;
    242 	const CLASS *cp;
    243 	CLASSKEY key;
    244 	int *p;
    245 	unsigned pos, num;
    246 
    247 	/* Find the class */
    248 	key.name = class;
    249 	key.len = len;
    250 	cp = bsearch(&key, classes, __arraycount(classes), sizeof(classes[0]),
    251 		     c_class);
    252 	if (cp == NULL) {
    253 		errx(1, "unknown class %.*s", (int)len, class);
    254 	}
    255 
    256 	/*
    257 	 * Figure out what characters are in the class
    258 	 */
    259 
    260 	num = NCHARS + 1;
    261 	p = malloc(num * sizeof(*p));
    262 	if (p == NULL) {
    263 		err(1, "malloc");
    264 	}
    265 
    266 	pos = 0;
    267 	for (ch = 0; ch < NCHARS; ch++) {
    268 		if (cp->func(ch)) {
    269 			p[pos++] = ch;
    270 		}
    271 	}
    272 
    273 	p[pos++] = OOBCH;
    274 	for (; pos < num; pos++) {
    275 		p[pos] = 0;
    276 	}
    277 
    278 	return p;
    279 }
    280 
    281 static int
    282 c_class(const void *av, const void *bv)
    283 {
    284 	const CLASSKEY *a = av;
    285 	const CLASS *b = bv;
    286 	size_t blen;
    287 	int r;
    288 
    289 	blen = strlen(b->name);
    290 	r = strncmp(a->name, b->name, a->len);
    291 	if (r != 0) {
    292 		return r;
    293 	}
    294 	if (a->len < blen) {
    295 		/* someone gave us a prefix of the right name */
    296 		return -1;
    297 	}
    298 	assert(a-> len == blen);
    299 	return 0;
    300 }
    301 
    302 /*
    303  * English doesn't have any equivalence classes, so for now
    304  * we just syntax check and grab the character.
    305  */
    306 static void
    307 genequiv(STR *s)
    308 {
    309 	int ch;
    310 
    311 	ch = (unsigned char)s->str[0];
    312 	if (ch == '\\') {
    313 		s->equiv[0] = backslash(s);
    314 	} else {
    315 		s->equiv[0] = ch;
    316 		s->str++;
    317 	}
    318 	if (s->str[0] != '=') {
    319 		errx(1, "Misplaced equivalence equals sign");
    320 	}
    321 	s->str++;
    322 	if (s->str[0] != ']') {
    323 		errx(1, "Misplaced equivalence right bracket");
    324 	}
    325 	s->str++;
    326 
    327 	s->cnt = 0;
    328 	s->state = SET;
    329 	s->set = s->equiv;
    330 }
    331 
    332 static int
    333 genrange(STR *s)
    334 {
    335 	int stopval;
    336 	const char *savestart;
    337 
    338 	savestart = s->str++;
    339 	stopval = s->str[0] == '\\' ? backslash(s) : (unsigned char)*s->str++;
    340 	if (stopval < (unsigned char)s->lastch) {
    341 		s->str = savestart;
    342 		return 0;
    343 	}
    344 	s->cnt = stopval - s->lastch + 1;
    345 	s->state = RANGE;
    346 	--s->lastch;
    347 	return 1;
    348 }
    349 
    350 static void
    351 genseq(STR *s)
    352 {
    353 	char *ep;
    354 
    355 	if (s->which == STRING1) {
    356 		errx(1, "Sequences only valid in string2");
    357 	}
    358 
    359 	if (*s->str == '\\') {
    360 		s->lastch = backslash(s);
    361 	} else {
    362 		s->lastch = (unsigned char)*s->str++;
    363 	}
    364 	if (*s->str != '*') {
    365 		errx(1, "Misplaced sequence asterisk");
    366 	}
    367 
    368 	s->str++;
    369 	switch (s->str[0]) {
    370 	case '\\':
    371 		s->cnt = backslash(s);
    372 		break;
    373 	case ']':
    374 		s->cnt = 0;
    375 		++s->str;
    376 		break;
    377 	default:
    378 		if (isdigit((unsigned char)s->str[0])) {
    379 			s->cnt = strtol(s->str, &ep, 0);
    380 			if (*ep == ']') {
    381 				s->str = ep + 1;
    382 				break;
    383 			}
    384 		}
    385 		errx(1, "illegal sequence count");
    386 		/* NOTREACHED */
    387 	}
    388 
    389 	s->state = s->cnt ? SEQUENCE : INFINITE;
    390 }
    391 
    392 /*
    393  * Translate \??? into a character.  Up to 3 octal digits, if no digits either
    394  * an escape code or a literal character.
    395  */
    396 static int
    397 backslash(STR *s)
    398 {
    399 	int ch, cnt, val;
    400 
    401 	cnt = val = 0;
    402 	for (;;) {
    403 		/* Consume the character we're already on. */
    404 		s->str++;
    405 
    406 		/* Look at the next character. */
    407 		ch = (unsigned char)s->str[0];
    408 		if (!isascii(ch) || !isdigit(ch)) {
    409 			break;
    410 		}
    411 		val = val * 8 + ch - '0';
    412 		if (++cnt == 3) {
    413 			/* Enough digits; consume this one and stop */
    414 			++s->str;
    415 			break;
    416 		}
    417 	}
    418 	if (cnt) {
    419 		/* We saw digits, so return their value */
    420 		if (val >= OOBCH)
    421 			errx(1, "Invalid octal character value");
    422 		return val;
    423 	}
    424 	if (ch == '\0') {
    425 		/* \<end> -> \ */
    426 		s->state = EOS;
    427 		return '\\';
    428 	}
    429 
    430 	/* Consume the escaped character */
    431 	s->str++;
    432 
    433 	switch (ch) {
    434 	case 'a':			/* escape characters */
    435 		return '\7';
    436 	case 'b':
    437 		return '\b';
    438 	case 'e':
    439 		return '\033';
    440 	case 'f':
    441 		return '\f';
    442 	case 'n':
    443 		return '\n';
    444 	case 'r':
    445 		return '\r';
    446 	case 't':
    447 		return '\t';
    448 	case 'v':
    449 		return '\13';
    450 	default:			/* \q -> q */
    451 		return ch;
    452 	}
    453 }
    454