Home | History | Annotate | Line # | Download | only in tr
str.c revision 1.23
      1 /*	$NetBSD: str.c,v 1.23 2013/08/11 00:39:22 dholland Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1991, 1993
      5  *	The Regents of the University of California.  All rights reserved.
      6  *
      7  * Redistribution and use in source and binary forms, with or without
      8  * modification, are permitted provided that the following conditions
      9  * are met:
     10  * 1. Redistributions of source code must retain the above copyright
     11  *    notice, this list of conditions and the following disclaimer.
     12  * 2. Redistributions in binary form must reproduce the above copyright
     13  *    notice, this list of conditions and the following disclaimer in the
     14  *    documentation and/or other materials provided with the distribution.
     15  * 3. Neither the name of the University nor the names of its contributors
     16  *    may be used to endorse or promote products derived from this software
     17  *    without specific prior written permission.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     29  * SUCH DAMAGE.
     30  */
     31 
     32 #include <sys/cdefs.h>
     33 #ifndef lint
     34 #if 0
     35 static char sccsid[] = "@(#)str.c	8.2 (Berkeley) 4/28/95";
     36 #endif
     37 __RCSID("$NetBSD: str.c,v 1.23 2013/08/11 00:39:22 dholland Exp $");
     38 #endif /* not lint */
     39 
     40 #include <sys/types.h>
     41 
#include <assert.h>
#include <ctype.h>
#include <err.h>
#include <errno.h>
#include <limits.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
     50 
     51 #include "extern.h"
     52 
/*
 * Parsing state for one string operand.  next() walks `str' and hands
 * back one character per call; the other members remember where it is
 * inside a construct that expands to several characters.
 */
struct str {
	/* Which operand this is; genseq() rejects sequences in STRING1. */
	enum {
		STRING1,
		STRING2
	} which;

	/* What next() is currently doing. */
	enum {
		EOS,		/* input exhausted */
		INFINITE,	/* keep returning lastch forever */
		NORMAL,		/* reading ordinary input from str */
		RANGE,		/* counting upward through a range */
		SEQUENCE,	/* repeating lastch cnt more times */
		SET		/* stepping through the set array */
	} state;

	int cnt;			/* remaining count, or index into set */
	int lastch;			/* most recently produced character */
	int equiv[2];			/* inline set used for an equivalence */
	int *set;			/* active set, terminated by OOBCH */
	const unsigned char *str;	/* unread part of the user's string */
};
     62 
     63 static int	backslash(STR *);
     64 static int	bracket(STR *);
     65 static int	c_class(const void *, const void *);
     66 static void	genclass(STR *);
     67 static void	genequiv(STR *);
     68 static int	genrange(STR *);
     69 static void	genseq(STR *);
     70 
     71 STR *
     72 str_create(int whichstring, const char *txt)
     73 {
     74 	STR *s;
     75 
     76 	s = malloc(sizeof(*s));
     77 	if (s == NULL) {
     78 		err(1, "Out of memory");
     79 	}
     80 
     81 	s->which = whichstring == 2 ? STRING2 : STRING1;
     82 	s->state = NORMAL;
     83 	s->cnt = 0;
     84 	s->lastch = OOBCH;
     85 	s->equiv[0] = 0;
     86 	s->equiv[1] = OOBCH;
     87 	s->set = NULL;
     88 	s->str = txt;
     89 
     90 	return s;
     91 }
     92 
     93 void
     94 str_destroy(STR *s)
     95 {
     96 	if (s->set != NULL && s->set != s->equiv) {
     97 		free(s->set);
     98 	}
     99 	free(s);
    100 }
    101 
    102 int
    103 next(STR *s, int *ret)
    104 {
    105 	int ch;
    106 
    107 	switch (s->state) {
    108 	case EOS:
    109 		*ret = s->lastch;
    110 		return 0;
    111 	case INFINITE:
    112 		*ret = s->lastch;
    113 		return 1;
    114 	case NORMAL:
    115 		switch (ch = *s->str) {
    116 		case '\0':
    117 			s->state = EOS;
    118 			*ret = s->lastch;
    119 			return 0;
    120 		case '\\':
    121 			s->lastch = backslash(s);
    122 			break;
    123 		case '[':
    124 			if (bracket(s))
    125 				return next(s, ret);
    126 			/* FALLTHROUGH */
    127 		default:
    128 			++s->str;
    129 			s->lastch = ch;
    130 			break;
    131 		}
    132 
    133 		/* We can start a range at any time. */
    134 		if (s->str[0] == '-' && genrange(s)) {
    135 			return next(s, ret);
    136 		}
    137 		*ret = s->lastch;
    138 		return 1;
    139 	case RANGE:
    140 		if (s->cnt-- == 0) {
    141 			s->state = NORMAL;
    142 			return next(s, ret);
    143 		}
    144 		++s->lastch;
    145 		*ret = s->lastch;
    146 		return 1;
    147 	case SEQUENCE:
    148 		if (s->cnt-- == 0) {
    149 			s->state = NORMAL;
    150 			return next(s, ret);
    151 		}
    152 		*ret = s->lastch;
    153 		return 1;
    154 	case SET:
    155 		if ((s->lastch = s->set[s->cnt++]) == OOBCH) {
    156 			s->state = NORMAL;
    157 			return next(s, ret);
    158 		}
    159 		*ret = s->lastch;
    160 		return 1;
    161 	}
    162 	/* NOTREACHED */
    163 	assert(0);
    164 	*ret = s->lastch;
    165 	return 0;
    166 }
    167 
    168 static int
    169 bracket(STR *s)
    170 {
    171 	char *p;
    172 
    173 	switch (s->str[1]) {
    174 	case ':':				/* "[:class:]" */
    175 		if ((p = strstr(s->str + 2, ":]")) == NULL)
    176 			return 0;
    177 		*p = '\0';
    178 		s->str += 2;
    179 		genclass(s);
    180 		s->str = p + 2;
    181 		return 1;
    182 	case '=':				/* "[=equiv=]" */
    183 		if ((p = strstr(s->str + 2, "=]")) == NULL)
    184 			return 0;
    185 		s->str += 2;
    186 		genequiv(s);
    187 		return 1;
    188 	default:				/* "[\###*n]" or "[#*n]" */
    189 		if ((p = strpbrk(s->str + 2, "*]")) == NULL)
    190 			return 0;
    191 		if (p[0] != '*' || strchr(p, ']') == NULL)
    192 			return 0;
    193 		s->str += 1;
    194 		genseq(s);
    195 		return 1;
    196 	}
    197 	/* NOTREACHED */
    198 }
    199 
/* One character class: its name and the <ctype.h> predicate behind it. */
typedef struct {
	const char *name;
	int (*func)(int);
} CLASS;

/*
 * Known character classes.  genclass() looks names up with bsearch(),
 * so the entries must stay sorted by name.
 */
static const CLASS classes[] = {
	{ .name = "alnum",  .func = isalnum  },
	{ .name = "alpha",  .func = isalpha  },
	{ .name = "blank",  .func = isblank  },
	{ .name = "cntrl",  .func = iscntrl  },
	{ .name = "digit",  .func = isdigit  },
	{ .name = "graph",  .func = isgraph  },
	{ .name = "lower",  .func = islower  },
	{ .name = "print",  .func = isprint  },
	{ .name = "punct",  .func = ispunct  },
	{ .name = "space",  .func = isspace  },
	{ .name = "upper",  .func = isupper  },
	{ .name = "xdigit", .func = isxdigit },
};
    219 
    220 static void
    221 genclass(STR *s)
    222 {
    223 	int cnt;
    224 	const CLASS *cp;
    225 	CLASS tmp;
    226 	int *p;
    227 
    228 	tmp.name = s->str;
    229 	if ((cp = bsearch(&tmp, classes, sizeof(classes) /
    230 	    sizeof(*cp), sizeof(*cp), c_class)) == NULL)
    231 		errx(1, "unknown class %s", s->str);
    232 
    233 	if ((s->set = p = malloc((NCHARS + 1) * sizeof(*p))) == NULL)
    234 		err(1, "malloc");
    235 
    236 	for (cnt = 0; cnt < NCHARS; ++cnt)
    237 		if ((*cp->func)(cnt))
    238 			*p++ = cnt;
    239 	*p++ = OOBCH;
    240 	memset(p, 0, NCHARS + 1 - (p - s->set));
    241 
    242 	s->cnt = 0;
    243 	s->state = SET;
    244 }
    245 
    246 static int
    247 c_class(const void *a, const void *b)
    248 {
    249 	return strcmp(((const CLASS *)a)->name, ((const CLASS *)b)->name);
    250 }
    251 
    252 /*
    253  * English doesn't have any equivalence classes, so for now
    254  * we just syntax check and grab the character.
    255  */
    256 static void
    257 genequiv(STR *s)
    258 {
    259 	if (*s->str == '\\') {
    260 		s->equiv[0] = backslash(s);
    261 		if (*s->str != '=')
    262 			errx(1, "misplaced equivalence equals sign");
    263 	} else {
    264 		s->equiv[0] = s->str[0];
    265 		if (s->str[1] != '=')
    266 			errx(1, "misplaced equivalence equals sign");
    267 	}
    268 	s->str += 2;
    269 	s->cnt = 0;
    270 	s->state = SET;
    271 	s->set = s->equiv;
    272 }
    273 
    274 static int
    275 genrange(STR *s)
    276 {
    277 	int stopval;
    278 	const char *savestart;
    279 
    280 	savestart = s->str;
    281 	stopval = *++s->str == '\\' ? backslash(s) : *s->str++;
    282 	if (stopval < (u_char)s->lastch) {
    283 		s->str = savestart;
    284 		return 0;
    285 	}
    286 	s->cnt = stopval - s->lastch + 1;
    287 	s->state = RANGE;
    288 	--s->lastch;
    289 	return 1;
    290 }
    291 
    292 static void
    293 genseq(STR *s)
    294 {
    295 	char *ep;
    296 
    297 	if (s->which == STRING1)
    298 		errx(1, "sequences only valid in string2");
    299 
    300 	if (*s->str == '\\')
    301 		s->lastch = backslash(s);
    302 	else
    303 		s->lastch = *s->str++;
    304 	if (*s->str != '*')
    305 		errx(1, "misplaced sequence asterisk");
    306 
    307 	switch (*++s->str) {
    308 	case '\\':
    309 		s->cnt = backslash(s);
    310 		break;
    311 	case ']':
    312 		s->cnt = 0;
    313 		++s->str;
    314 		break;
    315 	default:
    316 		if (isdigit(*s->str)) {
    317 			s->cnt = strtol(s->str, &ep, 0);
    318 			if (*ep == ']') {
    319 				s->str = ep + 1;
    320 				break;
    321 			}
    322 		}
    323 		errx(1, "illegal sequence count");
    324 		/* NOTREACHED */
    325 	}
    326 
    327 	s->state = s->cnt ? SEQUENCE : INFINITE;
    328 }
    329 
    330 /*
    331  * Translate \??? into a character.  Up to 3 octal digits, if no digits either
    332  * an escape code or a literal character.
    333  */
    334 static int
    335 backslash(STR *s)
    336 {
    337 	int ch, cnt, val;
    338 
    339 	for (cnt = val = 0;;) {
    340 		ch = *++s->str;
    341 		if (!isascii(ch) || !isdigit(ch))
    342 			break;
    343 		val = val * 8 + ch - '0';
    344 		if (++cnt == 3) {
    345 			++s->str;
    346 			break;
    347 		}
    348 	}
    349 	if (cnt)
    350 		return val;
    351 	if (ch != '\0')
    352 		++s->str;
    353 	switch (ch) {
    354 	case 'a':			/* escape characters */
    355 		return '\7';
    356 	case 'b':
    357 		return '\b';
    358 	case 'e':
    359 		return '\033';
    360 	case 'f':
    361 		return '\f';
    362 	case 'n':
    363 		return '\n';
    364 	case 'r':
    365 		return '\r';
    366 	case 't':
    367 		return '\t';
    368 	case 'v':
    369 		return '\13';
    370 	case '\0':			/*  \" -> \ */
    371 		s->state = EOS;
    372 		return '\\';
    373 	default:			/* \x" -> x */
    374 		return ch;
    375 	}
    376 }
    377