Home | History | Annotate | Line # | Download | only in tr
str.c revision 1.22
      1 /*	$NetBSD: str.c,v 1.22 2013/08/11 00:05:49 dholland Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1991, 1993
      5  *	The Regents of the University of California.  All rights reserved.
      6  *
      7  * Redistribution and use in source and binary forms, with or without
      8  * modification, are permitted provided that the following conditions
      9  * are met:
     10  * 1. Redistributions of source code must retain the above copyright
     11  *    notice, this list of conditions and the following disclaimer.
     12  * 2. Redistributions in binary form must reproduce the above copyright
     13  *    notice, this list of conditions and the following disclaimer in the
     14  *    documentation and/or other materials provided with the distribution.
     15  * 3. Neither the name of the University nor the names of its contributors
     16  *    may be used to endorse or promote products derived from this software
     17  *    without specific prior written permission.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     29  * SUCH DAMAGE.
     30  */
     31 
     32 #include <sys/cdefs.h>
     33 #ifndef lint
     34 #if 0
     35 static char sccsid[] = "@(#)str.c	8.2 (Berkeley) 4/28/95";
     36 #endif
     37 __RCSID("$NetBSD: str.c,v 1.22 2013/08/11 00:05:49 dholland Exp $");
     38 #endif /* not lint */
     39 
     40 #include <sys/types.h>
     41 
#include <assert.h>
#include <ctype.h>
#include <err.h>
#include <errno.h>
#include <limits.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
     50 
     51 #include "extern.h"
     52 
/*
 * Parser state for one operand string.  next() walks the user's text and
 * expands ranges, sequences and character sets one character at a time.
 */
struct str {
	/* Which operand this is; genseq() rejects sequences in STRING1. */
	enum { STRING1, STRING2 } which;
	/* What next() is currently expanding. */
	enum { EOS, INFINITE, NORMAL, RANGE, SEQUENCE, SET } state;
	int cnt;			/* remaining count, or index into set */
	int lastch;			/* last character produced */
	int equiv[2];			/* storage for an equivalence set */
	int *set;			/* current set: malloc'd or equiv */
	const unsigned char *str;	/* read position in user's string */
};
     62 
     63 static int	backslash(STR *);
     64 static int	bracket(STR *);
     65 static int	c_class(const void *, const void *);
     66 static void	genclass(STR *);
     67 static void	genequiv(STR *);
     68 static int	genrange(STR *);
     69 static void	genseq(STR *);
     70 
     71 STR *
     72 str_create(int whichstring)
     73 {
     74 	STR *s;
     75 
     76 	s = malloc(sizeof(*s));
     77 	if (s == NULL) {
     78 		err(1, "Out of memory");
     79 	}
     80 
     81 	s->which = whichstring == 2 ? STRING2 : STRING1;
     82 	s->state = NORMAL;
     83 	s->cnt = 0;
     84 	s->lastch = OOBCH;
     85 	s->equiv[0] = 0;
     86 	s->equiv[1] = OOBCH;
     87 	s->set = NULL;
     88 	s->str = NULL;
     89 
     90 	return s;
     91 }
     92 
     93 void
     94 str_destroy(STR *s)
     95 {
     96 	if (s->set != NULL && s->set != s->equiv) {
     97 		free(s->set);
     98 	}
     99 	free(s);
    100 }
    101 
    102 void
    103 str_setstring(STR *s, const char *txt)
    104 {
    105 	s->str = txt;
    106 }
    107 
    108 int
    109 next(STR *s, int *ret)
    110 {
    111 	int ch;
    112 
    113 	switch (s->state) {
    114 	case EOS:
    115 		*ret = s->lastch;
    116 		return 0;
    117 	case INFINITE:
    118 		*ret = s->lastch;
    119 		return 1;
    120 	case NORMAL:
    121 		switch (ch = *s->str) {
    122 		case '\0':
    123 			s->state = EOS;
    124 			*ret = s->lastch;
    125 			return 0;
    126 		case '\\':
    127 			s->lastch = backslash(s);
    128 			break;
    129 		case '[':
    130 			if (bracket(s))
    131 				return next(s, ret);
    132 			/* FALLTHROUGH */
    133 		default:
    134 			++s->str;
    135 			s->lastch = ch;
    136 			break;
    137 		}
    138 
    139 		/* We can start a range at any time. */
    140 		if (s->str[0] == '-' && genrange(s)) {
    141 			return next(s, ret);
    142 		}
    143 		*ret = s->lastch;
    144 		return 1;
    145 	case RANGE:
    146 		if (s->cnt-- == 0) {
    147 			s->state = NORMAL;
    148 			return next(s, ret);
    149 		}
    150 		++s->lastch;
    151 		*ret = s->lastch;
    152 		return 1;
    153 	case SEQUENCE:
    154 		if (s->cnt-- == 0) {
    155 			s->state = NORMAL;
    156 			return next(s, ret);
    157 		}
    158 		*ret = s->lastch;
    159 		return 1;
    160 	case SET:
    161 		if ((s->lastch = s->set[s->cnt++]) == OOBCH) {
    162 			s->state = NORMAL;
    163 			return next(s, ret);
    164 		}
    165 		*ret = s->lastch;
    166 		return 1;
    167 	}
    168 	/* NOTREACHED */
    169 	assert(0);
    170 	*ret = s->lastch;
    171 	return 0;
    172 }
    173 
    174 static int
    175 bracket(STR *s)
    176 {
    177 	char *p;
    178 
    179 	switch (s->str[1]) {
    180 	case ':':				/* "[:class:]" */
    181 		if ((p = strstr(s->str + 2, ":]")) == NULL)
    182 			return 0;
    183 		*p = '\0';
    184 		s->str += 2;
    185 		genclass(s);
    186 		s->str = p + 2;
    187 		return 1;
    188 	case '=':				/* "[=equiv=]" */
    189 		if ((p = strstr(s->str + 2, "=]")) == NULL)
    190 			return 0;
    191 		s->str += 2;
    192 		genequiv(s);
    193 		return 1;
    194 	default:				/* "[\###*n]" or "[#*n]" */
    195 		if ((p = strpbrk(s->str + 2, "*]")) == NULL)
    196 			return 0;
    197 		if (p[0] != '*' || strchr(p, ']') == NULL)
    198 			return 0;
    199 		s->str += 1;
    200 		genseq(s);
    201 		return 1;
    202 	}
    203 	/* NOTREACHED */
    204 }
    205 
/* Associates a character class name with its <ctype.h> predicate. */
typedef struct {
	const char *name;
	int (*func)(int);
} CLASS;

/*
 * Supported character classes.  The entries must remain sorted by name
 * in strcmp() order because genclass() looks them up with bsearch().
 */
#define	CLASS_ENTRY(cls)	{ #cls, is ## cls }
static const CLASS classes[] = {
	CLASS_ENTRY(alnum),
	CLASS_ENTRY(alpha),
	CLASS_ENTRY(blank),
	CLASS_ENTRY(cntrl),
	CLASS_ENTRY(digit),
	CLASS_ENTRY(graph),
	CLASS_ENTRY(lower),
	CLASS_ENTRY(print),
	CLASS_ENTRY(punct),
	CLASS_ENTRY(space),
	CLASS_ENTRY(upper),
	CLASS_ENTRY(xdigit),
};
#undef CLASS_ENTRY
    225 
    226 static void
    227 genclass(STR *s)
    228 {
    229 	int cnt;
    230 	const CLASS *cp;
    231 	CLASS tmp;
    232 	int *p;
    233 
    234 	tmp.name = s->str;
    235 	if ((cp = bsearch(&tmp, classes, sizeof(classes) /
    236 	    sizeof(*cp), sizeof(*cp), c_class)) == NULL)
    237 		errx(1, "unknown class %s", s->str);
    238 
    239 	if ((s->set = p = malloc((NCHARS + 1) * sizeof(*p))) == NULL)
    240 		err(1, "malloc");
    241 
    242 	for (cnt = 0; cnt < NCHARS; ++cnt)
    243 		if ((*cp->func)(cnt))
    244 			*p++ = cnt;
    245 	*p++ = OOBCH;
    246 	memset(p, 0, NCHARS + 1 - (p - s->set));
    247 
    248 	s->cnt = 0;
    249 	s->state = SET;
    250 }
    251 
    252 static int
    253 c_class(const void *a, const void *b)
    254 {
    255 	return strcmp(((const CLASS *)a)->name, ((const CLASS *)b)->name);
    256 }
    257 
    258 /*
    259  * English doesn't have any equivalence classes, so for now
    260  * we just syntax check and grab the character.
    261  */
    262 static void
    263 genequiv(STR *s)
    264 {
    265 	if (*s->str == '\\') {
    266 		s->equiv[0] = backslash(s);
    267 		if (*s->str != '=')
    268 			errx(1, "misplaced equivalence equals sign");
    269 	} else {
    270 		s->equiv[0] = s->str[0];
    271 		if (s->str[1] != '=')
    272 			errx(1, "misplaced equivalence equals sign");
    273 	}
    274 	s->str += 2;
    275 	s->cnt = 0;
    276 	s->state = SET;
    277 	s->set = s->equiv;
    278 }
    279 
    280 static int
    281 genrange(STR *s)
    282 {
    283 	int stopval;
    284 	const char *savestart;
    285 
    286 	savestart = s->str;
    287 	stopval = *++s->str == '\\' ? backslash(s) : *s->str++;
    288 	if (stopval < (u_char)s->lastch) {
    289 		s->str = savestart;
    290 		return 0;
    291 	}
    292 	s->cnt = stopval - s->lastch + 1;
    293 	s->state = RANGE;
    294 	--s->lastch;
    295 	return 1;
    296 }
    297 
    298 static void
    299 genseq(STR *s)
    300 {
    301 	char *ep;
    302 
    303 	if (s->which == STRING1)
    304 		errx(1, "sequences only valid in string2");
    305 
    306 	if (*s->str == '\\')
    307 		s->lastch = backslash(s);
    308 	else
    309 		s->lastch = *s->str++;
    310 	if (*s->str != '*')
    311 		errx(1, "misplaced sequence asterisk");
    312 
    313 	switch (*++s->str) {
    314 	case '\\':
    315 		s->cnt = backslash(s);
    316 		break;
    317 	case ']':
    318 		s->cnt = 0;
    319 		++s->str;
    320 		break;
    321 	default:
    322 		if (isdigit(*s->str)) {
    323 			s->cnt = strtol(s->str, &ep, 0);
    324 			if (*ep == ']') {
    325 				s->str = ep + 1;
    326 				break;
    327 			}
    328 		}
    329 		errx(1, "illegal sequence count");
    330 		/* NOTREACHED */
    331 	}
    332 
    333 	s->state = s->cnt ? SEQUENCE : INFINITE;
    334 }
    335 
    336 /*
    337  * Translate \??? into a character.  Up to 3 octal digits, if no digits either
    338  * an escape code or a literal character.
    339  */
    340 static int
    341 backslash(STR *s)
    342 {
    343 	int ch, cnt, val;
    344 
    345 	for (cnt = val = 0;;) {
    346 		ch = *++s->str;
    347 		if (!isascii(ch) || !isdigit(ch))
    348 			break;
    349 		val = val * 8 + ch - '0';
    350 		if (++cnt == 3) {
    351 			++s->str;
    352 			break;
    353 		}
    354 	}
    355 	if (cnt)
    356 		return val;
    357 	if (ch != '\0')
    358 		++s->str;
    359 	switch (ch) {
    360 	case 'a':			/* escape characters */
    361 		return '\7';
    362 	case 'b':
    363 		return '\b';
    364 	case 'e':
    365 		return '\033';
    366 	case 'f':
    367 		return '\f';
    368 	case 'n':
    369 		return '\n';
    370 	case 'r':
    371 		return '\r';
    372 	case 't':
    373 		return '\t';
    374 	case 'v':
    375 		return '\13';
    376 	case '\0':			/*  \" -> \ */
    377 		s->state = EOS;
    378 		return '\\';
    379 	default:			/* \x" -> x */
    380 		return ch;
    381 	}
    382 }
    383