/*	$NetBSD: str.c,v 1.23 2013/08/11 00:39:22 dholland Exp $	*/
2
3 /*-
4 * Copyright (c) 1991, 1993
5 * The Regents of the University of California. All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 * 3. Neither the name of the University nor the names of its contributors
16 * may be used to endorse or promote products derived from this software
17 * without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 */
31
32 #include <sys/cdefs.h>
33 #ifndef lint
34 #if 0
35 static char sccsid[] = "@(#)str.c 8.2 (Berkeley) 4/28/95";
36 #endif
37 __RCSID("$NetBSD: str.c,v 1.23 2013/08/11 00:39:22 dholland Exp $");
38 #endif /* not lint */
39
#include <sys/types.h>

#include <assert.h>
#include <ctype.h>
#include <err.h>
#include <errno.h>
#include <limits.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "extern.h"
52
/*
 * Parsing state for one user-supplied string.  next() advances through
 * the text and yields one expanded character per call.
 */
struct str {
	/* Which of the two strings this is. */
	enum { STRING1, STRING2 } which;
	/* What next() is currently expanding. */
	enum { EOS, INFINITE, NORMAL, RANGE, SEQUENCE, SET } state;
	int cnt;			/* remaining count, or index into set */
	int lastch;			/* most recently produced character */
	int equiv[2];			/* backing store for an [=c=] set */
	int *set;			/* set walked while in the SET state */
	const unsigned char *str;	/* unread part of the user's string */
};
62
63 static int backslash(STR *);
64 static int bracket(STR *);
65 static int c_class(const void *, const void *);
66 static void genclass(STR *);
67 static void genequiv(STR *);
68 static int genrange(STR *);
69 static void genseq(STR *);
70
71 STR *
72 str_create(int whichstring, const char *txt)
73 {
74 STR *s;
75
76 s = malloc(sizeof(*s));
77 if (s == NULL) {
78 err(1, "Out of memory");
79 }
80
81 s->which = whichstring == 2 ? STRING2 : STRING1;
82 s->state = NORMAL;
83 s->cnt = 0;
84 s->lastch = OOBCH;
85 s->equiv[0] = 0;
86 s->equiv[1] = OOBCH;
87 s->set = NULL;
88 s->str = txt;
89
90 return s;
91 }
92
93 void
94 str_destroy(STR *s)
95 {
96 if (s->set != NULL && s->set != s->equiv) {
97 free(s->set);
98 }
99 free(s);
100 }
101
102 int
103 next(STR *s, int *ret)
104 {
105 int ch;
106
107 switch (s->state) {
108 case EOS:
109 *ret = s->lastch;
110 return 0;
111 case INFINITE:
112 *ret = s->lastch;
113 return 1;
114 case NORMAL:
115 switch (ch = *s->str) {
116 case '\0':
117 s->state = EOS;
118 *ret = s->lastch;
119 return 0;
120 case '\\':
121 s->lastch = backslash(s);
122 break;
123 case '[':
124 if (bracket(s))
125 return next(s, ret);
126 /* FALLTHROUGH */
127 default:
128 ++s->str;
129 s->lastch = ch;
130 break;
131 }
132
133 /* We can start a range at any time. */
134 if (s->str[0] == '-' && genrange(s)) {
135 return next(s, ret);
136 }
137 *ret = s->lastch;
138 return 1;
139 case RANGE:
140 if (s->cnt-- == 0) {
141 s->state = NORMAL;
142 return next(s, ret);
143 }
144 ++s->lastch;
145 *ret = s->lastch;
146 return 1;
147 case SEQUENCE:
148 if (s->cnt-- == 0) {
149 s->state = NORMAL;
150 return next(s, ret);
151 }
152 *ret = s->lastch;
153 return 1;
154 case SET:
155 if ((s->lastch = s->set[s->cnt++]) == OOBCH) {
156 s->state = NORMAL;
157 return next(s, ret);
158 }
159 *ret = s->lastch;
160 return 1;
161 }
162 /* NOTREACHED */
163 assert(0);
164 *ret = s->lastch;
165 return 0;
166 }
167
168 static int
169 bracket(STR *s)
170 {
171 char *p;
172
173 switch (s->str[1]) {
174 case ':': /* "[:class:]" */
175 if ((p = strstr(s->str + 2, ":]")) == NULL)
176 return 0;
177 *p = '\0';
178 s->str += 2;
179 genclass(s);
180 s->str = p + 2;
181 return 1;
182 case '=': /* "[=equiv=]" */
183 if ((p = strstr(s->str + 2, "=]")) == NULL)
184 return 0;
185 s->str += 2;
186 genequiv(s);
187 return 1;
188 default: /* "[\###*n]" or "[#*n]" */
189 if ((p = strpbrk(s->str + 2, "*]")) == NULL)
190 return 0;
191 if (p[0] != '*' || strchr(p, ']') == NULL)
192 return 0;
193 s->str += 1;
194 genseq(s);
195 return 1;
196 }
197 /* NOTREACHED */
198 }
199
/* A named character class paired with the predicate that tests membership. */
typedef struct {
	const char *name;		/* class name, e.g. "alpha" */
	int (*func)(int ch);		/* nonzero when ch is in the class */
} CLASS;
204
205 static const CLASS classes[] = {
206 { "alnum", isalnum },
207 { "alpha", isalpha },
208 { "blank", isblank },
209 { "cntrl", iscntrl },
210 { "digit", isdigit },
211 { "graph", isgraph },
212 { "lower", islower },
213 { "print", isprint },
214 { "punct", ispunct },
215 { "space", isspace },
216 { "upper", isupper },
217 { "xdigit", isxdigit },
218 };
219
220 static void
221 genclass(STR *s)
222 {
223 int cnt;
224 const CLASS *cp;
225 CLASS tmp;
226 int *p;
227
228 tmp.name = s->str;
229 if ((cp = bsearch(&tmp, classes, sizeof(classes) /
230 sizeof(*cp), sizeof(*cp), c_class)) == NULL)
231 errx(1, "unknown class %s", s->str);
232
233 if ((s->set = p = malloc((NCHARS + 1) * sizeof(*p))) == NULL)
234 err(1, "malloc");
235
236 for (cnt = 0; cnt < NCHARS; ++cnt)
237 if ((*cp->func)(cnt))
238 *p++ = cnt;
239 *p++ = OOBCH;
240 memset(p, 0, NCHARS + 1 - (p - s->set));
241
242 s->cnt = 0;
243 s->state = SET;
244 }
245
246 static int
247 c_class(const void *a, const void *b)
248 {
249 return strcmp(((const CLASS *)a)->name, ((const CLASS *)b)->name);
250 }
251
252 /*
253 * English doesn't have any equivalence classes, so for now
254 * we just syntax check and grab the character.
255 */
256 static void
257 genequiv(STR *s)
258 {
259 if (*s->str == '\\') {
260 s->equiv[0] = backslash(s);
261 if (*s->str != '=')
262 errx(1, "misplaced equivalence equals sign");
263 } else {
264 s->equiv[0] = s->str[0];
265 if (s->str[1] != '=')
266 errx(1, "misplaced equivalence equals sign");
267 }
268 s->str += 2;
269 s->cnt = 0;
270 s->state = SET;
271 s->set = s->equiv;
272 }
273
274 static int
275 genrange(STR *s)
276 {
277 int stopval;
278 const char *savestart;
279
280 savestart = s->str;
281 stopval = *++s->str == '\\' ? backslash(s) : *s->str++;
282 if (stopval < (u_char)s->lastch) {
283 s->str = savestart;
284 return 0;
285 }
286 s->cnt = stopval - s->lastch + 1;
287 s->state = RANGE;
288 --s->lastch;
289 return 1;
290 }
291
292 static void
293 genseq(STR *s)
294 {
295 char *ep;
296
297 if (s->which == STRING1)
298 errx(1, "sequences only valid in string2");
299
300 if (*s->str == '\\')
301 s->lastch = backslash(s);
302 else
303 s->lastch = *s->str++;
304 if (*s->str != '*')
305 errx(1, "misplaced sequence asterisk");
306
307 switch (*++s->str) {
308 case '\\':
309 s->cnt = backslash(s);
310 break;
311 case ']':
312 s->cnt = 0;
313 ++s->str;
314 break;
315 default:
316 if (isdigit(*s->str)) {
317 s->cnt = strtol(s->str, &ep, 0);
318 if (*ep == ']') {
319 s->str = ep + 1;
320 break;
321 }
322 }
323 errx(1, "illegal sequence count");
324 /* NOTREACHED */
325 }
326
327 s->state = s->cnt ? SEQUENCE : INFINITE;
328 }
329
330 /*
331 * Translate \??? into a character. Up to 3 octal digits, if no digits either
332 * an escape code or a literal character.
333 */
334 static int
335 backslash(STR *s)
336 {
337 int ch, cnt, val;
338
339 for (cnt = val = 0;;) {
340 ch = *++s->str;
341 if (!isascii(ch) || !isdigit(ch))
342 break;
343 val = val * 8 + ch - '0';
344 if (++cnt == 3) {
345 ++s->str;
346 break;
347 }
348 }
349 if (cnt)
350 return val;
351 if (ch != '\0')
352 ++s->str;
353 switch (ch) {
354 case 'a': /* escape characters */
355 return '\7';
356 case 'b':
357 return '\b';
358 case 'e':
359 return '\033';
360 case 'f':
361 return '\f';
362 case 'n':
363 return '\n';
364 case 'r':
365 return '\r';
366 case 't':
367 return '\t';
368 case 'v':
369 return '\13';
370 case '\0': /* \" -> \ */
371 s->state = EOS;
372 return '\\';
373 default: /* \x" -> x */
374 return ch;
375 }
376 }
377