str.c revision 1.22 1 /* $NetBSD: str.c,v 1.22 2013/08/11 00:05:49 dholland Exp $ */
2
3 /*-
4 * Copyright (c) 1991, 1993
5 * The Regents of the University of California. All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 * 3. Neither the name of the University nor the names of its contributors
16 * may be used to endorse or promote products derived from this software
17 * without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 */
31
32 #include <sys/cdefs.h>
33 #ifndef lint
34 #if 0
35 static char sccsid[] = "@(#)str.c 8.2 (Berkeley) 4/28/95";
36 #endif
37 __RCSID("$NetBSD: str.c,v 1.22 2013/08/11 00:05:49 dholland Exp $");
38 #endif /* not lint */
39
40 #include <sys/types.h>
41
#include <assert.h>
#include <ctype.h>
#include <err.h>
#include <errno.h>
#include <limits.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
50
51 #include "extern.h"
52
/*
 * Expansion state for one user-supplied string (string1 or string2).
 */
struct str {
	enum {
		STRING1,
		STRING2
	} which;			/* which of the two strings this is */
	enum {
		EOS,			/* string exhausted */
		INFINITE,		/* repeat lastch without end */
		NORMAL,			/* reading characters from str */
		RANGE,			/* stepping through a "a-z" range */
		SEQUENCE,		/* repeating lastch cnt times */
		SET			/* walking the set[] array */
	} state;
	int cnt;			/* character count */
	int lastch;			/* last character */
	int equiv[2];			/* equivalence set */
	int *set;			/* set of characters */
	const unsigned char *str;	/* user's string */
};
62
63 static int backslash(STR *);
64 static int bracket(STR *);
65 static int c_class(const void *, const void *);
66 static void genclass(STR *);
67 static void genequiv(STR *);
68 static int genrange(STR *);
69 static void genseq(STR *);
70
71 STR *
72 str_create(int whichstring)
73 {
74 STR *s;
75
76 s = malloc(sizeof(*s));
77 if (s == NULL) {
78 err(1, "Out of memory");
79 }
80
81 s->which = whichstring == 2 ? STRING2 : STRING1;
82 s->state = NORMAL;
83 s->cnt = 0;
84 s->lastch = OOBCH;
85 s->equiv[0] = 0;
86 s->equiv[1] = OOBCH;
87 s->set = NULL;
88 s->str = NULL;
89
90 return s;
91 }
92
93 void
94 str_destroy(STR *s)
95 {
96 if (s->set != NULL && s->set != s->equiv) {
97 free(s->set);
98 }
99 free(s);
100 }
101
102 void
103 str_setstring(STR *s, const char *txt)
104 {
105 s->str = txt;
106 }
107
108 int
109 next(STR *s, int *ret)
110 {
111 int ch;
112
113 switch (s->state) {
114 case EOS:
115 *ret = s->lastch;
116 return 0;
117 case INFINITE:
118 *ret = s->lastch;
119 return 1;
120 case NORMAL:
121 switch (ch = *s->str) {
122 case '\0':
123 s->state = EOS;
124 *ret = s->lastch;
125 return 0;
126 case '\\':
127 s->lastch = backslash(s);
128 break;
129 case '[':
130 if (bracket(s))
131 return next(s, ret);
132 /* FALLTHROUGH */
133 default:
134 ++s->str;
135 s->lastch = ch;
136 break;
137 }
138
139 /* We can start a range at any time. */
140 if (s->str[0] == '-' && genrange(s)) {
141 return next(s, ret);
142 }
143 *ret = s->lastch;
144 return 1;
145 case RANGE:
146 if (s->cnt-- == 0) {
147 s->state = NORMAL;
148 return next(s, ret);
149 }
150 ++s->lastch;
151 *ret = s->lastch;
152 return 1;
153 case SEQUENCE:
154 if (s->cnt-- == 0) {
155 s->state = NORMAL;
156 return next(s, ret);
157 }
158 *ret = s->lastch;
159 return 1;
160 case SET:
161 if ((s->lastch = s->set[s->cnt++]) == OOBCH) {
162 s->state = NORMAL;
163 return next(s, ret);
164 }
165 *ret = s->lastch;
166 return 1;
167 }
168 /* NOTREACHED */
169 assert(0);
170 *ret = s->lastch;
171 return 0;
172 }
173
174 static int
175 bracket(STR *s)
176 {
177 char *p;
178
179 switch (s->str[1]) {
180 case ':': /* "[:class:]" */
181 if ((p = strstr(s->str + 2, ":]")) == NULL)
182 return 0;
183 *p = '\0';
184 s->str += 2;
185 genclass(s);
186 s->str = p + 2;
187 return 1;
188 case '=': /* "[=equiv=]" */
189 if ((p = strstr(s->str + 2, "=]")) == NULL)
190 return 0;
191 s->str += 2;
192 genequiv(s);
193 return 1;
194 default: /* "[\###*n]" or "[#*n]" */
195 if ((p = strpbrk(s->str + 2, "*]")) == NULL)
196 return 0;
197 if (p[0] != '*' || strchr(p, ']') == NULL)
198 return 0;
199 s->str += 1;
200 genseq(s);
201 return 1;
202 }
203 /* NOTREACHED */
204 }
205
/* A character class name and the <ctype.h> predicate that implements it. */
typedef struct {
	const char *name;
	int (*func)(int);
} CLASS;

/*
 * Known character classes.  The table is searched with bsearch() using
 * strcmp() on the name, so entries must stay sorted by name.
 */
static const CLASS classes[] = {
	{ .name = "alnum",  .func = isalnum  },
	{ .name = "alpha",  .func = isalpha  },
	{ .name = "blank",  .func = isblank  },
	{ .name = "cntrl",  .func = iscntrl  },
	{ .name = "digit",  .func = isdigit  },
	{ .name = "graph",  .func = isgraph  },
	{ .name = "lower",  .func = islower  },
	{ .name = "print",  .func = isprint  },
	{ .name = "punct",  .func = ispunct  },
	{ .name = "space",  .func = isspace  },
	{ .name = "upper",  .func = isupper  },
	{ .name = "xdigit", .func = isxdigit },
};
225
226 static void
227 genclass(STR *s)
228 {
229 int cnt;
230 const CLASS *cp;
231 CLASS tmp;
232 int *p;
233
234 tmp.name = s->str;
235 if ((cp = bsearch(&tmp, classes, sizeof(classes) /
236 sizeof(*cp), sizeof(*cp), c_class)) == NULL)
237 errx(1, "unknown class %s", s->str);
238
239 if ((s->set = p = malloc((NCHARS + 1) * sizeof(*p))) == NULL)
240 err(1, "malloc");
241
242 for (cnt = 0; cnt < NCHARS; ++cnt)
243 if ((*cp->func)(cnt))
244 *p++ = cnt;
245 *p++ = OOBCH;
246 memset(p, 0, NCHARS + 1 - (p - s->set));
247
248 s->cnt = 0;
249 s->state = SET;
250 }
251
252 static int
253 c_class(const void *a, const void *b)
254 {
255 return strcmp(((const CLASS *)a)->name, ((const CLASS *)b)->name);
256 }
257
258 /*
259 * English doesn't have any equivalence classes, so for now
260 * we just syntax check and grab the character.
261 */
262 static void
263 genequiv(STR *s)
264 {
265 if (*s->str == '\\') {
266 s->equiv[0] = backslash(s);
267 if (*s->str != '=')
268 errx(1, "misplaced equivalence equals sign");
269 } else {
270 s->equiv[0] = s->str[0];
271 if (s->str[1] != '=')
272 errx(1, "misplaced equivalence equals sign");
273 }
274 s->str += 2;
275 s->cnt = 0;
276 s->state = SET;
277 s->set = s->equiv;
278 }
279
280 static int
281 genrange(STR *s)
282 {
283 int stopval;
284 const char *savestart;
285
286 savestart = s->str;
287 stopval = *++s->str == '\\' ? backslash(s) : *s->str++;
288 if (stopval < (u_char)s->lastch) {
289 s->str = savestart;
290 return 0;
291 }
292 s->cnt = stopval - s->lastch + 1;
293 s->state = RANGE;
294 --s->lastch;
295 return 1;
296 }
297
298 static void
299 genseq(STR *s)
300 {
301 char *ep;
302
303 if (s->which == STRING1)
304 errx(1, "sequences only valid in string2");
305
306 if (*s->str == '\\')
307 s->lastch = backslash(s);
308 else
309 s->lastch = *s->str++;
310 if (*s->str != '*')
311 errx(1, "misplaced sequence asterisk");
312
313 switch (*++s->str) {
314 case '\\':
315 s->cnt = backslash(s);
316 break;
317 case ']':
318 s->cnt = 0;
319 ++s->str;
320 break;
321 default:
322 if (isdigit(*s->str)) {
323 s->cnt = strtol(s->str, &ep, 0);
324 if (*ep == ']') {
325 s->str = ep + 1;
326 break;
327 }
328 }
329 errx(1, "illegal sequence count");
330 /* NOTREACHED */
331 }
332
333 s->state = s->cnt ? SEQUENCE : INFINITE;
334 }
335
336 /*
337 * Translate \??? into a character. Up to 3 octal digits, if no digits either
338 * an escape code or a literal character.
339 */
340 static int
341 backslash(STR *s)
342 {
343 int ch, cnt, val;
344
345 for (cnt = val = 0;;) {
346 ch = *++s->str;
347 if (!isascii(ch) || !isdigit(ch))
348 break;
349 val = val * 8 + ch - '0';
350 if (++cnt == 3) {
351 ++s->str;
352 break;
353 }
354 }
355 if (cnt)
356 return val;
357 if (ch != '\0')
358 ++s->str;
359 switch (ch) {
360 case 'a': /* escape characters */
361 return '\7';
362 case 'b':
363 return '\b';
364 case 'e':
365 return '\033';
366 case 'f':
367 return '\f';
368 case 'n':
369 return '\n';
370 case 'r':
371 return '\r';
372 case 't':
373 return '\t';
374 case 'v':
375 return '\13';
376 case '\0': /* \" -> \ */
377 s->state = EOS;
378 return '\\';
379 default: /* \x" -> x */
380 return ch;
381 }
382 }
383