str.c revision 1.26 1 /* $NetBSD: str.c,v 1.26 2013/08/11 01:29:28 dholland Exp $ */
2
3 /*-
4 * Copyright (c) 1991, 1993
5 * The Regents of the University of California. All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 * 3. Neither the name of the University nor the names of its contributors
16 * may be used to endorse or promote products derived from this software
17 * without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 */
31
32 #include <sys/cdefs.h>
33 #ifndef lint
34 #if 0
35 static char sccsid[] = "@(#)str.c 8.2 (Berkeley) 4/28/95";
36 #endif
37 __RCSID("$NetBSD: str.c,v 1.26 2013/08/11 01:29:28 dholland Exp $");
38 #endif /* not lint */
39
40 #include <sys/types.h>
41
42 #include <err.h>
43 #include <errno.h>
44 #include <stddef.h>
45 #include <stdio.h>
46 #include <stdlib.h>
47 #include <string.h>
48 #include <ctype.h>
49 #include <assert.h>
50
51 #include "extern.h"
52
53 struct str {
54 enum { STRING1, STRING2 } which;
55 enum { EOS, INFINITE, NORMAL, RANGE, SEQUENCE, SET } state;
56 int cnt; /* character count */
57 int lastch; /* last character */
58 int equiv[2]; /* equivalence set */
59 int *set; /* set of characters */
60 const char *str; /* user's string */
61 };
62
63 static int backslash(STR *);
64 static int bracket(STR *);
65 static int c_class(const void *, const void *);
66 static void genclass(STR *, const char *, size_t);
67 static void genequiv(STR *);
68 static int genrange(STR *);
69 static void genseq(STR *);
70
71 STR *
72 str_create(int whichstring, const char *txt)
73 {
74 STR *s;
75
76 s = malloc(sizeof(*s));
77 if (s == NULL) {
78 err(1, "Out of memory");
79 }
80
81 s->which = whichstring == 2 ? STRING2 : STRING1;
82 s->state = NORMAL;
83 s->cnt = 0;
84 s->lastch = OOBCH;
85 s->equiv[0] = 0;
86 s->equiv[1] = OOBCH;
87 s->set = NULL;
88 s->str = txt;
89
90 return s;
91 }
92
93 void
94 str_destroy(STR *s)
95 {
96 if (s->set != NULL && s->set != s->equiv) {
97 free(s->set);
98 }
99 free(s);
100 }
101
102 int
103 next(STR *s, int *ret)
104 {
105 int ch;
106
107 switch (s->state) {
108 case EOS:
109 *ret = s->lastch;
110 return 0;
111 case INFINITE:
112 *ret = s->lastch;
113 return 1;
114 case NORMAL:
115 ch = (unsigned char)s->str[0];
116 switch (ch) {
117 case '\0':
118 s->state = EOS;
119 *ret = s->lastch;
120 return 0;
121 case '\\':
122 s->lastch = backslash(s);
123 break;
124 case '[':
125 if (bracket(s)) {
126 return next(s, ret);
127 }
128 /* FALLTHROUGH */
129 default:
130 ++s->str;
131 s->lastch = ch;
132 break;
133 }
134
135 /* We can start a range at any time. */
136 if (s->str[0] == '-' && genrange(s)) {
137 return next(s, ret);
138 }
139 *ret = s->lastch;
140 return 1;
141 case RANGE:
142 if (s->cnt == 0) {
143 s->state = NORMAL;
144 return next(s, ret);
145 }
146 s->cnt--;
147 ++s->lastch;
148 *ret = s->lastch;
149 return 1;
150 case SEQUENCE:
151 if (s->cnt == 0) {
152 s->state = NORMAL;
153 return next(s, ret);
154 }
155 s->cnt--;
156 *ret = s->lastch;
157 return 1;
158 case SET:
159 s->lastch = s->set[s->cnt++];
160 if (s->lastch == OOBCH) {
161 s->state = NORMAL;
162 if (s->set != s->equiv) {
163 free(s->set);
164 }
165 s->set = NULL;
166 return next(s, ret);
167 }
168 *ret = s->lastch;
169 return 1;
170 }
171 /* NOTREACHED */
172 assert(0);
173 *ret = s->lastch;
174 return 0;
175 }
176
177 static int
178 bracket(STR *s)
179 {
180 const char *p;
181
182 switch (s->str[1]) {
183 case ':': /* "[:class:]" */
184 if ((p = strstr(s->str + 2, ":]")) == NULL)
185 return 0;
186 s->str += 2;
187 genclass(s, s->str, p - s->str);
188 s->str = p + 2;
189 return 1;
190 case '=': /* "[=equiv=]" */
191 if ((p = strstr(s->str + 2, "=]")) == NULL)
192 return 0;
193 s->str += 2;
194 genequiv(s);
195 return 1;
196 default: /* "[\###*n]" or "[#*n]" */
197 if ((p = strpbrk(s->str + 2, "*]")) == NULL)
198 return 0;
199 if (p[0] != '*' || strchr(p, ']') == NULL)
200 return 0;
201 s->str += 1;
202 genseq(s);
203 return 1;
204 }
205 /* NOTREACHED */
206 }
207
208 typedef struct {
209 const char *name;
210 int (*func)(int);
211 } CLASS;
212
213 static const CLASS classes[] = {
214 { "alnum", isalnum },
215 { "alpha", isalpha },
216 { "blank", isblank },
217 { "cntrl", iscntrl },
218 { "digit", isdigit },
219 { "graph", isgraph },
220 { "lower", islower },
221 { "print", isprint },
222 { "punct", ispunct },
223 { "space", isspace },
224 { "upper", isupper },
225 { "xdigit", isxdigit },
226 };
227
228 typedef struct {
229 const char *name;
230 size_t len;
231 } CLASSKEY;
232
233 static void
234 genclass(STR *s, const char *class, size_t len)
235 {
236 int ch;
237 const CLASS *cp;
238 CLASSKEY key;
239 int *p;
240 unsigned pos, num;
241
242 /* Find the class */
243 key.name = class;
244 key.len = len;
245 cp = bsearch(&key, classes, __arraycount(classes), sizeof(classes[0]),
246 c_class);
247 if (cp == NULL) {
248 errx(1, "unknown class %.*s", (int)len, class);
249 }
250
251 /*
252 * Figure out what characters are in the class
253 */
254
255 num = NCHARS + 1;
256 p = malloc(num * sizeof(*p));
257 if (p == NULL) {
258 err(1, "malloc");
259 }
260
261 pos = 0;
262 for (ch = 0; ch < NCHARS; ch++) {
263 if (cp->func(ch)) {
264 p[pos++] = ch;
265 }
266 }
267
268 p[pos++] = OOBCH;
269 for (; pos < num; pos++) {
270 p[pos] = 0;
271 }
272
273 /* Update the state */
274
275 s->set = p;
276 s->cnt = 0;
277 s->state = SET;
278 }
279
280 static int
281 c_class(const void *av, const void *bv)
282 {
283 const CLASSKEY *a = av;
284 const CLASS *b = bv;
285 size_t blen;
286 int r;
287
288 blen = strlen(b->name);
289 r = strncmp(a->name, b->name, a->len);
290 if (r != 0) {
291 return r;
292 }
293 if (a->len < blen) {
294 /* someone gave us a prefix of the right name */
295 return -1;
296 }
297 assert(a-> len == blen);
298 return 0;
299 }
300
301 /*
302 * English doesn't have any equivalence classes, so for now
303 * we just syntax check and grab the character.
304 */
305 static void
306 genequiv(STR *s)
307 {
308 if (*s->str == '\\') {
309 s->equiv[0] = backslash(s);
310 if (*s->str != '=') {
311 errx(1, "Misplaced equivalence equals sign");
312 }
313 } else {
314 s->equiv[0] = (unsigned char)s->str[0];
315 if (s->str[1] != '=') {
316 errx(1, "Misplaced equivalence equals sign");
317 }
318 }
319 s->str += 2;
320 s->cnt = 0;
321 s->state = SET;
322 s->set = s->equiv;
323 }
324
325 static int
326 genrange(STR *s)
327 {
328 int stopval;
329 const char *savestart;
330
331 savestart = s->str++;
332 stopval = s->str[0] == '\\' ? backslash(s) : (unsigned char)*s->str++;
333 if (stopval < (unsigned char)s->lastch) {
334 s->str = savestart;
335 return 0;
336 }
337 s->cnt = stopval - s->lastch + 1;
338 s->state = RANGE;
339 --s->lastch;
340 return 1;
341 }
342
343 static void
344 genseq(STR *s)
345 {
346 char *ep;
347
348 if (s->which == STRING1) {
349 errx(1, "Sequences only valid in string2");
350 }
351
352 if (*s->str == '\\') {
353 s->lastch = backslash(s);
354 } else {
355 s->lastch = (unsigned char)*s->str++;
356 }
357 if (*s->str != '*') {
358 errx(1, "Misplaced sequence asterisk");
359 }
360
361 s->str++;
362 switch (s->str[0]) {
363 case '\\':
364 s->cnt = backslash(s);
365 break;
366 case ']':
367 s->cnt = 0;
368 ++s->str;
369 break;
370 default:
371 if (isdigit((unsigned char)s->str[0])) {
372 s->cnt = strtol(s->str, &ep, 0);
373 if (*ep == ']') {
374 s->str = ep + 1;
375 break;
376 }
377 }
378 errx(1, "illegal sequence count");
379 /* NOTREACHED */
380 }
381
382 s->state = s->cnt ? SEQUENCE : INFINITE;
383 }
384
385 /*
386 * Translate \??? into a character. Up to 3 octal digits, if no digits either
387 * an escape code or a literal character.
388 */
389 static int
390 backslash(STR *s)
391 {
392 int ch, cnt, val;
393
394 for (cnt = val = 0;;) {
395 s->str++;
396 ch = (unsigned char)s->str[0];
397 if (!isascii(ch) || !isdigit(ch)) {
398 break;
399 }
400 val = val * 8 + ch - '0';
401 if (++cnt == 3) {
402 ++s->str;
403 break;
404 }
405 }
406 if (cnt) {
407 return val;
408 }
409 if (ch != '\0') {
410 ++s->str;
411 }
412 switch (ch) {
413 case 'a': /* escape characters */
414 return '\7';
415 case 'b':
416 return '\b';
417 case 'e':
418 return '\033';
419 case 'f':
420 return '\f';
421 case 'n':
422 return '\n';
423 case 'r':
424 return '\r';
425 case 't':
426 return '\t';
427 case 'v':
428 return '\13';
429 case '\0': /* \" -> \ */
430 s->state = EOS;
431 return '\\';
432 default: /* \x" -> x */
433 return ch;
434 }
435 }
436