str.c revision 1.16 1 /* $NetBSD: str.c,v 1.16 2011/09/08 01:18:05 christos Exp $ */
2
3 /*-
4 * Copyright (c) 1991, 1993
5 * The Regents of the University of California. All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 * 3. Neither the name of the University nor the names of its contributors
16 * may be used to endorse or promote products derived from this software
17 * without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 */
31
32 #include <sys/cdefs.h>
33 #ifndef lint
34 #if 0
35 static char sccsid[] = "@(#)str.c 8.2 (Berkeley) 4/28/95";
36 #endif
37 __RCSID("$NetBSD: str.c,v 1.16 2011/09/08 01:18:05 christos Exp $");
38 #endif /* not lint */
39
40 #include <sys/types.h>
41
42 #include <err.h>
43 #include <errno.h>
44 #include <stddef.h>
45 #include <stdio.h>
46 #include <stdlib.h>
47 #include <string.h>
48 #include <ctype.h>
49
50 #include "extern.h"
51
52 static int backslash(STR *);
53 static int bracket(STR *);
54 static int c_class(const void *, const void *);
55 static void genclass(STR *);
56 static void genequiv(STR *);
57 static int genrange(STR *);
58 static void genseq(STR *);
59
60 int
61 next(STR *s)
62 {
63 int ch;
64
65 switch (s->state) {
66 case EOS:
67 return 0;
68 case INFINITE:
69 return 1;
70 case NORMAL:
71 switch (ch = *s->str) {
72 case '\0':
73 s->state = EOS;
74 return 0;
75 case '\\':
76 s->lastch = backslash(s);
77 break;
78 case '[':
79 if (bracket(s))
80 return next(s);
81 /* FALLTHROUGH */
82 default:
83 ++s->str;
84 s->lastch = ch;
85 break;
86 }
87
88 /* We can start a range at any time. */
89 if (s->str[0] == '-' && genrange(s))
90 return next(s);
91 return 1;
92 case RANGE:
93 if (s->cnt-- == 0) {
94 s->state = NORMAL;
95 return next(s);
96 }
97 ++s->lastch;
98 return 1;
99 case SEQUENCE:
100 if (s->cnt-- == 0) {
101 s->state = NORMAL;
102 return next(s);
103 }
104 return 1;
105 case SET:
106 if ((s->lastch = s->set[s->cnt++]) == OOBCH) {
107 s->state = NORMAL;
108 return next(s);
109 }
110 return 1;
111 }
112 /* NOTREACHED */
113 return 0;
114 }
115
116 static int
117 bracket(STR *s)
118 {
119 char *p;
120
121 switch (s->str[1]) {
122 case ':': /* "[:class:]" */
123 if ((p = strstr(s->str + 2, ":]")) == NULL)
124 return 0;
125 *p = '\0';
126 s->str += 2;
127 genclass(s);
128 s->str = p + 2;
129 return 1;
130 case '=': /* "[=equiv=]" */
131 if ((p = strstr(s->str + 2, "=]")) == NULL)
132 return 0;
133 s->str += 2;
134 genequiv(s);
135 return 1;
136 default: /* "[\###*n]" or "[#*n]" */
137 if ((p = strpbrk(s->str + 2, "*]")) == NULL)
138 return 0;
139 if (p[0] != '*' || strchr(p, ']') == NULL)
140 return 0;
141 s->str += 1;
142 genseq(s);
143 return 1;
144 }
145 /* NOTREACHED */
146 }
147
/*
 * Maps a character class name to the <ctype.h> predicate implementing it.
 */
typedef struct {
	const char *name;	/* class name as written inside "[:name:]" */
	int (*func)(int);	/* predicate deciding class membership */
} CLASS;

/*
 * Known character classes.  genclass() looks names up with bsearch(), so
 * the entries must stay sorted by name in strcmp() order.
 */
static const CLASS classes[] = {
	{ .name = "alnum",  .func = isalnum  },
	{ .name = "alpha",  .func = isalpha  },
	{ .name = "blank",  .func = isblank  },
	{ .name = "cntrl",  .func = iscntrl  },
	{ .name = "digit",  .func = isdigit  },
	{ .name = "graph",  .func = isgraph  },
	{ .name = "lower",  .func = islower  },
	{ .name = "print",  .func = isprint  },
	{ .name = "punct",  .func = ispunct  },
	{ .name = "space",  .func = isspace  },
	{ .name = "upper",  .func = isupper  },
	{ .name = "xdigit", .func = isxdigit },
};
167
168 static void
169 genclass(STR *s)
170 {
171 int cnt, (*func)(int);
172 const CLASS *cp;
173 CLASS tmp;
174 int *p;
175
176 tmp.name = s->str;
177 if ((cp = bsearch(&tmp, classes, sizeof(classes) /
178 sizeof(*cp), sizeof(*cp), c_class)) == NULL)
179 errx(1, "unknown class %s", s->str);
180
181 if ((s->set = p = malloc((NCHARS + 1) * sizeof(*p))) == NULL)
182 err(1, "malloc");
183 for (cnt = 0, func = cp->func; cnt < NCHARS; ++cnt)
184 if ((func)(cnt))
185 *p++ = cnt;
186 *p = OOBCH;
187
188 s->cnt = 0;
189 s->state = SET;
190 }
191
192 static int
193 c_class(const void *a, const void *b)
194 {
195 return strcmp(((const CLASS *)a)->name, ((const CLASS *)b)->name);
196 }
197
198 /*
199 * English doesn't have any equivalence classes, so for now
200 * we just syntax check and grab the character.
201 */
202 static void
203 genequiv(STR *s)
204 {
205 if (*s->str == '\\') {
206 s->equiv[0] = backslash(s);
207 if (*s->str != '=')
208 errx(1, "misplaced equivalence equals sign");
209 } else {
210 s->equiv[0] = s->str[0];
211 if (s->str[1] != '=')
212 errx(1, "misplaced equivalence equals sign");
213 }
214 s->str += 2;
215 s->cnt = 0;
216 s->state = SET;
217 s->set = s->equiv;
218 }
219
220 static int
221 genrange(STR *s)
222 {
223 int stopval;
224 char *savestart;
225
226 savestart = s->str;
227 stopval = *++s->str == '\\' ? backslash(s) : *s->str++;
228 if (stopval < (u_char)s->lastch) {
229 s->str = savestart;
230 return 0;
231 }
232 s->cnt = stopval - s->lastch + 1;
233 s->state = RANGE;
234 --s->lastch;
235 return 1;
236 }
237
238 static void
239 genseq(STR *s)
240 {
241 char *ep;
242
243 if (s->which == STRING1)
244 errx(1, "sequences only valid in string2");
245
246 if (*s->str == '\\')
247 s->lastch = backslash(s);
248 else
249 s->lastch = *s->str++;
250 if (*s->str != '*')
251 errx(1, "misplaced sequence asterisk");
252
253 switch (*++s->str) {
254 case '\\':
255 s->cnt = backslash(s);
256 break;
257 case ']':
258 s->cnt = 0;
259 ++s->str;
260 break;
261 default:
262 if (isdigit(*s->str)) {
263 s->cnt = strtol(s->str, &ep, 0);
264 if (*ep == ']') {
265 s->str = ep + 1;
266 break;
267 }
268 }
269 errx(1, "illegal sequence count");
270 /* NOTREACHED */
271 }
272
273 s->state = s->cnt ? SEQUENCE : INFINITE;
274 }
275
276 /*
277 * Translate \??? into a character. Up to 3 octal digits, if no digits either
278 * an escape code or a literal character.
279 */
280 static int
281 backslash(STR *s)
282 {
283 int ch, cnt, val;
284
285 for (cnt = val = 0;;) {
286 ch = *++s->str;
287 if (!isascii(ch) || !isdigit(ch))
288 break;
289 val = val * 8 + ch - '0';
290 if (++cnt == 3) {
291 ++s->str;
292 break;
293 }
294 }
295 if (cnt)
296 return val;
297 if (ch != '\0')
298 ++s->str;
299 switch (ch) {
300 case 'a': /* escape characters */
301 return '\7';
302 case 'b':
303 return '\b';
304 case 'e':
305 return '\033';
306 case 'f':
307 return '\f';
308 case 'n':
309 return '\n';
310 case 'r':
311 return '\r';
312 case 't':
313 return '\t';
314 case 'v':
315 return '\13';
316 case '\0': /* \" -> \ */
317 s->state = EOS;
318 return '\\';
319 default: /* \x" -> x */
320 return ch;
321 }
322 }
323