str.c revision 1.20 1 /* $NetBSD: str.c,v 1.20 2013/08/10 23:54:41 dholland Exp $ */
2
3 /*-
4 * Copyright (c) 1991, 1993
5 * The Regents of the University of California. All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 * 3. Neither the name of the University nor the names of its contributors
16 * may be used to endorse or promote products derived from this software
17 * without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 */
31
32 #include <sys/cdefs.h>
33 #ifndef lint
34 #if 0
35 static char sccsid[] = "@(#)str.c 8.2 (Berkeley) 4/28/95";
36 #endif
37 __RCSID("$NetBSD: str.c,v 1.20 2013/08/10 23:54:41 dholland Exp $");
38 #endif /* not lint */
39
40 #include <sys/types.h>
41
42 #include <err.h>
43 #include <errno.h>
44 #include <stddef.h>
45 #include <stdio.h>
46 #include <stdlib.h>
47 #include <string.h>
48 #include <ctype.h>
49 #include <assert.h>
50
51 #include "extern.h"
52
53 static int backslash(STR *);
54 static int bracket(STR *);
55 static int c_class(const void *, const void *);
56 static void genclass(STR *);
57 static void genequiv(STR *);
58 static int genrange(STR *);
59 static void genseq(STR *);
60
61 int
62 next(STR *s, int *ret)
63 {
64 int ch;
65
66 switch (s->state) {
67 case EOS:
68 *ret = s->lastch;
69 return 0;
70 case INFINITE:
71 *ret = s->lastch;
72 return 1;
73 case NORMAL:
74 switch (ch = *s->str) {
75 case '\0':
76 s->state = EOS;
77 *ret = s->lastch;
78 return 0;
79 case '\\':
80 s->lastch = backslash(s);
81 break;
82 case '[':
83 if (bracket(s))
84 return next(s, ret);
85 /* FALLTHROUGH */
86 default:
87 ++s->str;
88 s->lastch = ch;
89 break;
90 }
91
92 /* We can start a range at any time. */
93 if (s->str[0] == '-' && genrange(s)) {
94 return next(s, ret);
95 }
96 *ret = s->lastch;
97 return 1;
98 case RANGE:
99 if (s->cnt-- == 0) {
100 s->state = NORMAL;
101 return next(s, ret);
102 }
103 ++s->lastch;
104 *ret = s->lastch;
105 return 1;
106 case SEQUENCE:
107 if (s->cnt-- == 0) {
108 s->state = NORMAL;
109 return next(s, ret);
110 }
111 *ret = s->lastch;
112 return 1;
113 case SET:
114 if ((s->lastch = s->set[s->cnt++]) == OOBCH) {
115 s->state = NORMAL;
116 return next(s, ret);
117 }
118 *ret = s->lastch;
119 return 1;
120 }
121 /* NOTREACHED */
122 assert(0);
123 *ret = s->lastch;
124 return 0;
125 }
126
127 static int
128 bracket(STR *s)
129 {
130 char *p;
131
132 switch (s->str[1]) {
133 case ':': /* "[:class:]" */
134 if ((p = strstr(s->str + 2, ":]")) == NULL)
135 return 0;
136 *p = '\0';
137 s->str += 2;
138 genclass(s);
139 s->str = p + 2;
140 return 1;
141 case '=': /* "[=equiv=]" */
142 if ((p = strstr(s->str + 2, "=]")) == NULL)
143 return 0;
144 s->str += 2;
145 genequiv(s);
146 return 1;
147 default: /* "[\###*n]" or "[#*n]" */
148 if ((p = strpbrk(s->str + 2, "*]")) == NULL)
149 return 0;
150 if (p[0] != '*' || strchr(p, ']') == NULL)
151 return 0;
152 s->str += 1;
153 genseq(s);
154 return 1;
155 }
156 /* NOTREACHED */
157 }
158
/* One character class: its name and the <ctype.h> predicate that tests it. */
typedef struct {
	const char *name;
	int (*func)(int);
} CLASS;

/*
 * Known character classes.  The entries must stay sorted by name because
 * the table is searched with bsearch().
 */
static const CLASS classes[] = {
	{ .name = "alnum",  .func = isalnum  },
	{ .name = "alpha",  .func = isalpha  },
	{ .name = "blank",  .func = isblank  },
	{ .name = "cntrl",  .func = iscntrl  },
	{ .name = "digit",  .func = isdigit  },
	{ .name = "graph",  .func = isgraph  },
	{ .name = "lower",  .func = islower  },
	{ .name = "print",  .func = isprint  },
	{ .name = "punct",  .func = ispunct  },
	{ .name = "space",  .func = isspace  },
	{ .name = "upper",  .func = isupper  },
	{ .name = "xdigit", .func = isxdigit },
};
178
179 static void
180 genclass(STR *s)
181 {
182 int cnt;
183 const CLASS *cp;
184 CLASS tmp;
185 int *p;
186
187 tmp.name = s->str;
188 if ((cp = bsearch(&tmp, classes, sizeof(classes) /
189 sizeof(*cp), sizeof(*cp), c_class)) == NULL)
190 errx(1, "unknown class %s", s->str);
191
192 if ((s->set = p = malloc((NCHARS + 1) * sizeof(*p))) == NULL)
193 err(1, "malloc");
194
195 for (cnt = 0; cnt < NCHARS; ++cnt)
196 if ((*cp->func)(cnt))
197 *p++ = cnt;
198 *p++ = OOBCH;
199 memset(p, 0, NCHARS + 1 - (p - s->set));
200
201 s->cnt = 0;
202 s->state = SET;
203 }
204
205 static int
206 c_class(const void *a, const void *b)
207 {
208 return strcmp(((const CLASS *)a)->name, ((const CLASS *)b)->name);
209 }
210
211 /*
212 * English doesn't have any equivalence classes, so for now
213 * we just syntax check and grab the character.
214 */
215 static void
216 genequiv(STR *s)
217 {
218 if (*s->str == '\\') {
219 s->equiv[0] = backslash(s);
220 if (*s->str != '=')
221 errx(1, "misplaced equivalence equals sign");
222 } else {
223 s->equiv[0] = s->str[0];
224 if (s->str[1] != '=')
225 errx(1, "misplaced equivalence equals sign");
226 }
227 s->str += 2;
228 s->cnt = 0;
229 s->state = SET;
230 s->set = s->equiv;
231 }
232
233 static int
234 genrange(STR *s)
235 {
236 int stopval;
237 char *savestart;
238
239 savestart = s->str;
240 stopval = *++s->str == '\\' ? backslash(s) : *s->str++;
241 if (stopval < (u_char)s->lastch) {
242 s->str = savestart;
243 return 0;
244 }
245 s->cnt = stopval - s->lastch + 1;
246 s->state = RANGE;
247 --s->lastch;
248 return 1;
249 }
250
251 static void
252 genseq(STR *s)
253 {
254 char *ep;
255
256 if (s->which == STRING1)
257 errx(1, "sequences only valid in string2");
258
259 if (*s->str == '\\')
260 s->lastch = backslash(s);
261 else
262 s->lastch = *s->str++;
263 if (*s->str != '*')
264 errx(1, "misplaced sequence asterisk");
265
266 switch (*++s->str) {
267 case '\\':
268 s->cnt = backslash(s);
269 break;
270 case ']':
271 s->cnt = 0;
272 ++s->str;
273 break;
274 default:
275 if (isdigit(*s->str)) {
276 s->cnt = strtol(s->str, &ep, 0);
277 if (*ep == ']') {
278 s->str = ep + 1;
279 break;
280 }
281 }
282 errx(1, "illegal sequence count");
283 /* NOTREACHED */
284 }
285
286 s->state = s->cnt ? SEQUENCE : INFINITE;
287 }
288
289 /*
290 * Translate \??? into a character. Up to 3 octal digits, if no digits either
291 * an escape code or a literal character.
292 */
293 static int
294 backslash(STR *s)
295 {
296 int ch, cnt, val;
297
298 for (cnt = val = 0;;) {
299 ch = *++s->str;
300 if (!isascii(ch) || !isdigit(ch))
301 break;
302 val = val * 8 + ch - '0';
303 if (++cnt == 3) {
304 ++s->str;
305 break;
306 }
307 }
308 if (cnt)
309 return val;
310 if (ch != '\0')
311 ++s->str;
312 switch (ch) {
313 case 'a': /* escape characters */
314 return '\7';
315 case 'b':
316 return '\b';
317 case 'e':
318 return '\033';
319 case 'f':
320 return '\f';
321 case 'n':
322 return '\n';
323 case 'r':
324 return '\r';
325 case 't':
326 return '\t';
327 case 'v':
328 return '\13';
329 case '\0': /* \" -> \ */
330 s->state = EOS;
331 return '\\';
332 default: /* \x" -> x */
333 return ch;
334 }
335 }
336