str.c revision 1.10 1 /* $NetBSD: str.c,v 1.10 2003/08/07 11:16:46 agc Exp $ */
2
3 /*-
4 * Copyright (c) 1991, 1993
5 * The Regents of the University of California. All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 * 3. Neither the name of the University nor the names of its contributors
16 * may be used to endorse or promote products derived from this software
17 * without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 */
31
32 #include <sys/cdefs.h>
33 #ifndef lint
34 #if 0
35 static char sccsid[] = "@(#)str.c 8.2 (Berkeley) 4/28/95";
36 #endif
37 __RCSID("$NetBSD: str.c,v 1.10 2003/08/07 11:16:46 agc Exp $");
38 #endif /* not lint */
39
#include <sys/cdefs.h>
#include <sys/types.h>

#include <ctype.h>
#include <err.h>
#include <errno.h>
#include <limits.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "extern.h"
52
53 static int backslash __P((STR *));
54 static int bracket __P((STR *));
55 static int c_class __P((const void *, const void *));
56 static void genclass __P((STR *));
57 static void genequiv __P((STR *));
58 static int genrange __P((STR *));
59 static void genseq __P((STR *));
60
61 int
62 next(s)
63 STR *s;
64 {
65 int ch;
66
67 switch (s->state) {
68 case EOS:
69 return (0);
70 case INFINITE:
71 return (1);
72 case NORMAL:
73 switch (ch = *s->str) {
74 case '\0':
75 s->state = EOS;
76 return (0);
77 case '\\':
78 s->lastch = backslash(s);
79 break;
80 case '[':
81 if (bracket(s))
82 return (next(s));
83 /* FALLTHROUGH */
84 default:
85 ++s->str;
86 s->lastch = ch;
87 break;
88 }
89
90 /* We can start a range at any time. */
91 if (s->str[0] == '-' && genrange(s))
92 return (next(s));
93 return (1);
94 case RANGE:
95 if (s->cnt-- == 0) {
96 s->state = NORMAL;
97 return (next(s));
98 }
99 ++s->lastch;
100 return (1);
101 case SEQUENCE:
102 if (s->cnt-- == 0) {
103 s->state = NORMAL;
104 return (next(s));
105 }
106 return (1);
107 case SET:
108 if ((s->lastch = s->set[s->cnt++]) == OOBCH) {
109 s->state = NORMAL;
110 return (next(s));
111 }
112 return (1);
113 }
114 /* NOTREACHED */
115 return (0);
116 }
117
118 static int
119 bracket(s)
120 STR *s;
121 {
122 char *p;
123
124 switch (s->str[1]) {
125 case ':': /* "[:class:]" */
126 if ((p = strstr(s->str + 2, ":]")) == NULL)
127 return (0);
128 *p = '\0';
129 s->str += 2;
130 genclass(s);
131 s->str = p + 2;
132 return (1);
133 case '=': /* "[=equiv=]" */
134 if ((p = strstr(s->str + 2, "=]")) == NULL)
135 return (0);
136 s->str += 2;
137 genequiv(s);
138 return (1);
139 default: /* "[\###*n]" or "[#*n]" */
140 if ((p = strpbrk(s->str + 2, "*]")) == NULL)
141 return (0);
142 if (p[0] != '*' || strchr(p, ']') == NULL)
143 return (0);
144 s->str += 1;
145 genseq(s);
146 return (1);
147 }
148 /* NOTREACHED */
149 }
150
/*
 * One entry per "[:name:]" character class: the class name, the <ctype.h>
 * predicate that decides membership, and the character set that genclass()
 * stores for it.
 */
typedef struct {
	char *name;
	int (*func) __P((int));
	int *set;
} CLASS;

/*
 * Keep this table sorted by name: genclass() looks entries up with
 * bsearch().  Every set pointer starts out null.
 */
static CLASS classes[] = {
	{ "alnum",  isalnum,  NULL },
	{ "alpha",  isalpha,  NULL },
	{ "blank",  isblank,  NULL },
	{ "cntrl",  iscntrl,  NULL },
	{ "digit",  isdigit,  NULL },
	{ "graph",  isgraph,  NULL },
	{ "lower",  islower,  NULL },
	{ "print",  isprint,  NULL },
	{ "punct",  ispunct,  NULL },
	{ "space",  isspace,  NULL },
	{ "upper",  isupper,  NULL },
	{ "xdigit", isxdigit, NULL },
};
171
172 static void
173 genclass(s)
174 STR *s;
175 {
176 int cnt, (*func) __P((int));
177 CLASS *cp, tmp;
178 int *p;
179
180 tmp.name = s->str;
181 if ((cp = (CLASS *)bsearch(&tmp, classes, sizeof(classes) /
182 sizeof(CLASS), sizeof(CLASS), c_class)) == NULL)
183 errx(1, "unknown class %s", s->str);
184
185 if ((cp->set = p = malloc((NCHARS + 1) * sizeof(int))) == NULL)
186 err(1, "malloc");
187 memset(p, 0, (NCHARS + 1) * sizeof(int));
188 for (cnt = 0, func = cp->func; cnt < NCHARS; ++cnt)
189 if ((func)(cnt))
190 *p++ = cnt;
191 *p = OOBCH;
192
193 s->cnt = 0;
194 s->state = SET;
195 s->set = cp->set;
196 }
197
198 static int
199 c_class(a, b)
200 const void *a, *b;
201 {
202 return (strcmp(((CLASS *)a)->name, ((CLASS *)b)->name));
203 }
204
205 /*
206 * English doesn't have any equivalence classes, so for now
207 * we just syntax check and grab the character.
208 */
209 static void
210 genequiv(s)
211 STR *s;
212 {
213 if (*s->str == '\\') {
214 s->equiv[0] = backslash(s);
215 if (*s->str != '=')
216 errx(1, "misplaced equivalence equals sign");
217 } else {
218 s->equiv[0] = s->str[0];
219 if (s->str[1] != '=')
220 errx(1, "misplaced equivalence equals sign");
221 }
222 s->str += 2;
223 s->cnt = 0;
224 s->state = SET;
225 s->set = s->equiv;
226 }
227
228 static int
229 genrange(s)
230 STR *s;
231 {
232 int stopval;
233 char *savestart;
234
235 savestart = s->str;
236 stopval = *++s->str == '\\' ? backslash(s) : *s->str++;
237 if (stopval < (u_char)s->lastch) {
238 s->str = savestart;
239 return (0);
240 }
241 s->cnt = stopval - s->lastch + 1;
242 s->state = RANGE;
243 --s->lastch;
244 return (1);
245 }
246
247 static void
248 genseq(s)
249 STR *s;
250 {
251 char *ep;
252
253 if (s->which == STRING1)
254 errx(1, "sequences only valid in string2");
255
256 if (*s->str == '\\')
257 s->lastch = backslash(s);
258 else
259 s->lastch = *s->str++;
260 if (*s->str != '*')
261 errx(1, "misplaced sequence asterisk");
262
263 switch (*++s->str) {
264 case '\\':
265 s->cnt = backslash(s);
266 break;
267 case ']':
268 s->cnt = 0;
269 ++s->str;
270 break;
271 default:
272 if (isdigit(*s->str)) {
273 s->cnt = strtol(s->str, &ep, 0);
274 if (*ep == ']') {
275 s->str = ep + 1;
276 break;
277 }
278 }
279 errx(1, "illegal sequence count");
280 /* NOTREACHED */
281 }
282
283 s->state = s->cnt ? SEQUENCE : INFINITE;
284 }
285
286 /*
287 * Translate \??? into a character. Up to 3 octal digits, if no digits either
288 * an escape code or a literal character.
289 */
290 static int
291 backslash(s)
292 STR *s;
293 {
294 int ch, cnt, val;
295
296 for (cnt = val = 0;;) {
297 ch = *++s->str;
298 if (!isascii(ch) || !isdigit(ch))
299 break;
300 val = val * 8 + ch - '0';
301 if (++cnt == 3) {
302 ++s->str;
303 break;
304 }
305 }
306 if (cnt)
307 return (val);
308 if (ch != '\0')
309 ++s->str;
310 switch (ch) {
311 case 'a': /* escape characters */
312 return ('\7');
313 case 'b':
314 return ('\b');
315 case 'f':
316 return ('\f');
317 case 'n':
318 return ('\n');
319 case 'r':
320 return ('\r');
321 case 't':
322 return ('\t');
323 case 'v':
324 return ('\13');
325 case '\0': /* \" -> \ */
326 s->state = EOS;
327 return ('\\');
328 default: /* \x" -> x */
329 return (ch);
330 }
331 }
332