regex.c revision 1.3 1 /* $NetBSD: regex.c,v 1.3 2019/01/09 16:55:14 christos Exp $ */
2
3 /*
4 * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
5 *
6 * This Source Code Form is subject to the terms of the Mozilla Public
7 * License, v. 2.0. If a copy of the MPL was not distributed with this
8 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 *
10 * See the COPYRIGHT file distributed with this work for additional
11 * information regarding copyright ownership.
12 */
13
14 #include <config.h>
15
16 #include <stdbool.h>
17
18 #include <isc/file.h>
19 #include <isc/print.h>
20 #include <isc/regex.h>
21 #include <isc/string.h>
22
/*
 * FAIL(reason) abandons validation by jumping to the 'error' label of
 * isc_regex_validate().  When VALREGEX_REPORT_REASON is enabled the
 * reason string is recorded first so that it can be printed there.
 */
#if VALREGEX_REPORT_REASON
#define FAIL(x) do { reason = (x); goto error; } while(/*CONSTCOND*/0)
#else
#define FAIL(x) goto error
#endif

/*
 * Validate the regular expression 'c' ('C' locale).
 *
 * The string is scanned once by a small state machine.  'state' says
 * whether we are at the top level (none), inside a bracket expression
 * (parse_bracket), inside an interval "{m,n}" (parse_bound), or inside
 * a "[. .]" collating element (parse_ce), "[= =]" equivalence class
 * (parse_ec) or "[: :]" character class (parse_cc) within a bracket
 * expression.
 *
 * Returns the number of '(' groups seen (the value back references are
 * checked against) if the expression is accepted, or -1 if 'c' is NULL,
 * empty or rejected.  When VALREGEX_REPORT_REASON is enabled the reason
 * for a rejection is also written to stderr.
 */
int
isc_regex_validate(const char *c) {
	enum {
		none, parse_bracket, parse_bound,
		parse_ce, parse_ec, parse_cc
	} state = none;
	/* Well known character classes. */
	static const char *const cc[] = {
		":alnum:", ":digit:", ":punct:", ":alpha:", ":graph:",
		":space:", ":blank:", ":lower:", ":upper:", ":cntrl:",
		":print:", ":xdigit:"
	};
	bool seen_comma = false;	/* ',' seen in the current bound */
	bool seen_high = false;		/* upper limit digits seen in bound */
	bool seen_char = false;		/* bracket has at least one member */
	bool seen_ec = false;		/* equivalence class is non-empty */
	bool seen_ce = false;		/* collating element is non-empty */
	bool have_atom = false;		/* something a repetition can bind to */
	int group = 0;			/* currently open '(' groups */
	int range = 0;			/* 2: just saw '-', 1: range end consumed */
	int sub = 0;			/* total '(' groups seen so far */
	bool empty_ok = false;		/* "()" is allowed, "(a|)" is not */
	bool neg = false;		/* bracket started with '^' */
	bool was_multiple = false;	/* previous item was a repetition */
	unsigned int low = 0;		/* lower limit of the current bound */
	unsigned int high = 0;		/* upper limit of the current bound */
	const char *ccname = NULL;	/* first ':' of the current "[:name:]" */
	int range_start = 0;		/* 0..255 value of a range start, or 256 */
#if VALREGEX_REPORT_REASON
	const char *reason = "";
#endif

	if (c == NULL || *c == 0) {
		FAIL("empty string");
	}

	/* 'c' is known to be non-NULL here and is only ever advanced. */
	while (*c != 0) {
		switch (state) {
		case none:
			switch (*c) {
			case '\\':	/* make literal */
				++c;
				switch (*c) {
				case '1': case '2': case '3':
				case '4': case '5': case '6':
				case '7': case '8': case '9':
					/* Back reference: group must exist. */
					if ((*c - '0') > sub) {
						FAIL("bad back reference");
					}
					have_atom = true;
					was_multiple = false;
					break;
				case 0:
					FAIL("escaped end-of-string");
				default:
					goto literal;
				}
				++c;
				break;
			case '[':	/* bracket start */
				++c;
				neg = false;
				was_multiple = false;
				seen_char = false;
				state = parse_bracket;
				break;
			case '{':	/* bound start */
				/*
				 * Only "{<digit>" opens a bound; any other
				 * '{' is an ordinary character.
				 */
				switch (c[1]) {
				case '0': case '1': case '2': case '3':
				case '4': case '5': case '6': case '7':
				case '8': case '9':
					if (!have_atom) {
						FAIL("no atom");
					}
					if (was_multiple) {
						FAIL("was multiple");
					}
					seen_comma = false;
					seen_high = false;
					low = high = 0;
					state = parse_bound;
					break;
				default:
					goto literal;
				}
				++c;
				have_atom = true;
				was_multiple = true;
				break;
			case '}':
				goto literal;
			case '(':	/* group start */
				have_atom = false;
				was_multiple = false;
				empty_ok = true;
				++group;
				++sub;
				++c;
				break;
			case ')':	/* group end */
				if (group && !have_atom && !empty_ok) {
					FAIL("empty alternative");
				}
				have_atom = true;
				was_multiple = false;
				/* An unmatched ')' is an ordinary character. */
				if (group != 0) {
					--group;
				}
				++c;
				break;
			case '|':	/* alternative separator */
				if (!have_atom) {
					FAIL("no atom");
				}
				have_atom = false;
				empty_ok = false;
				was_multiple = false;
				++c;
				break;
			case '^':
			case '$':
				/* Anchors may not be followed by a repetition. */
				have_atom = true;
				was_multiple = true;
				++c;
				break;
			case '+':
			case '*':
			case '?':
				if (was_multiple) {
					FAIL("was multiple");
				}
				if (!have_atom) {
					FAIL("no atom");
				}
				have_atom = true;
				was_multiple = true;
				++c;
				break;
			case '.':
			default:
			literal:
				have_atom = true;
				was_multiple = false;
				++c;
				break;
			}
			break;
		case parse_bound:
			switch (*c) {
			case '0': case '1': case '2': case '3': case '4':
			case '5': case '6': case '7': case '8': case '9':
				if (!seen_comma) {
					low = low * 10 + *c - '0';
					if (low > 255) {
						FAIL("lower bound too big");
					}
				} else {
					seen_high = true;
					high = high * 10 + *c - '0';
					if (high > 255) {
						FAIL("upper bound too big");
					}
				}
				++c;
				break;
			case ',':
				if (seen_comma) {
					FAIL("multiple commas");
				}
				seen_comma = true;
				++c;
				break;
			case '}':
				if (seen_high && low > high) {
					FAIL("bad parse bound");
				}
				seen_comma = false;
				state = none;
				++c;
				break;
			case '{':
			default:
				FAIL("non digit/comma");
			}
			break;
		case parse_bracket:
			switch (*c) {
			case '^':
				/* Only a leading '^' negates the set. */
				if (seen_char || neg) {
					goto inside;
				}
				neg = true;
				++c;
				break;
			case '-':
				/* '-' is literal as a range end or first member. */
				if (range == 2) {
					goto inside;
				}
				if (!seen_char) {
					goto inside;
				}
				if (range == 1) {
					FAIL("bad range");
				}
				range = 2;
				++c;
				break;
			case '[':
				++c;
				switch (*c) {
				case '.':	/* collating element */
					if (range != 0) {
						--range;
					}
					++c;
					state = parse_ce;
					seen_ce = false;
					break;
				case '=':	/* equivalence class */
					if (range == 2) {
						FAIL("equivalence class in range");
					}
					++c;
					state = parse_ec;
					seen_ec = false;
					break;
				case ':':	/* character class */
					if (range == 2) {
						FAIL("character class in range");
					}
					ccname = c;
					++c;
					state = parse_cc;
					break;
				default:
					/* The '[' was an ordinary member. */
					break;
				}
				seen_char = true;
				break;
			case ']':
				if (!c[1] && !seen_char) {
					FAIL("unfinished brace");
				}
				/* A ']' right after the opening is literal. */
				if (!seen_char) {
					goto inside;
				}
				++c;
				range = 0;
				have_atom = true;
				state = none;
				break;
			default:
			inside:
				seen_char = true;
				if (range == 2 && (*c & 0xff) < range_start) {
					FAIL("out of order range");
				}
				if (range != 0) {
					--range;
				}
				range_start = *c & 0xff;
				++c;
				break;
			}
			break;
		case parse_ce:
			switch (*c) {
			case '.':
				++c;
				switch (*c) {
				case ']':
					if (!seen_ce) {
						FAIL("empty ce");
					}
					++c;
					state = parse_bracket;
					break;
				default:
					/* The '.' was part of the element. */
					if (seen_ce) {
						range_start = 256;
					} else {
						range_start = '.';
					}
					seen_ce = true;
					break;
				}
				break;
			default:
				/*
				 * A multi-character element gets 256 so that
				 * no single byte can follow it as a range end.
				 * Mask the byte exactly as the plain bracket
				 * path does; a bare (possibly signed) char
				 * would make the range-order check depend on
				 * the platform's char signedness.
				 */
				if (seen_ce) {
					range_start = 256;
				} else {
					range_start = *c & 0xff;
				}
				seen_ce = true;
				++c;
				break;
			}
			break;
		case parse_ec:
			switch (*c) {
			case '=':
				++c;
				switch (*c) {
				case ']':
					if (!seen_ec) {
						FAIL("no ec");
					}
					++c;
					state = parse_bracket;
					break;
				default:
					seen_ec = true;
					break;
				}
				break;
			default:
				seen_ec = true;
				++c;
				break;
			}
			break;
		case parse_cc:
			switch (*c) {
			case ':':
				++c;
				switch (*c) {
				case ']': {
					/*
					 * [ccname, c) spans ":name:"; it must
					 * match one of the known classes.
					 */
					size_t namelen = (size_t)(c - ccname);
					size_t i;
					bool found = false;

					for (i = 0; i < sizeof(cc) / sizeof(cc[0]);
					     i++)
					{
						if (strlen(cc[i]) == namelen &&
						    strncmp(cc[i], ccname,
							    namelen) == 0)
						{
							found = true;
							break;
						}
					}
					if (!found) {
						FAIL("unknown cc");
					}
					++c;
					state = parse_bracket;
					break;
				}
				default:
					break;
				}
				break;
			default:
				++c;
				break;
			}
			break;
		}
	}
	if (group != 0) {
		FAIL("group open");
	}
	if (state != none) {
		FAIL("incomplete");
	}
	if (!have_atom) {
		FAIL("no atom");
	}
	return (sub);

 error:
#if VALREGEX_REPORT_REASON
	fprintf(stderr, "%s\n", reason);
#endif
	return (-1);
}
371