regex.c revision 1.1.1.4 1 /* $NetBSD: regex.c,v 1.1.1.4 2021/02/19 16:37:16 christos Exp $ */
2
3 /*
4 * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
5 *
6 * This Source Code Form is subject to the terms of the Mozilla Public
7 * License, v. 2.0. If a copy of the MPL was not distributed with this
8 * file, you can obtain one at https://mozilla.org/MPL/2.0/.
9 *
10 * See the COPYRIGHT file distributed with this work for additional
11 * information regarding copyright ownership.
12 */
13
14 #include <stdbool.h>
15
16 #include <isc/file.h>
17 #include <isc/print.h>
18 #include <isc/regex.h>
19 #include <isc/string.h>
20
/*
 * FAIL() abandons validation by jumping to the 'error' label of the
 * enclosing function.  When VALREGEX_REPORT_REASON is non-zero the reason
 * string is stored in the local 'reason' first so it can be printed on
 * the way out; otherwise the argument is discarded.
 */
#if VALREGEX_REPORT_REASON
#define FAIL(x)               \
	do {                  \
		reason = (x); \
		goto error;   \
	} while (0)
#else /* if VALREGEX_REPORT_REASON */
#define FAIL(x) goto error
#endif /* if VALREGEX_REPORT_REASON */

/*
 * Validate the regular expression 'c' ('C' locale).
 *
 * The expression is scanned once, byte by byte, with a small state machine
 * that tracks whether the scanner is at the top level, inside a bracket
 * expression, inside a '{m,n}' bound, or inside a '[. .]', '[= =]' or
 * '[: :]' construct nested in a bracket expression.
 *
 * Returns:
 *	the number of '(' groups opened in the expression (>= 0) if the
 *	expression is accepted;
 *	-1 if 'c' is NULL, empty, or rejected.  When VALREGEX_REPORT_REASON
 *	is non-zero, a one line reason is also written to stderr.
 */
int
isc_regex_validate(const char *c) {
	enum {
		none,
		parse_bracket,
		parse_bound,
		parse_ce,
		parse_ec,
		parse_cc
	} state = none;
	/* Well known character classes. */
	static const char *const cc[] = {
		":alnum:", ":digit:", ":punct:", ":alpha:",
		":graph:", ":space:", ":blank:", ":lower:",
		":upper:", ":cntrl:", ":print:", ":xdigit:"
	};
	bool seen_comma = false;
	bool seen_high = false;
	bool seen_char = false;
	bool seen_ec = false;
	bool seen_ce = false;
	bool have_atom = false;
	int group = 0;
	/*
	 * Range tracking inside a bracket expression:
	 *   2 - a '-' was just consumed, the range end point is expected;
	 *   1 - a range end point was just consumed (another '-' is an error);
	 *   0 - no range in progress.
	 */
	int range = 0;
	int sub = 0;
	bool empty_ok = false;
	bool neg = false;
	bool was_multiple = false;
	unsigned int low = 0;
	unsigned int high = 0;
	const char *ccname = NULL;
	int range_start = 0;
#if VALREGEX_REPORT_REASON
	const char *reason = "";
#endif /* if VALREGEX_REPORT_REASON */

	if (c == NULL || *c == 0) {
		FAIL("empty string");
	}

	while (*c != 0) {
		switch (state) {
		case none:
			switch (*c) {
			case '\\': /* make literal */
				++c;
				switch (*c) {
				case '1':
				case '2':
				case '3':
				case '4':
				case '5':
				case '6':
				case '7':
				case '8':
				case '9':
					/*
					 * A back reference may not name a
					 * group that has not been opened yet.
					 */
					if ((*c - '0') > sub) {
						FAIL("bad back reference");
					}
					have_atom = true;
					was_multiple = false;
					break;
				case 0:
					FAIL("escaped end-of-string");
				default:
					goto literal;
				}
				++c;
				break;
			case '[': /* bracket start */
				++c;
				neg = false;
				was_multiple = false;
				seen_char = false;
				state = parse_bracket;
				break;
			case '{': /* bound start */
				/*
				 * Only '{' followed by a digit opens a bound;
				 * anything else is an ordinary character.
				 */
				switch (c[1]) {
				case '0':
				case '1':
				case '2':
				case '3':
				case '4':
				case '5':
				case '6':
				case '7':
				case '8':
				case '9':
					if (!have_atom) {
						FAIL("no atom");
					}
					if (was_multiple) {
						FAIL("was multiple");
					}
					seen_comma = false;
					seen_high = false;
					low = high = 0;
					state = parse_bound;
					break;
				default:
					goto literal;
				}
				++c;
				have_atom = true;
				was_multiple = true;
				break;
			case '}':
				goto literal;
			case '(': /* group start */
				have_atom = false;
				was_multiple = false;
				empty_ok = true;
				++group;
				++sub;
				++c;
				break;
			case ')': /* group end */
				if (group && !have_atom && !empty_ok) {
					FAIL("empty alternative");
				}
				have_atom = true;
				was_multiple = false;
				if (group != 0) {
					--group;
				}
				++c;
				break;
			case '|': /* alternative separator */
				if (!have_atom) {
					FAIL("no atom");
				}
				have_atom = false;
				empty_ok = false;
				was_multiple = false;
				++c;
				break;
			case '^':
			case '$':
				/*
				 * Anchors count as an atom but, like a
				 * repetition, may not be followed by one.
				 */
				have_atom = true;
				was_multiple = true;
				++c;
				break;
			case '+':
			case '*':
			case '?':
				if (was_multiple) {
					FAIL("was multiple");
				}
				if (!have_atom) {
					FAIL("no atom");
				}
				have_atom = true;
				was_multiple = true;
				++c;
				break;
			case '.':
			default:
			literal:
				have_atom = true;
				was_multiple = false;
				++c;
				break;
			}
			break;
		case parse_bound:
			switch (*c) {
			case '0':
			case '1':
			case '2':
			case '3':
			case '4':
			case '5':
			case '6':
			case '7':
			case '8':
			case '9':
				/* Accumulate the bound; values above 255 fail. */
				if (!seen_comma) {
					low = low * 10 + *c - '0';
					if (low > 255) {
						FAIL("lower bound too big");
					}
				} else {
					seen_high = true;
					high = high * 10 + *c - '0';
					if (high > 255) {
						FAIL("upper bound too big");
					}
				}
				++c;
				break;
			case ',':
				if (seen_comma) {
					FAIL("multiple commas");
				}
				seen_comma = true;
				++c;
				break;
			default:
			case '{':
				FAIL("non digit/comma");
			case '}':
				if (seen_high && low > high) {
					FAIL("bad parse bound");
				}
				seen_comma = false;
				state = none;
				++c;
				break;
			}
			break;
		case parse_bracket:
			switch (*c) {
			case '^':
				/* '^' negates only as the very first character. */
				if (seen_char || neg) {
					goto inside;
				}
				neg = true;
				++c;
				break;
			case '-':
				/*
				 * '-' is an ordinary member when it is the range
				 * end point or the first character; directly
				 * after a completed range it is an error.
				 */
				if (range == 2) {
					goto inside;
				}
				if (!seen_char) {
					goto inside;
				}
				if (range == 1) {
					FAIL("bad range");
				}
				range = 2;
				++c;
				break;
			case '[':
				++c;
				switch (*c) {
				case '.': /* collating element */
					if (range != 0) {
						--range;
					}
					++c;
					state = parse_ce;
					seen_ce = false;
					break;
				case '=': /* equivalence class */
					if (range == 2) {
						FAIL("equivalence class in "
						     "range");
					}
					++c;
					state = parse_ec;
					seen_ec = false;
					break;
				case ':': /* character class */
					if (range == 2) {
						FAIL("character class in "
						     "range");
					}
					/*
					 * Remember the leading ':' so that the
					 * whole ":name:" can be compared later.
					 */
					ccname = c;
					++c;
					state = parse_cc;
					break;
				default:
					/*
					 * Plain '[': the following byte is left
					 * for the next iteration to handle.
					 */
					break;
				}
				seen_char = true;
				break;
			case ']':
				if (!c[1] && !seen_char) {
					FAIL("unfinished brace");
				}
				/* A leading ']' is an ordinary member. */
				if (!seen_char) {
					goto inside;
				}
				++c;
				range = 0;
				have_atom = true;
				state = none;
				break;
			default:
			inside:
				seen_char = true;
				if (range == 2 && (*c & 0xff) < range_start) {
					FAIL("out of order range");
				}
				if (range != 0) {
					--range;
				}
				range_start = *c & 0xff;
				++c;
				break;
			}
			break;
		case parse_ce:
			switch (*c) {
			case '.':
				++c;
				switch (*c) {
				case ']':
					if (!seen_ce) {
						FAIL("empty ce");
					}
					++c;
					state = parse_bracket;
					break;
				default:
					/*
					 * The '.' was part of the element
					 * itself.  An element longer than one
					 * byte gets the value 256, which is
					 * above every single byte value.
					 */
					if (seen_ce) {
						range_start = 256;
					} else {
						range_start = '.';
					}
					seen_ce = true;
					break;
				}
				break;
			default:
				if (seen_ce) {
					range_start = 256;
				} else {
					/*
					 * Mask to 0..255 exactly as the bracket
					 * state does, so that bytes with the
					 * high bit set compare correctly when
					 * 'char' is signed.
					 */
					range_start = *c & 0xff;
				}
				seen_ce = true;
				++c;
				break;
			}
			break;
		case parse_ec:
			switch (*c) {
			case '=':
				++c;
				switch (*c) {
				case ']':
					if (!seen_ec) {
						FAIL("no ec");
					}
					++c;
					state = parse_bracket;
					break;
				default:
					seen_ec = true;
					break;
				}
				break;
			default:
				seen_ec = true;
				++c;
				break;
			}
			break;
		case parse_cc:
			switch (*c) {
			case ':':
				++c;
				switch (*c) {
				case ']': {
					/*
					 * 'ccname' points at the opening ':' and
					 * 'c' just past the closing ':', so the
					 * span is the full ":name:" token.
					 */
					size_t namelen = (size_t)(c - ccname);
					size_t i;
					bool found = false;

					for (i = 0; i < sizeof(cc) / sizeof(*cc);
					     i++) {
						if (strlen(cc[i]) == namelen &&
						    strncmp(cc[i], ccname,
							    namelen) == 0)
						{
							found = true;
							break;
						}
					}
					if (!found) {
						FAIL("unknown cc");
					}
					++c;
					state = parse_bracket;
					break;
				}
				default:
					break;
				}
				break;
			default:
				++c;
				break;
			}
			break;
		}
	}
	if (group != 0) {
		FAIL("group open");
	}
	if (state != none) {
		FAIL("incomplete");
	}
	if (!have_atom) {
		FAIL("no atom");
	}
	return (sub);

error:
#if VALREGEX_REPORT_REASON
	fprintf(stderr, "%s\n", reason);
#endif /* if VALREGEX_REPORT_REASON */
	return (-1);
}
437