regex.c revision 1.1.1.6 1 /* $NetBSD: regex.c,v 1.1.1.6 2025/01/26 16:12:31 christos Exp $ */
2
3 /*
4 * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
5 *
6 * SPDX-License-Identifier: MPL-2.0
7 *
8 * This Source Code Form is subject to the terms of the Mozilla Public
9 * License, v. 2.0. If a copy of the MPL was not distributed with this
10 * file, you can obtain one at https://mozilla.org/MPL/2.0/.
11 *
12 * See the COPYRIGHT file distributed with this work for additional
13 * information regarding copyright ownership.
14 */
15
16 #include <stdbool.h>
17
18 #include <isc/file.h>
19 #include <isc/regex.h>
20 #include <isc/string.h>
21
#if VALREGEX_REPORT_REASON
/* Record why validation failed, then jump to the common failure exit. */
#define FAIL(x)               \
	do {                  \
		reason = (x); \
		goto error;   \
	} while (0)
#else /* if VALREGEX_REPORT_REASON */
/* Reasons are not reported: just jump to the common failure exit. */
#define FAIL(x) goto error
#endif /* if VALREGEX_REPORT_REASON */

/*
 * Validate the regular expression 'c' in the 'C' locale.
 *
 * The expression is scanned once, left to right, by a small state machine:
 * 'none' handles top-level syntax (atoms, groups, alternation, repetition
 * operators and back references); 'parse_bound' handles "{m,n}";
 * 'parse_bracket' handles "[...]"; and 'parse_ce', 'parse_ec' and
 * 'parse_cc' handle "[.x.]", "[=x=]" and "[:name:]" inside a bracket
 * expression.
 *
 * Returns the number of '(' groups opened in the expression on success,
 * or -1 if 'c' is NULL, empty, or malformed.  When VALREGEX_REPORT_REASON
 * is non-zero the reason for a failure is also written to stderr.
 */
int
isc_regex_validate(const char *c) {
	enum {
		none,
		parse_bracket,
		parse_bound,
		parse_ce,
		parse_ec,
		parse_cc
	} state = none;
	/* Well known character classes. */
	static const char *const cc[] = { ":alnum:", ":digit:", ":punct:",
					  ":alpha:", ":graph:", ":space:",
					  ":blank:", ":lower:", ":upper:",
					  ":cntrl:", ":print:", ":xdigit:" };
	bool seen_comma = false;   /* bound: ',' already consumed */
	bool seen_high = false;	   /* bound: a digit followed the ',' */
	bool seen_char = false;	   /* bracket: at least one member seen */
	bool seen_ec = false;	   /* equivalence class has content */
	bool seen_ce = false;	   /* collating element has content */
	bool have_atom = false;	   /* something a repetition can apply to */
	int group = 0;		   /* currently open '(' groups */
	/*
	 * Bracket range tracking: set to 2 when '-' follows a member,
	 * decremented as each following member is consumed, so 1 means a
	 * range end point has just been read.
	 */
	int range = 0;
	int sub = 0;		   /* total '(' seen; limit for back references */
	bool empty_ok = false;	   /* "()" is allowed, "(a|)" is not */
	bool neg = false;	   /* bracket started with '^' */
	bool was_multiple = false; /* previous item cannot be repeated again */
	unsigned int low = 0;	   /* bound: minimum count */
	unsigned int high = 0;	   /* bound: maximum count */
	const char *ccname = NULL; /* start of ":name" in a character class */
	int range_start = 0;	   /* value of the last bracket member, 0..256 */
#if VALREGEX_REPORT_REASON
	const char *reason = "";
#endif /* if VALREGEX_REPORT_REASON */

	if (c == NULL || *c == 0) {
		FAIL("empty string");
	}

	while (c != NULL && *c != 0) {
		switch (state) {
		case none:
			switch (*c) {
			case '\\': /* make literal */
				++c;
				switch (*c) {
				case '1':
				case '2':
				case '3':
				case '4':
				case '5':
				case '6':
				case '7':
				case '8':
				case '9':
					/*
					 * A back reference may only name a
					 * group that has already been opened.
					 */
					if ((*c - '0') > sub) {
						FAIL("bad back reference");
					}
					have_atom = true;
					was_multiple = false;
					break;
				case 0:
					FAIL("escaped end-of-string");
				default:
					goto literal;
				}
				++c;
				break;
			case '[': /* bracket start */
				++c;
				neg = false;
				was_multiple = false;
				seen_char = false;
				state = parse_bracket;
				break;
			case '{': /* bound start */
				/*
				 * Only "{<digit>" starts a bound; any other
				 * '{' is treated as a literal character.
				 */
				switch (c[1]) {
				case '0':
				case '1':
				case '2':
				case '3':
				case '4':
				case '5':
				case '6':
				case '7':
				case '8':
				case '9':
					if (!have_atom) {
						FAIL("no atom");
					}
					if (was_multiple) {
						FAIL("was multiple");
					}
					seen_comma = false;
					seen_high = false;
					low = high = 0;
					state = parse_bound;
					break;
				default:
					goto literal;
				}
				++c;
				have_atom = true;
				was_multiple = true;
				break;
			case '}':
				goto literal;
			case '(': /* group start */
				have_atom = false;
				was_multiple = false;
				empty_ok = true;
				++group;
				++sub;
				++c;
				break;
			case ')': /* group end */
				if (group && !have_atom && !empty_ok) {
					FAIL("empty alternative");
				}
				have_atom = true;
				was_multiple = false;
				/* An unmatched ')' does not drive the count negative. */
				if (group != 0) {
					--group;
				}
				++c;
				break;
			case '|': /* alternative separator */
				if (!have_atom) {
					FAIL("no atom");
				}
				have_atom = false;
				empty_ok = false;
				was_multiple = false;
				++c;
				break;
			case '^':
			case '$':
				/*
				 * Anchors are flagged as 'multiple' so that a
				 * repetition operator directly after one is
				 * rejected.
				 */
				have_atom = true;
				was_multiple = true;
				++c;
				break;
			case '+':
			case '*':
			case '?':
				if (was_multiple) {
					FAIL("was multiple");
				}
				if (!have_atom) {
					FAIL("no atom");
				}
				have_atom = true;
				was_multiple = true;
				++c;
				break;
			case '.':
			default:
			literal:
				have_atom = true;
				was_multiple = false;
				++c;
				break;
			}
			break;
		case parse_bound:
			switch (*c) {
			case '0':
			case '1':
			case '2':
			case '3':
			case '4':
			case '5':
			case '6':
			case '7':
			case '8':
			case '9':
				/* Digits before the ',' build 'low', after it 'high'. */
				if (!seen_comma) {
					low = low * 10 + *c - '0';
					if (low > 255) {
						FAIL("lower bound too big");
					}
				} else {
					seen_high = true;
					high = high * 10 + *c - '0';
					if (high > 255) {
						FAIL("upper bound too big");
					}
				}
				++c;
				break;
			case ',':
				if (seen_comma) {
					FAIL("multiple commas");
				}
				seen_comma = true;
				++c;
				break;
			default:
			case '{':
				FAIL("non digit/comma");
			case '}':
				/* "{m,}" has no upper limit, so only check "{m,n}". */
				if (seen_high && low > high) {
					FAIL("bad parse bound");
				}
				seen_comma = false;
				state = none;
				++c;
				break;
			}
			break;
		case parse_bracket:
			switch (*c) {
			case '^':
				/* Only a leading '^' negates; later ones are members. */
				if (seen_char || neg) {
					goto inside;
				}
				neg = true;
				++c;
				break;
			case '-':
				/* '-' as a range end point, or as the first member. */
				if (range == 2) {
					goto inside;
				}
				if (!seen_char) {
					goto inside;
				}
				/* "a-b-c": a range end cannot start another range. */
				if (range == 1) {
					FAIL("bad range");
				}
				range = 2;
				++c;
				break;
			case '[':
				++c;
				switch (*c) {
				case '.': /* collating element */
					if (range != 0) {
						--range;
					}
					++c;
					state = parse_ce;
					seen_ce = false;
					break;
				case '=': /* equivalence class */
					if (range == 2) {
						FAIL("equivalence class in "
						     "range");
					}
					++c;
					state = parse_ec;
					seen_ec = false;
					break;
				case ':': /* character class */
					if (range == 2) {
						FAIL("character class in "
						     "range");
					}
					/* Remember the leading ':' for the name lookup. */
					ccname = c;
					++c;
					state = parse_cc;
					break;
				default:
					/*
					 * Not a bracket sub-expression; the
					 * character after '[' is examined on
					 * the next iteration.
					 */
					break;
				}
				seen_char = true;
				break;
			case ']':
				if (!c[1] && !seen_char) {
					FAIL("unfinished brace");
				}
				/* A ']' before any other member is itself a member. */
				if (!seen_char) {
					goto inside;
				}
				++c;
				range = 0;
				have_atom = true;
				state = none;
				break;
			default:
			inside:
				seen_char = true;
				if (range == 2 && (*c & 0xff) < range_start) {
					FAIL("out of order range");
				}
				if (range != 0) {
					--range;
				}
				range_start = *c & 0xff;
				++c;
				break;
			}
			break;
		case parse_ce:
			switch (*c) {
			case '.':
				++c;
				switch (*c) {
				case ']':
					if (!seen_ce) {
						FAIL("empty ce");
					}
					++c;
					state = parse_bracket;
					break;
				default:
					/*
					 * The '.' was content rather than
					 * the terminator.  A second content
					 * character makes this a
					 * multi-character element: 256 is
					 * above every byte value, so any
					 * range end compared against it is
					 * out of order.
					 */
					if (seen_ce) {
						range_start = 256;
					} else {
						range_start = '.';
					}
					seen_ce = true;
					break;
				}
				break;
			default:
				if (seen_ce) {
					range_start = 256;
				} else {
					/*
					 * Mask to 0..255 like every other
					 * range start so that a byte with
					 * the high bit set does not become
					 * negative where 'char' is signed
					 * and defeat the range order check.
					 */
					range_start = *c & 0xff;
				}
				seen_ce = true;
				++c;
				break;
			}
			break;
		case parse_ec:
			switch (*c) {
			case '=':
				++c;
				switch (*c) {
				case ']':
					if (!seen_ec) {
						FAIL("no ec");
					}
					++c;
					state = parse_bracket;
					break;
				default:
					/* The '=' was content, not the terminator. */
					seen_ec = true;
					break;
				}
				break;
			default:
				seen_ec = true;
				++c;
				break;
			}
			break;
		case parse_cc:
			switch (*c) {
			case ':':
				++c;
				switch (*c) {
				case ']': {
					/*
					 * 'ccname' points at the opening ':'
					 * and 'c' just past the closing ':',
					 * so the span is ":name:" as stored
					 * in the table.
					 */
					size_t namelen = (size_t)(c - ccname);
					size_t i;
					bool found = false;
					for (i = 0;
					     i < sizeof(cc) / sizeof(*cc); i++)
					{
						size_t len = strlen(cc[i]);
						if (len != namelen) {
							continue;
						}
						if (strncmp(cc[i], ccname, len))
						{
							continue;
						}
						found = true;
						break;
					}
					if (!found) {
						FAIL("unknown cc");
					}
					++c;
					state = parse_bracket;
					break;
				}
				default:
					break;
				}
				break;
			default:
				++c;
				break;
			}
			break;
		}
	}
	if (group != 0) {
		FAIL("group open");
	}
	if (state != none) {
		FAIL("incomplete");
	}
	if (!have_atom) {
		FAIL("no atom");
	}
	return sub;

error:
#if VALREGEX_REPORT_REASON
	fprintf(stderr, "%s\n", reason);
#endif /* if VALREGEX_REPORT_REASON */
	return -1;
}
438