regex.c revision 1.2 1 1.2 christos /* $NetBSD: regex.c,v 1.2 2018/08/12 13:02:37 christos Exp $ */
2 1.1 christos
3 1.1 christos /*
4 1.1 christos * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
5 1.1 christos *
6 1.1 christos * This Source Code Form is subject to the terms of the Mozilla Public
7 1.1 christos * License, v. 2.0. If a copy of the MPL was not distributed with this
8 1.1 christos * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 1.1 christos *
10 1.1 christos * See the COPYRIGHT file distributed with this work for additional
11 1.1 christos * information regarding copyright ownership.
12 1.1 christos */
13 1.1 christos
14 1.1 christos #include <config.h>
15 1.1 christos
16 1.1 christos #include <isc/file.h>
17 1.1 christos #include <isc/print.h>
18 1.1 christos #include <isc/regex.h>
19 1.1 christos #include <isc/string.h>
20 1.1 christos
/*
 * FAIL(why) abandons the current operation by jumping to a label named
 * 'error' in the enclosing function.  When VALREGEX_REPORT_REASON is
 * non-zero the text is first stored in a variable named 'reason', which
 * the enclosing function must declare; otherwise the argument is dropped.
 */
#if !VALREGEX_REPORT_REASON
#define FAIL(x) goto error
#else
#define FAIL(x) do { reason = (x); goto error; } while(/*CONSTCOND*/0)
#endif
26 1.1 christos
27 1.1 christos /*
28 1.1 christos * Validate the regular expression 'C' locale.
29 1.1 christos */
30 1.1 christos int
31 1.1 christos isc_regex_validate(const char *c) {
32 1.1 christos enum {
33 1.1 christos none, parse_bracket, parse_bound,
34 1.1 christos parse_ce, parse_ec, parse_cc
35 1.1 christos } state = none;
36 1.1 christos /* Well known character classes. */
37 1.1 christos const char *cc[] = {
38 1.1 christos ":alnum:", ":digit:", ":punct:", ":alpha:", ":graph:",
39 1.1 christos ":space:", ":blank:", ":lower:", ":upper:", ":cntrl:",
40 1.1 christos ":print:", ":xdigit:"
41 1.1 christos };
42 1.1 christos isc_boolean_t seen_comma = ISC_FALSE;
43 1.1 christos isc_boolean_t seen_high = ISC_FALSE;
44 1.1 christos isc_boolean_t seen_char = ISC_FALSE;
45 1.1 christos isc_boolean_t seen_ec = ISC_FALSE;
46 1.1 christos isc_boolean_t seen_ce = ISC_FALSE;
47 1.1 christos isc_boolean_t have_atom = ISC_FALSE;
48 1.1 christos int group = 0;
49 1.1 christos int range = 0;
50 1.1 christos int sub = 0;
51 1.1 christos isc_boolean_t empty_ok = ISC_FALSE;
52 1.1 christos isc_boolean_t neg = ISC_FALSE;
53 1.1 christos isc_boolean_t was_multiple = ISC_FALSE;
54 1.1 christos unsigned int low = 0;
55 1.1 christos unsigned int high = 0;
56 1.1 christos const char *ccname = NULL;
57 1.1 christos int range_start = 0;
58 1.1 christos #if VALREGEX_REPORT_REASON
59 1.1 christos const char *reason = "";
60 1.1 christos #endif
61 1.1 christos
62 1.1 christos if (c == NULL || *c == 0)
63 1.1 christos FAIL("empty string");
64 1.1 christos
65 1.1 christos while (c != NULL && *c != 0) {
66 1.1 christos switch (state) {
67 1.1 christos case none:
68 1.1 christos switch (*c) {
69 1.1 christos case '\\': /* make literal */
70 1.1 christos ++c;
71 1.1 christos switch (*c) {
72 1.1 christos case '1': case '2': case '3':
73 1.1 christos case '4': case '5': case '6':
74 1.1 christos case '7': case '8': case '9':
75 1.1 christos if ((*c - '0') > sub)
76 1.1 christos FAIL("bad back reference");
77 1.1 christos have_atom = ISC_TRUE;
78 1.1 christos was_multiple = ISC_FALSE;
79 1.1 christos break;
80 1.1 christos case 0:
81 1.1 christos FAIL("escaped end-of-string");
82 1.1 christos default:
83 1.1 christos goto literal;
84 1.1 christos }
85 1.1 christos ++c;
86 1.1 christos break;
87 1.1 christos case '[': /* bracket start */
88 1.1 christos ++c;
89 1.1 christos neg = ISC_FALSE;
90 1.1 christos was_multiple = ISC_FALSE;
91 1.1 christos seen_char = ISC_FALSE;
92 1.1 christos state = parse_bracket;
93 1.1 christos break;
94 1.1 christos case '{': /* bound start */
95 1.1 christos switch (c[1]) {
96 1.1 christos case '0': case '1': case '2': case '3':
97 1.1 christos case '4': case '5': case '6': case '7':
98 1.1 christos case '8': case '9':
99 1.1 christos if (!have_atom)
100 1.1 christos FAIL("no atom");
101 1.1 christos if (was_multiple)
102 1.1 christos FAIL("was multiple");
103 1.1 christos seen_comma = ISC_FALSE;
104 1.1 christos seen_high = ISC_FALSE;
105 1.1 christos low = high = 0;
106 1.1 christos state = parse_bound;
107 1.1 christos break;
108 1.1 christos default:
109 1.1 christos goto literal;
110 1.1 christos }
111 1.1 christos ++c;
112 1.1 christos have_atom = ISC_TRUE;
113 1.1 christos was_multiple = ISC_TRUE;
114 1.1 christos break;
115 1.1 christos case '}':
116 1.1 christos goto literal;
117 1.1 christos case '(': /* group start */
118 1.1 christos have_atom = ISC_FALSE;
119 1.1 christos was_multiple = ISC_FALSE;
120 1.1 christos empty_ok = ISC_TRUE;
121 1.1 christos ++group;
122 1.1 christos ++sub;
123 1.1 christos ++c;
124 1.1 christos break;
125 1.1 christos case ')': /* group end */
126 1.1 christos if (group && !have_atom && !empty_ok)
127 1.1 christos FAIL("empty alternative");
128 1.1 christos have_atom = ISC_TRUE;
129 1.1 christos was_multiple = ISC_FALSE;
130 1.1 christos if (group != 0)
131 1.1 christos --group;
132 1.1 christos ++c;
133 1.1 christos break;
134 1.1 christos case '|': /* alternative seperator */
135 1.1 christos if (!have_atom)
136 1.1 christos FAIL("no atom");
137 1.1 christos have_atom = ISC_FALSE;
138 1.1 christos empty_ok = ISC_FALSE;
139 1.1 christos was_multiple = ISC_FALSE;
140 1.1 christos ++c;
141 1.1 christos break;
142 1.1 christos case '^':
143 1.1 christos case '$':
144 1.1 christos have_atom = ISC_TRUE;
145 1.1 christos was_multiple = ISC_TRUE;
146 1.1 christos ++c;
147 1.1 christos break;
148 1.1 christos case '+':
149 1.1 christos case '*':
150 1.1 christos case '?':
151 1.1 christos if (was_multiple)
152 1.1 christos FAIL("was multiple");
153 1.1 christos if (!have_atom)
154 1.1 christos FAIL("no atom");
155 1.1 christos have_atom = ISC_TRUE;
156 1.1 christos was_multiple = ISC_TRUE;
157 1.1 christos ++c;
158 1.1 christos break;
159 1.1 christos case '.':
160 1.1 christos default:
161 1.1 christos literal:
162 1.1 christos have_atom = ISC_TRUE;
163 1.1 christos was_multiple = ISC_FALSE;
164 1.1 christos ++c;
165 1.1 christos break;
166 1.1 christos }
167 1.1 christos break;
168 1.1 christos case parse_bound:
169 1.1 christos switch (*c) {
170 1.1 christos case '0': case '1': case '2': case '3': case '4':
171 1.1 christos case '5': case '6': case '7': case '8': case '9':
172 1.1 christos if (!seen_comma) {
173 1.1 christos low = low * 10 + *c - '0';
174 1.1 christos if (low > 255)
175 1.1 christos FAIL("lower bound too big");
176 1.1 christos } else {
177 1.1 christos seen_high = ISC_TRUE;
178 1.1 christos high = high * 10 + *c - '0';
179 1.1 christos if (high > 255)
180 1.1 christos FAIL("upper bound too big");
181 1.1 christos }
182 1.1 christos ++c;
183 1.1 christos break;
184 1.1 christos case ',':
185 1.1 christos if (seen_comma)
186 1.1 christos FAIL("multiple commas");
187 1.1 christos seen_comma = ISC_TRUE;
188 1.1 christos ++c;
189 1.1 christos break;
190 1.1 christos default:
191 1.1 christos case '{':
192 1.1 christos FAIL("non digit/comma");
193 1.1 christos case '}':
194 1.1 christos if (seen_high && low > high)
195 1.1 christos FAIL("bad parse bound");
196 1.1 christos seen_comma = ISC_FALSE;
197 1.1 christos state = none;
198 1.1 christos ++c;
199 1.1 christos break;
200 1.1 christos }
201 1.1 christos break;
202 1.1 christos case parse_bracket:
203 1.1 christos switch (*c) {
204 1.1 christos case '^':
205 1.1 christos if (seen_char || neg) goto inside;
206 1.1 christos neg = ISC_TRUE;
207 1.1 christos ++c;
208 1.1 christos break;
209 1.1 christos case '-':
210 1.1 christos if (range == 2) goto inside;
211 1.1 christos if (!seen_char) goto inside;
212 1.1 christos if (range == 1)
213 1.1 christos FAIL("bad range");
214 1.1 christos range = 2;
215 1.1 christos ++c;
216 1.1 christos break;
217 1.1 christos case '[':
218 1.1 christos ++c;
219 1.1 christos switch (*c) {
220 1.1 christos case '.': /* collating element */
221 1.1 christos if (range != 0) --range;
222 1.1 christos ++c;
223 1.1 christos state = parse_ce;
224 1.1 christos seen_ce = ISC_FALSE;
225 1.1 christos break;
226 1.1 christos case '=': /* equivalence class */
227 1.1 christos if (range == 2)
228 1.1 christos FAIL("equivalence class in range");
229 1.1 christos ++c;
230 1.1 christos state = parse_ec;
231 1.1 christos seen_ec = ISC_FALSE;
232 1.1 christos break;
233 1.1 christos case ':': /* character class */
234 1.1 christos if (range == 2)
235 1.1 christos FAIL("character class in range");
236 1.1 christos ccname = c;
237 1.1 christos ++c;
238 1.1 christos state = parse_cc;
239 1.1 christos break;
240 1.1 christos }
241 1.1 christos seen_char = ISC_TRUE;
242 1.1 christos break;
243 1.1 christos case ']':
244 1.1 christos if (!c[1] && !seen_char)
245 1.1 christos FAIL("unfinished brace");
246 1.1 christos if (!seen_char)
247 1.1 christos goto inside;
248 1.1 christos ++c;
249 1.1 christos range = 0;
250 1.1 christos have_atom = ISC_TRUE;
251 1.1 christos state = none;
252 1.1 christos break;
253 1.1 christos default:
254 1.1 christos inside:
255 1.1 christos seen_char = ISC_TRUE;
256 1.1 christos if (range == 2 && (*c & 0xff) < range_start)
257 1.1 christos FAIL("out of order range");
258 1.1 christos if (range != 0)
259 1.1 christos --range;
260 1.1 christos range_start = *c & 0xff;
261 1.1 christos ++c;
262 1.1 christos break;
263 1.1 christos };
264 1.1 christos break;
265 1.1 christos case parse_ce:
266 1.1 christos switch (*c) {
267 1.1 christos case '.':
268 1.1 christos ++c;
269 1.1 christos switch (*c) {
270 1.1 christos case ']':
271 1.1 christos if (!seen_ce)
272 1.1 christos FAIL("empty ce");
273 1.1 christos ++c;
274 1.1 christos state = parse_bracket;
275 1.1 christos break;
276 1.1 christos default:
277 1.1 christos if (seen_ce)
278 1.1 christos range_start = 256;
279 1.1 christos else
280 1.1 christos range_start = '.';
281 1.1 christos seen_ce = ISC_TRUE;
282 1.1 christos break;
283 1.1 christos }
284 1.1 christos break;
285 1.1 christos default:
286 1.1 christos if (seen_ce)
287 1.1 christos range_start = 256;
288 1.1 christos else
289 1.1 christos range_start = *c;
290 1.1 christos seen_ce = ISC_TRUE;
291 1.1 christos ++c;
292 1.1 christos break;
293 1.1 christos }
294 1.1 christos break;
295 1.1 christos case parse_ec:
296 1.1 christos switch (*c) {
297 1.1 christos case '=':
298 1.1 christos ++c;
299 1.1 christos switch (*c) {
300 1.1 christos case ']':
301 1.1 christos if (!seen_ec)
302 1.1 christos FAIL("no ec");
303 1.1 christos ++c;
304 1.1 christos state = parse_bracket;
305 1.1 christos break;
306 1.1 christos default:
307 1.1 christos seen_ec = ISC_TRUE;
308 1.1 christos break;
309 1.1 christos }
310 1.1 christos break;
311 1.1 christos default:
312 1.1 christos seen_ec = ISC_TRUE;
313 1.1 christos ++c;
314 1.1 christos break;
315 1.1 christos }
316 1.1 christos break;
317 1.1 christos case parse_cc:
318 1.1 christos switch (*c) {
319 1.1 christos case ':':
320 1.1 christos ++c;
321 1.1 christos switch (*c) {
322 1.1 christos case ']': {
323 1.1 christos unsigned int i;
324 1.1 christos isc_boolean_t found = ISC_FALSE;
325 1.1 christos for (i = 0;
326 1.1 christos i < sizeof(cc)/sizeof(*cc);
327 1.1 christos i++)
328 1.1 christos {
329 1.1 christos unsigned int len;
330 1.1 christos len = strlen(cc[i]);
331 1.1 christos if (len !=
332 1.1 christos (unsigned int)(c - ccname))
333 1.1 christos continue;
334 1.1 christos if (strncmp(cc[i], ccname, len))
335 1.1 christos continue;
336 1.1 christos found = ISC_TRUE;
337 1.1 christos }
338 1.1 christos if (!found)
339 1.1 christos FAIL("unknown cc");
340 1.1 christos ++c;
341 1.1 christos state = parse_bracket;
342 1.1 christos break;
343 1.1 christos }
344 1.1 christos default:
345 1.1 christos break;
346 1.1 christos }
347 1.1 christos break;
348 1.1 christos default:
349 1.1 christos ++c;
350 1.1 christos break;
351 1.1 christos }
352 1.1 christos break;
353 1.1 christos }
354 1.1 christos }
355 1.1 christos if (group != 0)
356 1.1 christos FAIL("group open");
357 1.1 christos if (state != none)
358 1.1 christos FAIL("incomplete");
359 1.1 christos if (!have_atom)
360 1.1 christos FAIL("no atom");
361 1.1 christos return (sub);
362 1.1 christos
363 1.1 christos error:
364 1.1 christos #if VALREGEX_REPORT_REASON
365 1.1 christos fprintf(stderr, "%s\n", reason);
366 1.1 christos #endif
367 1.1 christos return (-1);
368 1.1 christos }
369