1 1.8 christos /* $NetBSD: regex.c,v 1.8 2025/01/26 16:25:38 christos Exp $ */ 2 1.1 christos 3 1.1 christos /* 4 1.1 christos * Copyright (C) Internet Systems Consortium, Inc. ("ISC") 5 1.1 christos * 6 1.7 christos * SPDX-License-Identifier: MPL-2.0 7 1.7 christos * 8 1.1 christos * This Source Code Form is subject to the terms of the Mozilla Public 9 1.1 christos * License, v. 2.0. If a copy of the MPL was not distributed with this 10 1.5 christos * file, you can obtain one at https://mozilla.org/MPL/2.0/. 11 1.1 christos * 12 1.1 christos * See the COPYRIGHT file distributed with this work for additional 13 1.1 christos * information regarding copyright ownership. 14 1.1 christos */ 15 1.1 christos 16 1.3 christos #include <stdbool.h> 17 1.3 christos 18 1.1 christos #include <isc/file.h> 19 1.1 christos #include <isc/regex.h> 20 1.1 christos #include <isc/string.h> 21 1.1 christos 22 1.1 christos #if VALREGEX_REPORT_REASON 23 1.4 christos #define FAIL(x) \ 24 1.4 christos do { \ 25 1.4 christos reason = (x); \ 26 1.4 christos goto error; \ 27 1.6 rillig } while (0) 28 1.4 christos #else /* if VALREGEX_REPORT_REASON */ 29 1.1 christos #define FAIL(x) goto error 30 1.4 christos #endif /* if VALREGEX_REPORT_REASON */ 31 1.1 christos 32 1.1 christos /* 33 1.1 christos * Validate the regular expression 'C' locale. 34 1.1 christos */ 35 1.1 christos int 36 1.1 christos isc_regex_validate(const char *c) { 37 1.5 christos enum { 38 1.5 christos none, 39 1.5 christos parse_bracket, 40 1.5 christos parse_bound, 41 1.5 christos parse_ce, 42 1.5 christos parse_ec, 43 1.5 christos parse_cc 44 1.5 christos } state = none; 45 1.1 christos /* Well known character classes. */ 46 1.4 christos const char *cc[] = { ":alnum:", ":digit:", ":punct:", ":alpha:", 47 1.4 christos ":graph:", ":space:", ":blank:", ":lower:", 48 1.4 christos ":upper:", ":cntrl:", ":print:", ":xdigit:" }; 49 1.3 christos bool seen_comma = false; 50 1.3 christos bool seen_high = false; 51 1.3 christos bool seen_char = false; 52 1.3 christos bool seen_ec = false; 53 1.3 christos bool seen_ce = false; 54 1.3 christos bool have_atom = false; 55 1.1 christos int group = 0; 56 1.1 christos int range = 0; 57 1.1 christos int sub = 0; 58 1.3 christos bool empty_ok = false; 59 1.3 christos bool neg = false; 60 1.3 christos bool was_multiple = false; 61 1.1 christos unsigned int low = 0; 62 1.1 christos unsigned int high = 0; 63 1.1 christos const char *ccname = NULL; 64 1.1 christos int range_start = 0; 65 1.1 christos #if VALREGEX_REPORT_REASON 66 1.1 christos const char *reason = ""; 67 1.4 christos #endif /* if VALREGEX_REPORT_REASON */ 68 1.1 christos 69 1.4 christos if (c == NULL || *c == 0) { 70 1.1 christos FAIL("empty string"); 71 1.4 christos } 72 1.1 christos 73 1.1 christos while (c != NULL && *c != 0) { 74 1.1 christos switch (state) { 75 1.1 christos case none: 76 1.1 christos switch (*c) { 77 1.4 christos case '\\': /* make literal */ 78 1.1 christos ++c; 79 1.1 christos switch (*c) { 80 1.4 christos case '1': 81 1.4 christos case '2': 82 1.4 christos case '3': 83 1.4 christos case '4': 84 1.4 christos case '5': 85 1.4 christos case '6': 86 1.4 christos case '7': 87 1.4 christos case '8': 88 1.4 christos case '9': 89 1.4 christos if ((*c - '0') > sub) { 90 1.1 christos FAIL("bad back reference"); 91 1.4 christos } 92 1.3 christos have_atom = true; 93 1.3 christos was_multiple = false; 94 1.1 christos break; 95 1.1 christos case 0: 96 1.1 christos FAIL("escaped end-of-string"); 97 1.1 christos default: 98 1.1 christos goto literal; 99 1.1 christos } 100 1.1 christos ++c; 101 1.1 christos break; 102 1.4 christos case '[': /* bracket start */ 103 1.1 christos ++c; 104 1.3 christos neg = false; 105 1.3 christos was_multiple = false; 106 1.3 christos seen_char = false; 107 1.1 christos state = parse_bracket; 108 1.1 christos break; 109 1.4 christos case '{': /* bound start */ 110 1.1 christos switch (c[1]) { 111 1.4 christos case '0': 112 1.4 christos case '1': 113 1.4 christos case '2': 114 1.4 christos case '3': 115 1.4 christos case '4': 116 1.4 christos case '5': 117 1.4 christos case '6': 118 1.4 christos case '7': 119 1.4 christos case '8': 120 1.4 christos case '9': 121 1.4 christos if (!have_atom) { 122 1.1 christos FAIL("no atom"); 123 1.4 christos } 124 1.4 christos if (was_multiple) { 125 1.1 christos FAIL("was multiple"); 126 1.4 christos } 127 1.3 christos seen_comma = false; 128 1.3 christos seen_high = false; 129 1.1 christos low = high = 0; 130 1.1 christos state = parse_bound; 131 1.1 christos break; 132 1.1 christos default: 133 1.1 christos goto literal; 134 1.1 christos } 135 1.1 christos ++c; 136 1.3 christos have_atom = true; 137 1.3 christos was_multiple = true; 138 1.1 christos break; 139 1.1 christos case '}': 140 1.1 christos goto literal; 141 1.4 christos case '(': /* group start */ 142 1.3 christos have_atom = false; 143 1.3 christos was_multiple = false; 144 1.3 christos empty_ok = true; 145 1.1 christos ++group; 146 1.1 christos ++sub; 147 1.1 christos ++c; 148 1.1 christos break; 149 1.4 christos case ')': /* group end */ 150 1.4 christos if (group && !have_atom && !empty_ok) { 151 1.1 christos FAIL("empty alternative"); 152 1.4 christos } 153 1.3 christos have_atom = true; 154 1.3 christos was_multiple = false; 155 1.4 christos if (group != 0) { 156 1.1 christos --group; 157 1.4 christos } 158 1.1 christos ++c; 159 1.1 christos break; 160 1.4 christos case '|': /* alternative separator */ 161 1.4 christos if (!have_atom) { 162 1.1 christos FAIL("no atom"); 163 1.4 christos } 164 1.3 christos have_atom = false; 165 1.3 christos empty_ok = false; 166 1.3 christos was_multiple = false; 167 1.1 christos ++c; 168 1.1 christos break; 169 1.1 christos case '^': 170 1.1 christos case '$': 171 1.3 christos have_atom = true; 172 1.3 christos was_multiple = true; 173 1.1 christos ++c; 174 1.1 christos break; 175 1.1 christos case '+': 176 1.1 christos case '*': 177 1.1 christos case '?': 178 1.4 christos if (was_multiple) { 179 1.1 christos FAIL("was multiple"); 180 1.4 christos } 181 1.4 christos if (!have_atom) { 182 1.1 christos FAIL("no atom"); 183 1.4 christos } 184 1.3 christos have_atom = true; 185 1.3 christos was_multiple = true; 186 1.1 christos ++c; 187 1.1 christos break; 188 1.1 christos case '.': 189 1.1 christos default: 190 1.1 christos literal: 191 1.3 christos have_atom = true; 192 1.3 christos was_multiple = false; 193 1.1 christos ++c; 194 1.1 christos break; 195 1.1 christos } 196 1.1 christos break; 197 1.1 christos case parse_bound: 198 1.1 christos switch (*c) { 199 1.4 christos case '0': 200 1.4 christos case '1': 201 1.4 christos case '2': 202 1.4 christos case '3': 203 1.4 christos case '4': 204 1.4 christos case '5': 205 1.4 christos case '6': 206 1.4 christos case '7': 207 1.4 christos case '8': 208 1.4 christos case '9': 209 1.1 christos if (!seen_comma) { 210 1.1 christos low = low * 10 + *c - '0'; 211 1.4 christos if (low > 255) { 212 1.1 christos FAIL("lower bound too big"); 213 1.4 christos } 214 1.1 christos } else { 215 1.3 christos seen_high = true; 216 1.1 christos high = high * 10 + *c - '0'; 217 1.4 christos if (high > 255) { 218 1.1 christos FAIL("upper bound too big"); 219 1.4 christos } 220 1.1 christos } 221 1.1 christos ++c; 222 1.1 christos break; 223 1.1 christos case ',': 224 1.4 christos if (seen_comma) { 225 1.1 christos FAIL("multiple commas"); 226 1.4 christos } 227 1.3 christos seen_comma = true; 228 1.1 christos ++c; 229 1.1 christos break; 230 1.1 christos default: 231 1.1 christos case '{': 232 1.1 christos FAIL("non digit/comma"); 233 1.1 christos case '}': 234 1.4 christos if (seen_high && low > high) { 235 1.1 christos FAIL("bad parse bound"); 236 1.4 christos } 237 1.3 christos seen_comma = false; 238 1.1 christos state = none; 239 1.1 christos ++c; 240 1.1 christos break; 241 1.1 christos } 242 1.1 christos break; 243 1.1 christos case parse_bracket: 244 1.1 christos switch (*c) { 245 1.1 christos case '^': 246 1.4 christos if (seen_char || neg) { 247 1.4 christos goto inside; 248 1.4 christos } 249 1.3 christos neg = true; 250 1.1 christos ++c; 251 1.1 christos break; 252 1.1 christos case '-': 253 1.4 christos if (range == 2) { 254 1.4 christos goto inside; 255 1.4 christos } 256 1.4 christos if (!seen_char) { 257 1.4 christos goto inside; 258 1.4 christos } 259 1.4 christos if (range == 1) { 260 1.1 christos FAIL("bad range"); 261 1.4 christos } 262 1.1 christos range = 2; 263 1.1 christos ++c; 264 1.1 christos break; 265 1.1 christos case '[': 266 1.1 christos ++c; 267 1.1 christos switch (*c) { 268 1.4 christos case '.': /* collating element */ 269 1.4 christos if (range != 0) { 270 1.4 christos --range; 271 1.4 christos } 272 1.1 christos ++c; 273 1.1 christos state = parse_ce; 274 1.3 christos seen_ce = false; 275 1.1 christos break; 276 1.4 christos case '=': /* equivalence class */ 277 1.4 christos if (range == 2) { 278 1.4 christos FAIL("equivalence class in " 279 1.4 christos "range"); 280 1.4 christos } 281 1.1 christos ++c; 282 1.1 christos state = parse_ec; 283 1.3 christos seen_ec = false; 284 1.1 christos break; 285 1.4 christos case ':': /* character class */ 286 1.4 christos if (range == 2) { 287 1.4 christos FAIL("character class in " 288 1.4 christos "range"); 289 1.4 christos } 290 1.1 christos ccname = c; 291 1.1 christos ++c; 292 1.1 christos state = parse_cc; 293 1.1 christos break; 294 1.1 christos } 295 1.3 christos seen_char = true; 296 1.1 christos break; 297 1.1 christos case ']': 298 1.4 christos if (!c[1] && !seen_char) { 299 1.1 christos FAIL("unfinished brace"); 300 1.4 christos } 301 1.4 christos if (!seen_char) { 302 1.1 christos goto inside; 303 1.4 christos } 304 1.1 christos ++c; 305 1.1 christos range = 0; 306 1.3 christos have_atom = true; 307 1.1 christos state = none; 308 1.1 christos break; 309 1.1 christos default: 310 1.1 christos inside: 311 1.3 christos seen_char = true; 312 1.4 christos if (range == 2 && (*c & 0xff) < range_start) { 313 1.1 christos FAIL("out of order range"); 314 1.4 christos } 315 1.4 christos if (range != 0) { 316 1.1 christos --range; 317 1.4 christos } 318 1.1 christos range_start = *c & 0xff; 319 1.1 christos ++c; 320 1.1 christos break; 321 1.4 christos } 322 1.1 christos break; 323 1.1 christos case parse_ce: 324 1.1 christos switch (*c) { 325 1.1 christos case '.': 326 1.1 christos ++c; 327 1.1 christos switch (*c) { 328 1.1 christos case ']': 329 1.4 christos if (!seen_ce) { 330 1.4 christos FAIL("empty ce"); 331 1.4 christos } 332 1.1 christos ++c; 333 1.1 christos state = parse_bracket; 334 1.1 christos break; 335 1.1 christos default: 336 1.4 christos if (seen_ce) { 337 1.1 christos range_start = 256; 338 1.4 christos } else { 339 1.1 christos range_start = '.'; 340 1.4 christos } 341 1.3 christos seen_ce = true; 342 1.1 christos break; 343 1.1 christos } 344 1.1 christos break; 345 1.1 christos default: 346 1.4 christos if (seen_ce) { 347 1.1 christos range_start = 256; 348 1.4 christos } else { 349 1.1 christos range_start = *c; 350 1.4 christos } 351 1.3 christos seen_ce = true; 352 1.1 christos ++c; 353 1.1 christos break; 354 1.1 christos } 355 1.1 christos break; 356 1.1 christos case parse_ec: 357 1.1 christos switch (*c) { 358 1.1 christos case '=': 359 1.1 christos ++c; 360 1.1 christos switch (*c) { 361 1.1 christos case ']': 362 1.4 christos if (!seen_ec) { 363 1.1 christos FAIL("no ec"); 364 1.4 christos } 365 1.1 christos ++c; 366 1.1 christos state = parse_bracket; 367 1.1 christos break; 368 1.1 christos default: 369 1.3 christos seen_ec = true; 370 1.1 christos break; 371 1.1 christos } 372 1.1 christos break; 373 1.1 christos default: 374 1.3 christos seen_ec = true; 375 1.1 christos ++c; 376 1.1 christos break; 377 1.1 christos } 378 1.1 christos break; 379 1.1 christos case parse_cc: 380 1.1 christos switch (*c) { 381 1.1 christos case ':': 382 1.1 christos ++c; 383 1.1 christos switch (*c) { 384 1.1 christos case ']': { 385 1.1 christos unsigned int i; 386 1.3 christos bool found = false; 387 1.1 christos for (i = 0; 388 1.4 christos i < sizeof(cc) / sizeof(*cc); i++) 389 1.1 christos { 390 1.1 christos unsigned int len; 391 1.1 christos len = strlen(cc[i]); 392 1.1 christos if (len != 393 1.1 christos (unsigned int)(c - ccname)) 394 1.4 christos { 395 1.1 christos continue; 396 1.4 christos } 397 1.1 christos if (strncmp(cc[i], ccname, len)) 398 1.4 christos { 399 1.1 christos continue; 400 1.4 christos } 401 1.3 christos found = true; 402 1.1 christos } 403 1.4 christos if (!found) { 404 1.1 christos FAIL("unknown cc"); 405 1.4 christos } 406 1.1 christos ++c; 407 1.1 christos state = parse_bracket; 408 1.1 christos break; 409 1.4 christos } 410 1.1 christos default: 411 1.1 christos break; 412 1.1 christos } 413 1.1 christos break; 414 1.1 christos default: 415 1.1 christos ++c; 416 1.1 christos break; 417 1.1 christos } 418 1.1 christos break; 419 1.1 christos } 420 1.1 christos } 421 1.4 christos if (group != 0) { 422 1.1 christos FAIL("group open"); 423 1.4 christos } 424 1.4 christos if (state != none) { 425 1.1 christos FAIL("incomplete"); 426 1.4 christos } 427 1.4 christos if (!have_atom) { 428 1.1 christos FAIL("no atom"); 429 1.4 christos } 430 1.8 christos return sub; 431 1.1 christos 432 1.4 christos error: 433 1.1 christos #if VALREGEX_REPORT_REASON 434 1.1 christos fprintf(stderr, "%s\n", reason); 435 1.4 christos #endif /* if VALREGEX_REPORT_REASON */ 436 1.8 christos return -1; 437 1.1 christos } 438