Home | History | Annotate | Line # | Download | only in isc
      1  1.8  christos /*	$NetBSD: regex.c,v 1.8 2025/01/26 16:25:38 christos Exp $	*/
      2  1.1  christos 
      3  1.1  christos /*
      4  1.1  christos  * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
      5  1.1  christos  *
      6  1.7  christos  * SPDX-License-Identifier: MPL-2.0
      7  1.7  christos  *
      8  1.1  christos  * This Source Code Form is subject to the terms of the Mozilla Public
      9  1.1  christos  * License, v. 2.0. If a copy of the MPL was not distributed with this
     10  1.5  christos  * file, you can obtain one at https://mozilla.org/MPL/2.0/.
     11  1.1  christos  *
     12  1.1  christos  * See the COPYRIGHT file distributed with this work for additional
     13  1.1  christos  * information regarding copyright ownership.
     14  1.1  christos  */
     15  1.1  christos 
     16  1.3  christos #include <stdbool.h>
     17  1.3  christos 
     18  1.1  christos #include <isc/file.h>
     19  1.1  christos #include <isc/regex.h>
     20  1.1  christos #include <isc/string.h>
     21  1.1  christos 
#if VALREGEX_REPORT_REASON
/*
 * Record why validation failed, then jump to the shared error exit of
 * isc_regex_validate().  Only usable inside that function: it relies on
 * the local 'reason' variable and the 'error' label.
 */
#define FAIL(x)               \
	do {                  \
		reason = (x); \
		goto error;   \
	} while (0)
#else /* if VALREGEX_REPORT_REASON */
/* Reasons are compiled out: just jump to the shared error exit. */
#define FAIL(x) goto error
#endif /* if VALREGEX_REPORT_REASON */

/*
 * Validate the regular expression 'c', byte-wise ('C' locale).
 *
 * The expression is scanned once by a small state machine that tracks
 * whether it is in plain text, inside a bracket expression "[...]",
 * inside a bound "{m,n}", or inside one of the bracket sub-forms
 * "[.x.]" (collating element), "[=x=]" (equivalence class) or
 * "[:name:]" (character class).
 *
 * Returns the number of '(' groups seen on success, or -1 if 'c' is
 * NULL, empty, or rejected.  When VALREGEX_REPORT_REASON is enabled the
 * reason for a rejection is printed to stderr.
 */
int
isc_regex_validate(const char *c) {
	enum {
		none,
		parse_bracket,
		parse_bound,
		parse_ce,
		parse_ec,
		parse_cc
	} state = none;
	/* Well known character classes. */
	static const char *const cc[] = {
		":alnum:", ":digit:", ":punct:", ":alpha:",
		":graph:", ":space:", ":blank:", ":lower:",
		":upper:", ":cntrl:", ":print:", ":xdigit:"
	};
	bool seen_comma = false; /* ',' seen in the current bound */
	bool seen_high = false;	 /* upper limit digits seen in the bound */
	bool seen_char = false;	 /* bracket already has a member */
	bool seen_ec = false;	 /* equivalence class has content */
	bool seen_ce = false;	 /* collating element has content */
	bool have_atom = false;	 /* something repeatable precedes us */
	int group = 0;		 /* currently open '(' groups */
	int range = 0;		 /* 2: just saw '-', 1: range just ended */
	int sub = 0;		 /* total '(' seen; bounds back references */
	bool empty_ok = false;	 /* "()" is fine, "(a|)" is not */
	bool neg = false;	 /* bracket started with '^' */
	bool was_multiple = false; /* previous item was a repetition/anchor */
	unsigned int low = 0;
	unsigned int high = 0;
	const char *ccname = NULL; /* first ':' of "[:name:]" */
	int range_start = 0;	   /* byte value (0..255) a range starts at */
#if VALREGEX_REPORT_REASON
	const char *reason = "";
#endif /* if VALREGEX_REPORT_REASON */

	if (c == NULL || *c == 0) {
		FAIL("empty string");
	}

	while (*c != 0) {
		switch (state) {
		case none:
			switch (*c) {
			case '\\': /* make literal */
				++c;
				switch (*c) {
				case '1':
				case '2':
				case '3':
				case '4':
				case '5':
				case '6':
				case '7':
				case '8':
				case '9':
					/* May only refer to a group seen so far. */
					if ((*c - '0') > sub) {
						FAIL("bad back reference");
					}
					have_atom = true;
					was_multiple = false;
					break;
				case 0:
					FAIL("escaped end-of-string");
				default:
					goto literal;
				}
				++c;
				break;
			case '[': /* bracket start */
				++c;
				neg = false;
				was_multiple = false;
				seen_char = false;
				state = parse_bracket;
				break;
			case '{': /* bound start */
				/* '{' only opens a bound when a digit follows. */
				switch (c[1]) {
				case '0':
				case '1':
				case '2':
				case '3':
				case '4':
				case '5':
				case '6':
				case '7':
				case '8':
				case '9':
					if (!have_atom) {
						FAIL("no atom");
					}
					if (was_multiple) {
						FAIL("was multiple");
					}
					seen_comma = false;
					seen_high = false;
					low = high = 0;
					state = parse_bound;
					break;
				default:
					goto literal;
				}
				++c;
				have_atom = true;
				was_multiple = true;
				break;
			case '}':
				goto literal;
			case '(': /* group start */
				have_atom = false;
				was_multiple = false;
				empty_ok = true;
				++group;
				++sub;
				++c;
				break;
			case ')': /* group end */
				if (group && !have_atom && !empty_ok) {
					FAIL("empty alternative");
				}
				have_atom = true;
				was_multiple = false;
				/* An unmatched ')' is accepted as-is. */
				if (group != 0) {
					--group;
				}
				++c;
				break;
			case '|': /* alternative separator */
				if (!have_atom) {
					FAIL("no atom");
				}
				have_atom = false;
				empty_ok = false;
				was_multiple = false;
				++c;
				break;
			case '^':
			case '$':
				/* Anchors cannot be repeated. */
				have_atom = true;
				was_multiple = true;
				++c;
				break;
			case '+':
			case '*':
			case '?':
				if (was_multiple) {
					FAIL("was multiple");
				}
				if (!have_atom) {
					FAIL("no atom");
				}
				have_atom = true;
				was_multiple = true;
				++c;
				break;
			case '.':
			default:
			literal:
				have_atom = true;
				was_multiple = false;
				++c;
				break;
			}
			break;
		case parse_bound:
			switch (*c) {
			case '0':
			case '1':
			case '2':
			case '3':
			case '4':
			case '5':
			case '6':
			case '7':
			case '8':
			case '9':
				/*
				 * Checking against 255 after every digit also
				 * keeps the accumulators from overflowing.
				 */
				if (!seen_comma) {
					low = low * 10 +
					      (unsigned int)(*c - '0');
					if (low > 255) {
						FAIL("lower bound too big");
					}
				} else {
					seen_high = true;
					high = high * 10 +
					       (unsigned int)(*c - '0');
					if (high > 255) {
						FAIL("upper bound too big");
					}
				}
				++c;
				break;
			case ',':
				if (seen_comma) {
					FAIL("multiple commas");
				}
				seen_comma = true;
				++c;
				break;
			case '}':
				if (seen_high && low > high) {
					FAIL("bad parse bound");
				}
				seen_comma = false;
				state = none;
				++c;
				break;
			default:
				FAIL("non digit/comma");
			}
			break;
		case parse_bracket:
			switch (*c) {
			case '^':
				/* Only a leading '^' negates the bracket. */
				if (seen_char || neg) {
					goto inside;
				}
				neg = true;
				++c;
				break;
			case '-':
				/* '-' as a range end or first member is literal. */
				if (range == 2) {
					goto inside;
				}
				if (!seen_char) {
					goto inside;
				}
				/* "a-b-c": a range end cannot start a range. */
				if (range == 1) {
					FAIL("bad range");
				}
				range = 2;
				++c;
				break;
			case '[':
				++c;
				switch (*c) {
				case '.': /* collating element */
					if (range != 0) {
						--range;
					}
					++c;
					state = parse_ce;
					seen_ce = false;
					break;
				case '=': /* equivalence class */
					if (range == 2) {
						FAIL("equivalence class in "
						     "range");
					}
					++c;
					state = parse_ec;
					seen_ec = false;
					break;
				case ':': /* character class */
					if (range == 2) {
						FAIL("character class in "
						     "range");
					}
					ccname = c;
					++c;
					state = parse_cc;
					break;
				}
				seen_char = true;
				break;
			case ']':
				if (!c[1] && !seen_char) {
					FAIL("unfinished brace");
				}
				/* A leading ']' is an ordinary member. */
				if (!seen_char) {
					goto inside;
				}
				++c;
				range = 0;
				have_atom = true;
				state = none;
				break;
			default:
			inside:
				seen_char = true;
				if (range == 2 && (*c & 0xff) < range_start) {
					FAIL("out of order range");
				}
				if (range != 0) {
					--range;
				}
				range_start = *c & 0xff;
				++c;
				break;
			}
			break;
		case parse_ce:
			switch (*c) {
			case '.':
				++c;
				switch (*c) {
				case ']':
					if (!seen_ce) {
						FAIL("empty ce");
					}
					++c;
					state = parse_bracket;
					break;
				default:
					/*
					 * The '.' was content, not the
					 * terminator.  A multi-character
					 * element gets 256 so that it sorts
					 * after every single byte.
					 */
					if (seen_ce) {
						range_start = 256;
					} else {
						range_start = '.';
					}
					seen_ce = true;
					break;
				}
				break;
			default:
				if (seen_ce) {
					range_start = 256;
				} else {
					/*
					 * Mask to an unsigned byte, as the
					 * plain bracket path does; a raw
					 * signed char would go negative for
					 * bytes >= 0x80 and defeat the
					 * "out of order range" check.
					 */
					range_start = *c & 0xff;
				}
				seen_ce = true;
				++c;
				break;
			}
			break;
		case parse_ec:
			switch (*c) {
			case '=':
				++c;
				switch (*c) {
				case ']':
					if (!seen_ec) {
						FAIL("no ec");
					}
					++c;
					state = parse_bracket;
					break;
				default:
					/* The '=' was content. */
					seen_ec = true;
					break;
				}
				break;
			default:
				seen_ec = true;
				++c;
				break;
			}
			break;
		case parse_cc:
			switch (*c) {
			case ':':
				++c;
				switch (*c) {
				case ']': {
					/*
					 * ccname..c spans ":name:"; it must
					 * match a known class exactly.
					 */
					size_t namelen = (size_t)(c - ccname);
					bool found = false;
					size_t i;

					for (i = 0;
					     i < sizeof(cc) / sizeof(cc[0]);
					     i++)
					{
						if (strlen(cc[i]) != namelen) {
							continue;
						}
						if (strncmp(cc[i], ccname,
							    namelen) != 0)
						{
							continue;
						}
						found = true;
						break;
					}
					if (!found) {
						FAIL("unknown cc");
					}
					++c;
					state = parse_bracket;
					break;
				}
				default:
					break;
				}
				break;
			default:
				++c;
				break;
			}
			break;
		}
	}
	if (group != 0) {
		FAIL("group open");
	}
	if (state != none) {
		FAIL("incomplete");
	}
	if (!have_atom) {
		FAIL("no atom");
	}
	return sub;

error:
#if VALREGEX_REPORT_REASON
	fprintf(stderr, "%s\n", reason);
#endif /* if VALREGEX_REPORT_REASON */
	return -1;
}
    438