1 1.14 andvar /* $NetBSD: rxp.c,v 1.14 2021/11/07 20:31:09 andvar Exp $ */ 2 1.5 cgd 3 1.1 cgd /*- 4 1.4 cgd * Copyright (c) 1991, 1993 5 1.4 cgd * The Regents of the University of California. All rights reserved. 6 1.1 cgd * 7 1.1 cgd * This code is derived from software contributed to Berkeley by 8 1.4 cgd * Jim R. Oldroyd at The Instruction Set and Keith Gabryelski at 9 1.4 cgd * Commodore Business Machines. 10 1.1 cgd * 11 1.1 cgd * Redistribution and use in source and binary forms, with or without 12 1.1 cgd * modification, are permitted provided that the following conditions 13 1.1 cgd * are met: 14 1.1 cgd * 1. Redistributions of source code must retain the above copyright 15 1.1 cgd * notice, this list of conditions and the following disclaimer. 16 1.1 cgd * 2. Redistributions in binary form must reproduce the above copyright 17 1.1 cgd * notice, this list of conditions and the following disclaimer in the 18 1.1 cgd * documentation and/or other materials provided with the distribution. 19 1.11 agc * 3. Neither the name of the University nor the names of its contributors 20 1.1 cgd * may be used to endorse or promote products derived from this software 21 1.1 cgd * without specific prior written permission. 22 1.1 cgd * 23 1.1 cgd * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 24 1.1 cgd * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 1.1 cgd * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 1.1 cgd * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 27 1.1 cgd * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 1.1 cgd * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 1.1 cgd * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 1.1 cgd * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 1.1 cgd * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 1.1 cgd * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 1.1 cgd * SUCH DAMAGE. 34 1.1 cgd */ 35 1.1 cgd 36 1.6 lukem #include <sys/cdefs.h> 37 1.1 cgd #ifndef lint 38 1.5 cgd #if 0 39 1.4 cgd static char sccsid[] = "@(#)rxp.c 8.1 (Berkeley) 5/31/93"; 40 1.5 cgd #else 41 1.14 andvar __RCSID("$NetBSD: rxp.c,v 1.14 2021/11/07 20:31:09 andvar Exp $"); 42 1.5 cgd #endif 43 1.1 cgd #endif /* not lint */ 44 1.1 cgd 45 1.1 cgd /* 46 1.1 cgd * regular expression parser 47 1.1 cgd * 48 1.1 cgd * external functions and return values are: 49 1.1 cgd * rxp_compile(s) 50 1.1 cgd * TRUE success 51 1.1 cgd * FALSE parse failure; error message will be in char rxperr[] 52 1.1 cgd * metas are: 53 1.14 andvar * {...} optional pattern, equivalent to [...|] 54 1.1 cgd * | alternate pattern 55 1.1 cgd * [...] pattern delimiters 56 1.1 cgd * 57 1.1 cgd * rxp_match(s) 58 1.1 cgd * TRUE string s matches compiled pattern 59 1.1 cgd * FALSE match failure or regexp error 60 1.1 cgd * 61 1.1 cgd * rxp_expand() 62 1.1 cgd * char * reverse-engineered regular expression string 63 1.1 cgd * NULL regexp error 64 1.1 cgd */ 65 1.1 cgd 66 1.1 cgd #include <stdio.h> 67 1.10 thorpej #include <stdlib.h> 68 1.1 cgd #include <ctype.h> 69 1.1 cgd #include "quiz.h" 70 1.1 cgd /* regexp tokens, arg */ 71 1.1 cgd #define LIT (-1) /* literal character, char */ 72 1.1 cgd #define SOT (-2) /* start text anchor, - */ 73 1.1 cgd #define EOT (-3) /* end text anchor, - */ 74 1.1 cgd #define GRP_S (-4) /* start alternate grp, ptr_to_end */ 75 1.1 cgd #define GRP_E (-5) /* end group, - */ 76 1.1 cgd #define ALT_S (-6) /* alternate starts, ptr_to_next */ 77 1.1 cgd #define ALT_E (-7) /* alternate ends, - */ 78 1.1 cgd #define END (-8) /* end of regexp, - */ 79 1.1 cgd 80 1.1 cgd typedef short Rxp_t; /* type for regexp tokens */ 81 1.1 cgd 82 1.1 cgd static Rxp_t rxpbuf[RXP_LINE_SZ]; /* compiled regular expression buffer */ 83 1.1 cgd char rxperr[128]; /* parser error message */ 84 1.1 cgd 85 1.12 jsm static int rxp__compile(const char *, int); 86 1.12 jsm static char *rxp__expand(int); 87 1.12 jsm static int rxp__match(const char *, int, Rxp_t *, Rxp_t *, const char *); 88 1.1 cgd 89 1.1 cgd int 90 1.13 dholland rxp_compile(const char *s) 91 1.1 cgd { 92 1.1 cgd return (rxp__compile(s, TRUE)); 93 1.1 cgd } 94 1.1 cgd 95 1.1 cgd static int 96 1.13 dholland rxp__compile(const char *s, int first) 97 1.1 cgd { 98 1.1 cgd static Rxp_t *rp; 99 1.7 jsm static const char *sp; 100 1.1 cgd Rxp_t *grp_ptr; 101 1.1 cgd Rxp_t *alt_ptr; 102 1.1 cgd int esc, err; 103 1.1 cgd 104 1.1 cgd esc = 0; 105 1.1 cgd if (first) { 106 1.1 cgd rp = rxpbuf; 107 1.1 cgd sp = s; 108 1.1 cgd *rp++ = SOT; /* auto-anchor: pat is really ^pat$ */ 109 1.1 cgd *rp++ = GRP_S; /* auto-group: ^pat$ is really ^[pat]$ */ 110 1.1 cgd *rp++ = 0; 111 1.1 cgd } 112 1.1 cgd *rp++ = ALT_S; 113 1.1 cgd alt_ptr = rp; 114 1.1 cgd *rp++ = 0; 115 1.1 cgd for (; *sp; ++sp) { 116 1.1 cgd if (rp - rxpbuf >= RXP_LINE_SZ - 4) { 117 1.1 cgd (void)snprintf(rxperr, sizeof(rxperr), 118 1.1 cgd "regular expression too long %s", s); 119 1.1 cgd return (FALSE); 120 1.1 cgd } 121 1.1 cgd if (*sp == ':' && !esc) 122 1.1 cgd break; 123 1.1 cgd if (esc) { 124 1.1 cgd *rp++ = LIT; 125 1.1 cgd *rp++ = *sp; 126 1.1 cgd esc = 0; 127 1.1 cgd } 128 1.1 cgd else switch (*sp) { 129 1.1 cgd case '\\': 130 1.1 cgd esc = 1; 131 1.1 cgd break; 132 1.1 cgd case '{': 133 1.1 cgd case '[': 134 1.1 cgd *rp++ = GRP_S; 135 1.1 cgd grp_ptr = rp; 136 1.1 cgd *rp++ = 0; 137 1.1 cgd sp++; 138 1.1 cgd if ((err = rxp__compile(s, FALSE)) != TRUE) 139 1.1 cgd return (err); 140 1.1 cgd *rp++ = GRP_E; 141 1.1 cgd *grp_ptr = rp - rxpbuf; 142 1.1 cgd break; 143 1.1 cgd case '}': 144 1.1 cgd case ']': 145 1.1 cgd case '|': 146 1.1 cgd *rp++ = ALT_E; 147 1.1 cgd *alt_ptr = rp - rxpbuf; 148 1.1 cgd if (*sp != ']') { 149 1.1 cgd *rp++ = ALT_S; 150 1.1 cgd alt_ptr = rp; 151 1.1 cgd *rp++ = 0; 152 1.1 cgd } 153 1.1 cgd if (*sp != '|') { 154 1.1 cgd if (*sp != ']') { 155 1.1 cgd *rp++ = ALT_E; 156 1.1 cgd *alt_ptr = rp - rxpbuf; 157 1.1 cgd } 158 1.1 cgd if (first) { 159 1.1 cgd (void)snprintf(rxperr, sizeof(rxperr), 160 1.1 cgd "unmatched alternator in regexp %s", 161 1.1 cgd s); 162 1.1 cgd return (FALSE); 163 1.1 cgd } 164 1.1 cgd return (TRUE); 165 1.1 cgd } 166 1.1 cgd break; 167 1.1 cgd default: 168 1.1 cgd *rp++ = LIT; 169 1.1 cgd *rp++ = *sp; 170 1.1 cgd esc = 0; 171 1.1 cgd break; 172 1.1 cgd } 173 1.1 cgd } 174 1.1 cgd if (!first) { 175 1.1 cgd (void)snprintf(rxperr, sizeof(rxperr), 176 1.1 cgd "unmatched alternator in regexp %s", s); 177 1.1 cgd return (FALSE); 178 1.1 cgd } 179 1.1 cgd *rp++ = ALT_E; 180 1.1 cgd *alt_ptr = rp - rxpbuf; 181 1.1 cgd *rp++ = GRP_E; 182 1.1 cgd *(rxpbuf + 2) = rp - rxpbuf; 183 1.1 cgd *rp++ = EOT; 184 1.1 cgd *rp = END; 185 1.1 cgd return (TRUE); 186 1.1 cgd } 187 1.1 cgd 188 1.1 cgd /* 189 1.1 cgd * match string against compiled regular expression 190 1.1 cgd */ 191 1.1 cgd int 192 1.13 dholland rxp_match(const char *s) 193 1.1 cgd { 194 1.1 cgd return (rxp__match(s, TRUE, NULL, NULL, NULL)); 195 1.1 cgd } 196 1.1 cgd 197 1.1 cgd static int 198 1.13 dholland rxp__match(const char *s, 199 1.13 dholland int first, 200 1.13 dholland Rxp_t *j_succ, /* jump here on successful alt match */ 201 1.13 dholland Rxp_t *j_fail, /* jump here on failed match */ 202 1.13 dholland const char *sp_fail) /* reset sp to here on failed match */ 203 1.1 cgd { 204 1.1 cgd static Rxp_t *rp; 205 1.7 jsm static const char *sp; 206 1.6 lukem int ch; 207 1.6 lukem Rxp_t *grp_end = NULL; 208 1.1 cgd 209 1.1 cgd if (first) { 210 1.1 cgd rp = rxpbuf; 211 1.1 cgd sp = s; 212 1.1 cgd } 213 1.1 cgd while (rp < rxpbuf + RXP_LINE_SZ && *rp != END) 214 1.1 cgd switch(*rp) { 215 1.1 cgd case LIT: 216 1.1 cgd rp++; 217 1.1 cgd ch = isascii(*rp) && isupper(*rp) ? tolower(*rp) : *rp; 218 1.1 cgd if (ch != *sp++) { 219 1.1 cgd rp = j_fail; 220 1.1 cgd sp = sp_fail; 221 1.8 dbj return (FALSE); 222 1.1 cgd } 223 1.1 cgd rp++; 224 1.1 cgd break; 225 1.1 cgd case SOT: 226 1.1 cgd if (sp != s) 227 1.1 cgd return (FALSE); 228 1.1 cgd rp++; 229 1.1 cgd break; 230 1.1 cgd case EOT: 231 1.1 cgd if (*sp != 0) 232 1.1 cgd return (FALSE); 233 1.1 cgd rp++; 234 1.1 cgd break; 235 1.1 cgd case GRP_S: 236 1.1 cgd rp++; 237 1.1 cgd grp_end = rxpbuf + *rp++; 238 1.1 cgd break; 239 1.1 cgd case ALT_S: 240 1.1 cgd rp++; 241 1.8 dbj rxp__match(sp, FALSE, grp_end, rxpbuf + *rp++, sp); 242 1.1 cgd break; 243 1.1 cgd case ALT_E: 244 1.1 cgd rp = j_succ; 245 1.1 cgd return (TRUE); 246 1.1 cgd case GRP_E: 247 1.8 dbj rp = j_fail; 248 1.8 dbj sp = sp_fail; 249 1.8 dbj return (FALSE); 250 1.1 cgd default: 251 1.8 dbj abort(); 252 1.1 cgd } 253 1.1 cgd return (*rp != END ? FALSE : TRUE); 254 1.1 cgd } 255 1.1 cgd 256 1.1 cgd /* 257 1.1 cgd * Reverse engineer the regular expression, by picking first of all alternates. 258 1.1 cgd */ 259 1.1 cgd char * 260 1.13 dholland rxp_expand(void) 261 1.1 cgd { 262 1.1 cgd return (rxp__expand(TRUE)); 263 1.1 cgd } 264 1.1 cgd 265 1.1 cgd static char * 266 1.13 dholland rxp__expand(int first) 267 1.1 cgd { 268 1.1 cgd static char buf[RXP_LINE_SZ/2]; 269 1.1 cgd static Rxp_t *rp; 270 1.1 cgd static char *bp; 271 1.1 cgd Rxp_t *grp_ptr; 272 1.1 cgd char *err; 273 1.1 cgd 274 1.1 cgd if (first) { 275 1.1 cgd rp = rxpbuf; 276 1.1 cgd bp = buf; 277 1.1 cgd } 278 1.1 cgd while (rp < rxpbuf + RXP_LINE_SZ && *rp != END) 279 1.1 cgd switch(*rp) { 280 1.1 cgd case LIT: 281 1.1 cgd rp++; 282 1.1 cgd *bp++ = *rp++; 283 1.1 cgd break; 284 1.1 cgd case GRP_S: 285 1.1 cgd rp++; 286 1.1 cgd grp_ptr = rxpbuf + *rp; 287 1.1 cgd rp++; 288 1.1 cgd if ((err = rxp__expand(FALSE)) == NULL) 289 1.1 cgd return (err); 290 1.1 cgd rp = grp_ptr; 291 1.1 cgd break; 292 1.1 cgd case ALT_E: 293 1.1 cgd return (buf); 294 1.1 cgd case ALT_S: 295 1.1 cgd rp++; 296 1.1 cgd /* FALLTHROUGH */ 297 1.1 cgd case SOT: 298 1.1 cgd case EOT: 299 1.1 cgd case GRP_E: 300 1.1 cgd rp++; 301 1.1 cgd break; 302 1.1 cgd default: 303 1.1 cgd return (NULL); 304 1.1 cgd } 305 1.1 cgd if (first) { 306 1.1 cgd if (*rp != END) 307 1.1 cgd return (NULL); 308 1.1 cgd *bp = '\0'; 309 1.1 cgd } 310 1.1 cgd return (buf); 311 1.1 cgd } 312