Home | History | Annotate | Line # | Download | only in ure
ure.c revision 1.3
      1  1.3  christos /*	$NetBSD: ure.c,v 1.3 2021/08/14 16:14:57 christos Exp $	*/
      2  1.2  christos 
      3  1.2  christos /* $OpenLDAP$ */
      4  1.1     lukem /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
      5  1.1     lukem  *
      6  1.3  christos  * Copyright 1998-2021 The OpenLDAP Foundation.
      7  1.1     lukem  * All rights reserved.
      8  1.1     lukem  *
      9  1.1     lukem  * Redistribution and use in source and binary forms, with or without
     10  1.1     lukem  * modification, are permitted only as authorized by the OpenLDAP
     11  1.1     lukem  * Public License.
     12  1.1     lukem  *
     13  1.1     lukem  * A copy of this license is available in file LICENSE in the
     14  1.1     lukem  * top-level directory of the distribution or, alternatively, at
     15  1.1     lukem  * <http://www.OpenLDAP.org/license.html>.
     16  1.1     lukem  */
     17  1.1     lukem /* Copyright 1997, 1998, 1999 Computing Research Labs,
     18  1.1     lukem  * New Mexico State University
     19  1.1     lukem  *
     20  1.1     lukem  * Permission is hereby granted, free of charge, to any person obtaining a
     21  1.1     lukem  * copy of this software and associated documentation files (the "Software"),
     22  1.1     lukem  * to deal in the Software without restriction, including without limitation
     23  1.1     lukem  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
     24  1.1     lukem  * and/or sell copies of the Software, and to permit persons to whom the
     25  1.1     lukem  * Software is furnished to do so, subject to the following conditions:
     26  1.1     lukem  *
     27  1.1     lukem  * The above copyright notice and this permission notice shall be included in
     28  1.1     lukem  * all copies or substantial portions of the Software.
     29  1.1     lukem  *
     30  1.1     lukem  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     31  1.1     lukem  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     32  1.1     lukem  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     33  1.1     lukem  * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
     34  1.1     lukem  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
     35  1.1     lukem  * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
     36  1.1     lukem  * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
     37  1.1     lukem  */
     38  1.2  christos /* Id: ure.c,v 1.2 1999/09/21 15:47:43 mleisher Exp " */
     39  1.2  christos 
     40  1.2  christos #include <sys/cdefs.h>
     41  1.3  christos __RCSID("$NetBSD: ure.c,v 1.3 2021/08/14 16:14:57 christos Exp $");
     42  1.1     lukem 
     43  1.1     lukem #include "portable.h"
     44  1.1     lukem 
     45  1.1     lukem #include <ac/stdlib.h>
     46  1.1     lukem #include <ac/string.h>
     47  1.1     lukem #include <ac/unistd.h>
     48  1.1     lukem 
     49  1.1     lukem #include "ure.h"
     50  1.1     lukem 
     51  1.1     lukem /*
     52  1.1     lukem  * Flags used internally in the DFA.
     53  1.1     lukem  */
     54  1.1     lukem #define _URE_DFA_CASEFOLD  0x01
     55  1.1     lukem #define _URE_DFA_BLANKLINE 0x02
     56  1.1     lukem 
     57  1.1     lukem static unsigned long cclass_flags[] = {
     58  1.1     lukem     0,
     59  1.1     lukem     _URE_NONSPACING,
     60  1.1     lukem     _URE_COMBINING,
     61  1.1     lukem     _URE_NUMDIGIT,
     62  1.1     lukem     _URE_NUMOTHER,
     63  1.1     lukem     _URE_SPACESEP,
     64  1.1     lukem     _URE_LINESEP,
     65  1.1     lukem     _URE_PARASEP,
     66  1.1     lukem     _URE_CNTRL,
     67  1.1     lukem     _URE_PUA,
     68  1.1     lukem     _URE_UPPER,
     69  1.1     lukem     _URE_LOWER,
     70  1.1     lukem     _URE_TITLE,
     71  1.1     lukem     _URE_MODIFIER,
     72  1.1     lukem     _URE_OTHERLETTER,
     73  1.1     lukem     _URE_DASHPUNCT,
     74  1.1     lukem     _URE_OPENPUNCT,
     75  1.1     lukem     _URE_CLOSEPUNCT,
     76  1.1     lukem     _URE_OTHERPUNCT,
     77  1.1     lukem     _URE_MATHSYM,
     78  1.1     lukem     _URE_CURRENCYSYM,
     79  1.1     lukem     _URE_OTHERSYM,
     80  1.1     lukem     _URE_LTR,
     81  1.1     lukem     _URE_RTL,
     82  1.1     lukem     _URE_EURONUM,
     83  1.1     lukem     _URE_EURONUMSEP,
     84  1.1     lukem     _URE_EURONUMTERM,
     85  1.1     lukem     _URE_ARABNUM,
     86  1.1     lukem     _URE_COMMONSEP,
     87  1.1     lukem     _URE_BLOCKSEP,
     88  1.1     lukem     _URE_SEGMENTSEP,
     89  1.1     lukem     _URE_WHITESPACE,
     90  1.1     lukem     _URE_OTHERNEUT,
     91  1.1     lukem };
     92  1.1     lukem 
     93  1.1     lukem /*
     94  1.1     lukem  * Symbol types for the DFA.
     95  1.1     lukem  */
     96  1.1     lukem #define _URE_ANY_CHAR   1
     97  1.1     lukem #define _URE_CHAR       2
     98  1.1     lukem #define _URE_CCLASS     3
     99  1.1     lukem #define _URE_NCCLASS    4
    100  1.1     lukem #define _URE_BOL_ANCHOR 5
    101  1.1     lukem #define _URE_EOL_ANCHOR 6
    102  1.1     lukem 
    103  1.1     lukem /*
    104  1.1     lukem  * Op codes for converting the NFA to a DFA.
    105  1.1     lukem  */
    106  1.1     lukem #define _URE_SYMBOL     10
    107  1.1     lukem #define _URE_PAREN      11
    108  1.1     lukem #define _URE_QUEST      12
    109  1.1     lukem #define _URE_STAR       13
    110  1.1     lukem #define _URE_PLUS       14
    111  1.1     lukem #define _URE_ONE        15
    112  1.1     lukem #define _URE_AND        16
    113  1.1     lukem #define _URE_OR         17
    114  1.1     lukem 
    115  1.1     lukem #define _URE_NOOP       0xffff
    116  1.1     lukem 
    117  1.1     lukem #define _URE_REGSTART 0x8000
    118  1.1     lukem #define _URE_REGEND   0x4000
    119  1.1     lukem 
    120  1.1     lukem /*
    121  1.1     lukem  * Structure used to handle a compacted range of characters.
    122  1.1     lukem  */
    123  1.1     lukem typedef struct {
    124  1.1     lukem     ucs4_t min_code;
    125  1.1     lukem     ucs4_t max_code;
    126  1.1     lukem } _ure_range_t;
    127  1.1     lukem 
    128  1.1     lukem typedef struct {
    129  1.1     lukem     _ure_range_t *ranges;
    130  1.1     lukem     ucs2_t ranges_used;
    131  1.1     lukem     ucs2_t ranges_size;
    132  1.1     lukem } _ure_ccl_t;
    133  1.1     lukem 
    134  1.1     lukem typedef union {
    135  1.1     lukem     ucs4_t chr;
    136  1.1     lukem     _ure_ccl_t ccl;
    137  1.1     lukem } _ure_sym_t;
    138  1.1     lukem 
    139  1.1     lukem /*
    140  1.1     lukem  * This is a general element structure used for expressions and stack
    141  1.1     lukem  * elements.
    142  1.1     lukem  */
    143  1.1     lukem typedef struct {
    144  1.1     lukem     ucs2_t reg;
    145  1.1     lukem     ucs2_t onstack;
    146  1.1     lukem     ucs2_t type;
    147  1.1     lukem     ucs2_t lhs;
    148  1.1     lukem     ucs2_t rhs;
    149  1.1     lukem } _ure_elt_t;
    150  1.1     lukem 
    151  1.1     lukem /*
    152  1.1     lukem  * This is a structure used to track a list or a stack of states.
    153  1.1     lukem  */
    154  1.1     lukem typedef struct {
    155  1.1     lukem     ucs2_t *slist;
    156  1.1     lukem     ucs2_t slist_size;
    157  1.1     lukem     ucs2_t slist_used;
    158  1.1     lukem } _ure_stlist_t;
    159  1.1     lukem 
    160  1.1     lukem /*
    161  1.1     lukem  * Structure to track the list of unique states for a symbol
    162  1.1     lukem  * during reduction.
    163  1.1     lukem  */
    164  1.1     lukem typedef struct {
    165  1.1     lukem     ucs2_t id;
    166  1.1     lukem     ucs2_t type;
    167  1.1     lukem     unsigned long mods;
    168  1.1     lukem     unsigned long props;
    169  1.1     lukem     _ure_sym_t sym;
    170  1.1     lukem     _ure_stlist_t states;
    171  1.1     lukem } _ure_symtab_t;
    172  1.1     lukem 
    173  1.1     lukem /*
    174  1.1     lukem  * Structure to hold a single state.
    175  1.1     lukem  */
    176  1.1     lukem typedef struct {
    177  1.1     lukem     ucs2_t id;
    178  1.1     lukem     ucs2_t accepting;
    179  1.1     lukem     ucs2_t pad;
    180  1.1     lukem     _ure_stlist_t st;
    181  1.1     lukem     _ure_elt_t *trans;
    182  1.1     lukem     ucs2_t trans_size;
    183  1.1     lukem     ucs2_t trans_used;
    184  1.1     lukem } _ure_state_t;
    185  1.1     lukem 
    186  1.1     lukem /*
    187  1.1     lukem  * Structure used for keeping lists of states.
    188  1.1     lukem  */
    189  1.1     lukem typedef struct {
    190  1.1     lukem     _ure_state_t *states;
    191  1.1     lukem     ucs2_t states_size;
    192  1.1     lukem     ucs2_t states_used;
    193  1.1     lukem } _ure_statetable_t;
    194  1.1     lukem 
    195  1.1     lukem /*
    196  1.1     lukem  * Structure to track pairs of DFA states when equivalent states are
    197  1.1     lukem  * merged.
    198  1.1     lukem  */
    199  1.1     lukem typedef struct {
    200  1.1     lukem     ucs2_t l;
    201  1.1     lukem     ucs2_t r;
    202  1.1     lukem } _ure_equiv_t;
    203  1.1     lukem 
    204  1.1     lukem /*
    205  1.1     lukem  * Structure used for constructing the NFA and reducing to a minimal DFA.
    206  1.1     lukem  */
    207  1.1     lukem typedef struct _ure_buffer_t {
    208  1.1     lukem     int reducing;
    209  1.1     lukem     int error;
    210  1.1     lukem     unsigned long flags;
    211  1.1     lukem 
    212  1.1     lukem     _ure_stlist_t stack;
    213  1.1     lukem 
    214  1.1     lukem     /*
    215  1.1     lukem      * Table of unique symbols encountered.
    216  1.1     lukem      */
    217  1.1     lukem     _ure_symtab_t *symtab;
    218  1.1     lukem     ucs2_t symtab_size;
    219  1.1     lukem     ucs2_t symtab_used;
    220  1.1     lukem 
    221  1.1     lukem     /*
    222  1.1     lukem      * Tracks the unique expressions generated for the NFA and when the NFA is
    223  1.1     lukem      * reduced.
    224  1.1     lukem      */
    225  1.1     lukem     _ure_elt_t *expr;
    226  1.1     lukem     ucs2_t expr_used;
    227  1.1     lukem     ucs2_t expr_size;
    228  1.1     lukem 
    229  1.1     lukem     /*
    230  1.1     lukem      * The reduced table of unique groups of NFA states.
    231  1.1     lukem      */
    232  1.1     lukem     _ure_statetable_t states;
    233  1.1     lukem 
    234  1.1     lukem     /*
    235  1.1     lukem      * Tracks states when equivalent states are merged.
    236  1.1     lukem      */
    237  1.1     lukem     _ure_equiv_t *equiv;
    238  1.1     lukem     ucs2_t equiv_used;
    239  1.1     lukem     ucs2_t equiv_size;
    240  1.1     lukem } _ure_buffer_t;
    241  1.1     lukem 
    242  1.1     lukem typedef struct {
    243  1.1     lukem     ucs2_t symbol;
    244  1.1     lukem     ucs2_t next_state;
    245  1.1     lukem } _ure_trans_t;
    246  1.1     lukem 
    247  1.1     lukem typedef struct {
    248  1.1     lukem     ucs2_t accepting;
    249  1.1     lukem     ucs2_t ntrans;
    250  1.1     lukem     _ure_trans_t *trans;
    251  1.1     lukem } _ure_dstate_t;
    252  1.1     lukem 
    253  1.1     lukem typedef struct _ure_dfa_t {
    254  1.1     lukem     unsigned long flags;
    255  1.1     lukem 
    256  1.1     lukem     _ure_symtab_t *syms;
    257  1.1     lukem     ucs2_t nsyms;
    258  1.1     lukem 
    259  1.1     lukem     _ure_dstate_t *states;
    260  1.1     lukem     ucs2_t nstates;
    261  1.1     lukem 
    262  1.1     lukem     _ure_trans_t *trans;
    263  1.1     lukem     ucs2_t ntrans;
    264  1.1     lukem } _ure_dfa_t;
    265  1.1     lukem 
    266  1.1     lukem /*************************************************************************
    267  1.1     lukem  *
    268  1.1     lukem  * Functions.
    269  1.1     lukem  *
    270  1.1     lukem  *************************************************************************/
    271  1.1     lukem 
    272  1.1     lukem static void
    273  1.1     lukem _ure_memmove(char *dest, char *src, unsigned long bytes)
    274  1.1     lukem {
    275  1.1     lukem     long i, j;
    276  1.1     lukem 
    277  1.1     lukem     i = (long) bytes;
    278  1.1     lukem     j = i & 7;
    279  1.1     lukem     i = (i + 7) >> 3;
    280  1.1     lukem 
    281  1.1     lukem     /*
    282  1.1     lukem      * Do a memmove using Ye Olde Duff's Device for efficiency.
    283  1.1     lukem      */
    284  1.1     lukem     if (src < dest) {
    285  1.1     lukem         src += bytes;
    286  1.1     lukem         dest += bytes;
    287  1.1     lukem 
    288  1.1     lukem         switch (j) {
    289  1.1     lukem           case 0: do {
    290  1.1     lukem               *--dest = *--src;
    291  1.1     lukem             case 7: *--dest = *--src;
    292  1.1     lukem             case 6: *--dest = *--src;
    293  1.1     lukem             case 5: *--dest = *--src;
    294  1.1     lukem             case 4: *--dest = *--src;
    295  1.1     lukem             case 3: *--dest = *--src;
    296  1.1     lukem             case 2: *--dest = *--src;
    297  1.1     lukem             case 1: *--dest = *--src;
    298  1.1     lukem           } while (--i > 0);
    299  1.1     lukem         }
    300  1.1     lukem     } else if (src > dest) {
    301  1.1     lukem         switch (j) {
    302  1.1     lukem           case 0: do {
    303  1.1     lukem               *dest++ = *src++;
    304  1.1     lukem             case 7: *dest++ = *src++;
    305  1.1     lukem             case 6: *dest++ = *src++;
    306  1.1     lukem             case 5: *dest++ = *src++;
    307  1.1     lukem             case 4: *dest++ = *src++;
    308  1.1     lukem             case 3: *dest++ = *src++;
    309  1.1     lukem             case 2: *dest++ = *src++;
    310  1.1     lukem             case 1: *dest++ = *src++;
    311  1.1     lukem           } while (--i > 0);
    312  1.1     lukem         }
    313  1.1     lukem     }
    314  1.1     lukem }
    315  1.1     lukem 
    316  1.1     lukem static void
    317  1.1     lukem _ure_push(ucs2_t v, _ure_buffer_t *b)
    318  1.1     lukem {
    319  1.1     lukem     _ure_stlist_t *s;
    320  1.1     lukem 
    321  1.1     lukem     if (b == 0)
    322  1.1     lukem       return;
    323  1.1     lukem 
    324  1.1     lukem     /*
    325  1.1     lukem      * If the `reducing' parameter is non-zero, check to see if the value
    326  1.1     lukem      * passed is already on the stack.
    327  1.1     lukem      */
    328  1.1     lukem     if (b->reducing != 0 && b->expr[v].onstack != 0)
    329  1.1     lukem       return;
    330  1.1     lukem 
    331  1.1     lukem     s = &b->stack;
    332  1.1     lukem     if (s->slist_used == s->slist_size) {
    333  1.1     lukem         if (s->slist_size == 0)
    334  1.1     lukem           s->slist = (ucs2_t *) malloc(sizeof(ucs2_t) << 3);
    335  1.1     lukem         else
    336  1.1     lukem           s->slist = (ucs2_t *) realloc((char *) s->slist,
    337  1.1     lukem                                         sizeof(ucs2_t) * (s->slist_size + 8));
    338  1.1     lukem         s->slist_size += 8;
    339  1.1     lukem     }
    340  1.1     lukem     s->slist[s->slist_used++] = v;
    341  1.1     lukem 
    342  1.1     lukem     /*
    343  1.1     lukem      * If the `reducing' parameter is non-zero, flag the element as being on
    344  1.1     lukem      * the stack.
    345  1.1     lukem      */
    346  1.1     lukem     if (b->reducing != 0)
    347  1.1     lukem       b->expr[v].onstack = 1;
    348  1.1     lukem }
    349  1.1     lukem 
    350  1.1     lukem static ucs2_t
    351  1.1     lukem _ure_peek(_ure_buffer_t *b)
    352  1.1     lukem {
    353  1.1     lukem     if (b == 0 || b->stack.slist_used == 0)
    354  1.1     lukem       return _URE_NOOP;
    355  1.1     lukem 
    356  1.1     lukem     return b->stack.slist[b->stack.slist_used - 1];
    357  1.1     lukem }
    358  1.1     lukem 
    359  1.1     lukem static ucs2_t
    360  1.1     lukem _ure_pop(_ure_buffer_t *b)
    361  1.1     lukem {
    362  1.1     lukem     ucs2_t v;
    363  1.1     lukem 
    364  1.1     lukem     if (b == 0 || b->stack.slist_used == 0)
    365  1.1     lukem       return _URE_NOOP;
    366  1.1     lukem 
    367  1.1     lukem     v = b->stack.slist[--b->stack.slist_used];
    368  1.1     lukem     if (b->reducing)
    369  1.1     lukem       b->expr[v].onstack = 0;
    370  1.1     lukem 
    371  1.1     lukem     return v;
    372  1.1     lukem }
    373  1.1     lukem 
    374  1.1     lukem /*************************************************************************
    375  1.1     lukem  *
    376  1.1     lukem  * Start symbol parse functions.
    377  1.1     lukem  *
    378  1.1     lukem  *************************************************************************/
    379  1.1     lukem 
    380  1.1     lukem /*
    381  1.1     lukem  * Parse a comma-separated list of integers that represent character
    382  1.1     lukem  * properties.  Combine them into a mask that is returned in the `mask'
    383  1.1     lukem  * variable, and return the number of characters consumed.
    384  1.1     lukem  */
    385  1.1     lukem static unsigned long
    386  1.1     lukem _ure_prop_list(ucs2_t *pp, unsigned long limit, unsigned long *mask,
    387  1.1     lukem                _ure_buffer_t *b)
    388  1.1     lukem {
    389  1.1     lukem     unsigned long n, m;
    390  1.1     lukem     ucs2_t *sp, *ep;
    391  1.1     lukem 
    392  1.1     lukem     sp = pp;
    393  1.1     lukem     ep = sp + limit;
    394  1.1     lukem 
    395  1.1     lukem     for (m = n = 0; b->error == _URE_OK && sp < ep; sp++) {
    396  1.1     lukem         if (*sp == ',') {
    397  1.1     lukem             /*
    398  1.1     lukem              * Encountered a comma, so select the next character property flag
    399  1.1     lukem              * and reset the number.
    400  1.1     lukem              */
    401  1.1     lukem             m |= cclass_flags[n];
    402  1.1     lukem             n = 0;
    403  1.1     lukem         } else if (*sp >= '0' && *sp <= '9')
    404  1.1     lukem           /*
    405  1.1     lukem            * Encountered a digit, so start or continue building the cardinal
    406  1.1     lukem            * that represents the character property flag.
    407  1.1     lukem            */
    408  1.1     lukem           n = (n * 10) + (*sp - '0');
    409  1.1     lukem         else
    410  1.1     lukem           /*
    411  1.1     lukem            * Encountered something that is not part of the property list.
    412  1.1     lukem            * Indicate that we are done.
    413  1.1     lukem            */
    414  1.1     lukem           break;
    415  1.1     lukem 
    416  1.1     lukem         /*
    417  1.1     lukem          * If a property number greater than 32 occurs, then there is a
    418  1.1     lukem          * problem.  Most likely a missing comma separator.
    419  1.1     lukem          */
    420  1.1     lukem         if (n > 32)
    421  1.1     lukem           b->error = _URE_INVALID_PROPERTY;
    422  1.1     lukem     }
    423  1.1     lukem 
    424  1.2  christos     if (b->error == _URE_OK && n != 0)
    425  1.1     lukem       m |= cclass_flags[n];
    426  1.1     lukem 
    427  1.1     lukem     /*
    428  1.1     lukem      * Set the mask that represents the group of character properties.
    429  1.1     lukem      */
    430  1.1     lukem     *mask = m;
    431  1.1     lukem 
    432  1.1     lukem     /*
    433  1.1     lukem      * Return the number of characters consumed.
    434  1.1     lukem      */
    435  1.1     lukem     return sp - pp;
    436  1.1     lukem }
    437  1.1     lukem 
    438  1.1     lukem /*
    439  1.1     lukem  * Collect a hex number with 1 to 4 digits and return the number
    440  1.1     lukem  * of characters used.
    441  1.1     lukem  */
    442  1.1     lukem static unsigned long
    443  1.1     lukem _ure_hex(ucs2_t *np, unsigned long limit, ucs4_t *n)
    444  1.1     lukem {
    445  1.1     lukem     ucs2_t i;
    446  1.1     lukem     ucs2_t *sp, *ep;
    447  1.1     lukem     ucs4_t nn;
    448  1.1     lukem 
    449  1.1     lukem     sp = np;
    450  1.1     lukem     ep = sp + limit;
    451  1.1     lukem 
    452  1.1     lukem     for (nn = 0, i = 0; i < 4 && sp < ep; i++, sp++) {
    453  1.1     lukem         if (*sp >= '0' && *sp <= '9')
    454  1.1     lukem           nn = (nn << 4) + (*sp - '0');
    455  1.1     lukem         else if (*sp >= 'A' && *sp <= 'F')
    456  1.1     lukem           nn = (nn << 4) + ((*sp - 'A') + 10);
    457  1.1     lukem         else if (*sp >= 'a' && *sp <= 'f')
    458  1.1     lukem           nn = (nn << 4) + ((*sp - 'a') + 10);
    459  1.1     lukem         else
    460  1.1     lukem           /*
    461  1.1     lukem            * Encountered something that is not a hex digit.
    462  1.1     lukem            */
    463  1.1     lukem           break;
    464  1.1     lukem     }
    465  1.1     lukem 
    466  1.1     lukem     /*
    467  1.1     lukem      * Assign the character code collected and return the number of
    468  1.1     lukem      * characters used.
    469  1.1     lukem      */
    470  1.1     lukem     *n = nn;
    471  1.1     lukem 
    472  1.1     lukem     return sp - np;
    473  1.1     lukem }
    474  1.1     lukem 
    475  1.1     lukem /*
    476  1.1     lukem  * Insert a range into a character class, removing duplicates and ordering
    477  1.1     lukem  * them in increasing range-start order.
    478  1.1     lukem  */
    479  1.1     lukem static void
    480  1.1     lukem _ure_add_range(_ure_ccl_t *ccl, _ure_range_t *r, _ure_buffer_t *b)
    481  1.1     lukem {
    482  1.1     lukem     ucs2_t i;
    483  1.1     lukem     ucs4_t tmp;
    484  1.1     lukem     _ure_range_t *rp;
    485  1.1     lukem 
    486  1.1     lukem     /*
    487  1.1     lukem      * If the `casefold' flag is set, then make sure both endpoints of the
    488  1.1     lukem      * range are converted to lower case.
    489  1.1     lukem      */
    490  1.1     lukem     if (b->flags & _URE_DFA_CASEFOLD) {
    491  1.1     lukem         r->min_code = _ure_tolower(r->min_code);
    492  1.1     lukem         r->max_code = _ure_tolower(r->max_code);
    493  1.1     lukem     }
    494  1.1     lukem 
    495  1.1     lukem     /*
    496  1.1     lukem      * Swap the range endpoints if they are not in increasing order.
    497  1.1     lukem      */
    498  1.1     lukem     if (r->min_code > r->max_code) {
    499  1.1     lukem         tmp = r->min_code;
    500  1.1     lukem         r->min_code = r->max_code;
    501  1.1     lukem         r->max_code = tmp;
    502  1.1     lukem     }
    503  1.1     lukem 
    504  1.1     lukem     for (i = 0, rp = ccl->ranges;
    505  1.1     lukem          i < ccl->ranges_used && r->min_code < rp->min_code; i++, rp++) ;
    506  1.1     lukem 
    507  1.1     lukem     /*
    508  1.1     lukem      * Check for a duplicate.
    509  1.1     lukem      */
    510  1.1     lukem     if (i < ccl->ranges_used &&
    511  1.1     lukem         r->min_code == rp->min_code && r->max_code == rp->max_code)
    512  1.1     lukem       return;
    513  1.1     lukem 
    514  1.1     lukem     if (ccl->ranges_used == ccl->ranges_size) {
    515  1.1     lukem         if (ccl->ranges_size == 0)
    516  1.1     lukem           ccl->ranges = (_ure_range_t *) malloc(sizeof(_ure_range_t) << 3);
    517  1.1     lukem         else
    518  1.1     lukem           ccl->ranges = (_ure_range_t *)
    519  1.1     lukem               realloc((char *) ccl->ranges,
    520  1.1     lukem                       sizeof(_ure_range_t) * (ccl->ranges_size + 8));
    521  1.1     lukem         ccl->ranges_size += 8;
    522  1.1     lukem     }
    523  1.1     lukem 
    524  1.1     lukem     rp = ccl->ranges + ccl->ranges_used;
    525  1.1     lukem 
    526  1.1     lukem     if (i < ccl->ranges_used)
    527  1.1     lukem       _ure_memmove((char *) (rp + 1), (char *) rp,
    528  1.1     lukem                    sizeof(_ure_range_t) * (ccl->ranges_used - i));
    529  1.1     lukem 
    530  1.1     lukem     ccl->ranges_used++;
    531  1.1     lukem     rp->min_code = r->min_code;
    532  1.1     lukem     rp->max_code = r->max_code;
    533  1.1     lukem }
    534  1.1     lukem 
    535  1.1     lukem #define _URE_ALPHA_MASK  (_URE_UPPER|_URE_LOWER|_URE_OTHERLETTER|\
    536  1.1     lukem _URE_MODIFIER|_URE_TITLE|_URE_NONSPACING|_URE_COMBINING)
    537  1.1     lukem #define _URE_ALNUM_MASK  (_URE_ALPHA_MASK|_URE_NUMDIGIT)
    538  1.1     lukem #define _URE_PUNCT_MASK  (_URE_DASHPUNCT|_URE_OPENPUNCT|_URE_CLOSEPUNCT|\
    539  1.1     lukem _URE_OTHERPUNCT)
    540  1.1     lukem #define _URE_GRAPH_MASK (_URE_NUMDIGIT|_URE_NUMOTHER|_URE_ALPHA_MASK|\
    541  1.1     lukem _URE_MATHSYM|_URE_CURRENCYSYM|_URE_OTHERSYM)
    542  1.1     lukem #define _URE_PRINT_MASK (_URE_GRAPH_MASK|_URE_SPACESEP)
    543  1.1     lukem #define _URE_SPACE_MASK  (_URE_SPACESEP|_URE_LINESEP|_URE_PARASEP)
    544  1.1     lukem 
    545  1.1     lukem typedef void (*_ure_cclsetup_t)(
    546  1.1     lukem     _ure_symtab_t *sym,
    547  1.1     lukem     unsigned long mask,
    548  1.1     lukem     _ure_buffer_t *b
    549  1.1     lukem );
    550  1.1     lukem 
    551  1.1     lukem typedef struct {
    552  1.1     lukem     ucs2_t key;
    553  1.1     lukem     unsigned long len;
    554  1.1     lukem     unsigned long next;
    555  1.1     lukem     _ure_cclsetup_t func;
    556  1.1     lukem     unsigned long mask;
    557  1.1     lukem } _ure_trie_t;
    558  1.1     lukem 
    559  1.1     lukem static void
    560  1.1     lukem _ure_ccl_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b)
    561  1.1     lukem {
    562  1.1     lukem     sym->props |= mask;
    563  1.1     lukem }
    564  1.1     lukem 
    565  1.1     lukem static void
    566  1.1     lukem _ure_space_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b)
    567  1.1     lukem {
    568  1.1     lukem     _ure_range_t range;
    569  1.1     lukem 
    570  1.1     lukem     sym->props |= mask;
    571  1.1     lukem 
    572  1.1     lukem     /*
    573  1.1     lukem      * Add the additional characters needed for handling isspace().
    574  1.1     lukem      */
    575  1.1     lukem     range.min_code = range.max_code = '\t';
    576  1.1     lukem     _ure_add_range(&sym->sym.ccl, &range, b);
    577  1.1     lukem     range.min_code = range.max_code = '\r';
    578  1.1     lukem     _ure_add_range(&sym->sym.ccl, &range, b);
    579  1.1     lukem     range.min_code = range.max_code = '\n';
    580  1.1     lukem     _ure_add_range(&sym->sym.ccl, &range, b);
    581  1.1     lukem     range.min_code = range.max_code = '\f';
    582  1.1     lukem     _ure_add_range(&sym->sym.ccl, &range, b);
    583  1.1     lukem     range.min_code = range.max_code = 0xfeff;
    584  1.1     lukem     _ure_add_range(&sym->sym.ccl, &range, b);
    585  1.1     lukem }
    586  1.1     lukem 
    587  1.1     lukem static void
    588  1.1     lukem _ure_xdigit_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b)
    589  1.1     lukem {
    590  1.1     lukem     _ure_range_t range;
    591  1.1     lukem 
    592  1.1     lukem     /*
    593  1.1     lukem      * Add the additional characters needed for handling isxdigit().
    594  1.1     lukem      */
    595  1.1     lukem     range.min_code = '0';
    596  1.1     lukem     range.max_code = '9';
    597  1.1     lukem     _ure_add_range(&sym->sym.ccl, &range, b);
    598  1.1     lukem     range.min_code = 'A';
    599  1.1     lukem     range.max_code = 'F';
    600  1.1     lukem     _ure_add_range(&sym->sym.ccl, &range, b);
    601  1.1     lukem     range.min_code = 'a';
    602  1.1     lukem     range.max_code = 'f';
    603  1.1     lukem     _ure_add_range(&sym->sym.ccl, &range, b);
    604  1.1     lukem }
    605  1.1     lukem 
    606  1.1     lukem static _ure_trie_t cclass_trie[] = {
    607  1.1     lukem     {0x003a, 1, 1, 0, 0},
    608  1.1     lukem     {0x0061, 9, 10, 0, 0},
    609  1.1     lukem     {0x0063, 8, 19, 0, 0},
    610  1.1     lukem     {0x0064, 7, 24, 0, 0},
    611  1.1     lukem     {0x0067, 6, 29, 0, 0},
    612  1.1     lukem     {0x006c, 5, 34, 0, 0},
    613  1.1     lukem     {0x0070, 4, 39, 0, 0},
    614  1.1     lukem     {0x0073, 3, 49, 0, 0},
    615  1.1     lukem     {0x0075, 2, 54, 0, 0},
    616  1.1     lukem     {0x0078, 1, 59, 0, 0},
    617  1.1     lukem     {0x006c, 1, 11, 0, 0},
    618  1.1     lukem     {0x006e, 2, 13, 0, 0},
    619  1.1     lukem     {0x0070, 1, 16, 0, 0},
    620  1.1     lukem     {0x0075, 1, 14, 0, 0},
    621  1.1     lukem     {0x006d, 1, 15, 0, 0},
    622  1.1     lukem     {0x003a, 1, 16, _ure_ccl_setup, _URE_ALNUM_MASK},
    623  1.1     lukem     {0x0068, 1, 17, 0, 0},
    624  1.1     lukem     {0x0061, 1, 18, 0, 0},
    625  1.1     lukem     {0x003a, 1, 19, _ure_ccl_setup, _URE_ALPHA_MASK},
    626  1.1     lukem     {0x006e, 1, 20, 0, 0},
    627  1.1     lukem     {0x0074, 1, 21, 0, 0},
    628  1.1     lukem     {0x0072, 1, 22, 0, 0},
    629  1.1     lukem     {0x006c, 1, 23, 0, 0},
    630  1.1     lukem     {0x003a, 1, 24, _ure_ccl_setup, _URE_CNTRL},
    631  1.1     lukem     {0x0069, 1, 25, 0, 0},
    632  1.1     lukem     {0x0067, 1, 26, 0, 0},
    633  1.1     lukem     {0x0069, 1, 27, 0, 0},
    634  1.1     lukem     {0x0074, 1, 28, 0, 0},
    635  1.1     lukem     {0x003a, 1, 29, _ure_ccl_setup, _URE_NUMDIGIT},
    636  1.1     lukem     {0x0072, 1, 30, 0, 0},
    637  1.1     lukem     {0x0061, 1, 31, 0, 0},
    638  1.1     lukem     {0x0070, 1, 32, 0, 0},
    639  1.1     lukem     {0x0068, 1, 33, 0, 0},
    640  1.1     lukem     {0x003a, 1, 34, _ure_ccl_setup, _URE_GRAPH_MASK},
    641  1.1     lukem     {0x006f, 1, 35, 0, 0},
    642  1.1     lukem     {0x0077, 1, 36, 0, 0},
    643  1.1     lukem     {0x0065, 1, 37, 0, 0},
    644  1.1     lukem     {0x0072, 1, 38, 0, 0},
    645  1.1     lukem     {0x003a, 1, 39, _ure_ccl_setup, _URE_LOWER},
    646  1.1     lukem     {0x0072, 2, 41, 0, 0},
    647  1.1     lukem     {0x0075, 1, 45, 0, 0},
    648  1.1     lukem     {0x0069, 1, 42, 0, 0},
    649  1.1     lukem     {0x006e, 1, 43, 0, 0},
    650  1.1     lukem     {0x0074, 1, 44, 0, 0},
    651  1.1     lukem     {0x003a, 1, 45, _ure_ccl_setup, _URE_PRINT_MASK},
    652  1.1     lukem     {0x006e, 1, 46, 0, 0},
    653  1.1     lukem     {0x0063, 1, 47, 0, 0},
    654  1.1     lukem     {0x0074, 1, 48, 0, 0},
    655  1.1     lukem     {0x003a, 1, 49, _ure_ccl_setup, _URE_PUNCT_MASK},
    656  1.1     lukem     {0x0070, 1, 50, 0, 0},
    657  1.1     lukem     {0x0061, 1, 51, 0, 0},
    658  1.1     lukem     {0x0063, 1, 52, 0, 0},
    659  1.1     lukem     {0x0065, 1, 53, 0, 0},
    660  1.1     lukem     {0x003a, 1, 54, _ure_space_setup, _URE_SPACE_MASK},
    661  1.1     lukem     {0x0070, 1, 55, 0, 0},
    662  1.1     lukem     {0x0070, 1, 56, 0, 0},
    663  1.1     lukem     {0x0065, 1, 57, 0, 0},
    664  1.1     lukem     {0x0072, 1, 58, 0, 0},
    665  1.1     lukem     {0x003a, 1, 59, _ure_ccl_setup, _URE_UPPER},
    666  1.1     lukem     {0x0064, 1, 60, 0, 0},
    667  1.1     lukem     {0x0069, 1, 61, 0, 0},
    668  1.1     lukem     {0x0067, 1, 62, 0, 0},
    669  1.1     lukem     {0x0069, 1, 63, 0, 0},
    670  1.1     lukem     {0x0074, 1, 64, 0, 0},
    671  1.1     lukem     {0x003a, 1, 65, _ure_xdigit_setup, 0},
    672  1.1     lukem };
    673  1.1     lukem 
    674  1.1     lukem /*
    675  1.1     lukem  * Probe for one of the POSIX colon delimited character classes in the static
    676  1.1     lukem  * trie.
    677  1.1     lukem  */
    678  1.1     lukem static unsigned long
    679  1.1     lukem _ure_posix_ccl(ucs2_t *cp, unsigned long limit, _ure_symtab_t *sym,
    680  1.1     lukem                _ure_buffer_t *b)
    681  1.1     lukem {
    682  1.1     lukem     int i;
    683  1.1     lukem     unsigned long n;
    684  1.1     lukem     _ure_trie_t *tp;
    685  1.1     lukem     ucs2_t *sp, *ep;
    686  1.1     lukem 
    687  1.1     lukem     /*
    688  1.1     lukem      * If the number of characters left is less than 7, then this cannot be
    689  1.1     lukem      * interpreted as one of the colon delimited classes.
    690  1.1     lukem      */
    691  1.1     lukem     if (limit < 7)
    692  1.1     lukem       return 0;
    693  1.1     lukem 
    694  1.1     lukem     sp = cp;
    695  1.1     lukem     ep = sp + limit;
    696  1.1     lukem     tp = cclass_trie;
    697  1.1     lukem     for (i = 0; sp < ep && i < 8; i++, sp++) {
    698  1.1     lukem         n = tp->len;
    699  1.1     lukem 
    700  1.1     lukem         for (; n > 0 && tp->key != *sp; tp++, n--) ;
    701  1.1     lukem 
    702  1.1     lukem         if (n == 0)
    703  1.1     lukem           return 0;
    704  1.1     lukem 
    705  1.1     lukem         if (*sp == ':' && (i == 6 || i == 7)) {
    706  1.1     lukem             sp++;
    707  1.1     lukem             break;
    708  1.1     lukem         }
    709  1.1     lukem         if (sp + 1 < ep)
    710  1.1     lukem           tp = cclass_trie + tp->next;
    711  1.1     lukem     }
    712  1.1     lukem     if (tp->func == 0)
    713  1.1     lukem       return 0;
    714  1.1     lukem 
    715  1.1     lukem     (*tp->func)(sym, tp->mask, b);
    716  1.1     lukem 
    717  1.1     lukem     return sp - cp;
    718  1.1     lukem }
    719  1.1     lukem 
    720  1.1     lukem /*
    721  1.1     lukem  * Construct a list of ranges and return the number of characters consumed.
    722  1.1     lukem  */
    723  1.1     lukem static unsigned long
    724  1.1     lukem _ure_cclass(ucs2_t *cp, unsigned long limit, _ure_symtab_t *symp,
    725  1.1     lukem             _ure_buffer_t *b)
    726  1.1     lukem {
    727  1.1     lukem     int range_end;
    728  1.1     lukem     unsigned long n;
    729  1.1     lukem     ucs2_t *sp, *ep;
    730  1.1     lukem     ucs4_t c, last;
    731  1.1     lukem     _ure_ccl_t *cclp;
    732  1.1     lukem     _ure_range_t range;
    733  1.1     lukem 
    734  1.1     lukem     sp = cp;
    735  1.1     lukem     ep = sp + limit;
    736  1.1     lukem 
    737  1.1     lukem     if (*sp == '^') {
    738  1.1     lukem       symp->type = _URE_NCCLASS;
    739  1.1     lukem       sp++;
    740  1.1     lukem     } else
    741  1.1     lukem       symp->type = _URE_CCLASS;
    742  1.1     lukem 
    743  1.1     lukem     for (last = 0, range_end = 0;
    744  1.1     lukem          b->error == _URE_OK && sp < ep && *sp != ']'; ) {
    745  1.1     lukem         c = *sp++;
    746  1.1     lukem         if (c == '\\') {
    747  1.1     lukem             if (sp == ep) {
    748  1.1     lukem                 /*
    749  1.1     lukem                  * The EOS was encountered when expecting the reverse solidus
    750  1.1     lukem                  * to be followed by the character it is escaping.  Set an
    751  1.1     lukem                  * error code and return the number of characters consumed up
    752  1.1     lukem                  * to this point.
    753  1.1     lukem                  */
    754  1.1     lukem                 b->error = _URE_UNEXPECTED_EOS;
    755  1.1     lukem                 return sp - cp;
    756  1.1     lukem             }
    757  1.1     lukem 
    758  1.1     lukem             c = *sp++;
    759  1.1     lukem             switch (c) {
    760  1.1     lukem               case 'a':
    761  1.1     lukem                 c = 0x07;
    762  1.1     lukem                 break;
    763  1.1     lukem               case 'b':
    764  1.1     lukem                 c = 0x08;
    765  1.1     lukem                 break;
    766  1.1     lukem               case 'f':
    767  1.1     lukem                 c = 0x0c;
    768  1.1     lukem                 break;
    769  1.1     lukem               case 'n':
    770  1.1     lukem                 c = 0x0a;
    771  1.1     lukem                 break;
    772  1.1     lukem               case 'r':
    773  1.1     lukem                 c = 0x0d;
    774  1.1     lukem                 break;
    775  1.1     lukem               case 't':
    776  1.1     lukem                 c = 0x09;
    777  1.1     lukem                 break;
    778  1.1     lukem               case 'v':
    779  1.1     lukem                 c = 0x0b;
    780  1.1     lukem                 break;
    781  1.1     lukem               case 'p':
    782  1.1     lukem               case 'P':
    783  1.1     lukem                 sp += _ure_prop_list(sp, ep - sp, &symp->props, b);
    784  1.1     lukem                 /*
    785  1.1     lukem                  * Invert the bit mask of the properties if this is a negated
    786  1.1     lukem                  * character class or if 'P' is used to specify a list of
    787  1.1     lukem                  * character properties that should *not* match in a
    788  1.1     lukem                  * character class.
    789  1.1     lukem                  */
    790  1.1     lukem                 if (c == 'P')
    791  1.1     lukem                   symp->props = ~symp->props;
    792  1.1     lukem                 continue;
    793  1.1     lukem                 break;
    794  1.1     lukem               case 'x':
    795  1.1     lukem               case 'X':
    796  1.1     lukem               case 'u':
    797  1.1     lukem               case 'U':
    798  1.1     lukem                 if (sp < ep &&
    799  1.1     lukem                     ((*sp >= '0' && *sp <= '9') ||
    800  1.1     lukem                      (*sp >= 'A' && *sp <= 'F') ||
    801  1.1     lukem                      (*sp >= 'a' && *sp <= 'f')))
    802  1.1     lukem                   sp += _ure_hex(sp, ep - sp, &c);
    803  1.1     lukem             }
    804  1.1     lukem         } else if (c == ':') {
    805  1.1     lukem             /*
    806  1.1     lukem              * Probe for a POSIX colon delimited character class.
    807  1.1     lukem              */
    808  1.1     lukem             sp--;
    809  1.1     lukem             if ((n = _ure_posix_ccl(sp, ep - sp, symp, b)) == 0)
    810  1.1     lukem               sp++;
    811  1.1     lukem             else {
    812  1.1     lukem                 sp += n;
    813  1.1     lukem                 continue;
    814  1.1     lukem             }
    815  1.1     lukem         }
    816  1.1     lukem 
    817  1.1     lukem         cclp = &symp->sym.ccl;
    818  1.1     lukem 
    819  1.1     lukem         /*
    820  1.1     lukem          * Check to see if the current character is a low surrogate that needs
    821  1.1     lukem          * to be combined with a preceding high surrogate.
    822  1.1     lukem          */
    823  1.1     lukem         if (last != 0) {
    824  1.1     lukem             if (c >= 0xdc00 && c <= 0xdfff)
    825  1.1     lukem               /*
    826  1.1     lukem                * Construct the UTF16 character code.
    827  1.1     lukem                */
    828  1.1     lukem               c = 0x10000 + (((last & 0x03ff) << 10) | (c & 0x03ff));
    829  1.1     lukem             else {
    830  1.1     lukem                 /*
    831  1.1     lukem                  * Add the isolated high surrogate to the range.
    832  1.1     lukem                  */
    833  1.1     lukem                 if (range_end == 1)
    834  1.1     lukem                   range.max_code = last & 0xffff;
    835  1.1     lukem                 else
    836  1.1     lukem                   range.min_code = range.max_code = last & 0xffff;
    837  1.1     lukem 
    838  1.1     lukem                 _ure_add_range(cclp, &range, b);
    839  1.1     lukem                 range_end = 0;
    840  1.1     lukem             }
    841  1.1     lukem         }
    842  1.1     lukem 
    843  1.1     lukem         /*
    844  1.1     lukem          * Clear the last character code.
    845  1.1     lukem          */
    846  1.1     lukem         last = 0;
    847  1.1     lukem 
    848  1.1     lukem         /*
    849  1.1     lukem          * This slightly awkward code handles the different cases needed to
    850  1.1     lukem          * construct a range.
    851  1.1     lukem          */
    852  1.1     lukem         if (c >= 0xd800 && c <= 0xdbff) {
    853  1.1     lukem             /*
    854  1.1     lukem              * If the high surrogate is followed by a range indicator, simply
    855  1.1     lukem              * add it as the range start.  Otherwise, save it in case the next
    856  1.1     lukem              * character is a low surrogate.
    857  1.1     lukem              */
    858  1.1     lukem             if (*sp == '-') {
    859  1.1     lukem                 sp++;
    860  1.1     lukem                 range.min_code = c;
    861  1.1     lukem                 range_end = 1;
    862  1.1     lukem             } else
    863  1.1     lukem               last = c;
    864  1.1     lukem         } else if (range_end == 1) {
    865  1.1     lukem             range.max_code = c;
    866  1.1     lukem             _ure_add_range(cclp, &range, b);
    867  1.1     lukem             range_end = 0;
    868  1.1     lukem         } else {
    869  1.1     lukem             range.min_code = range.max_code = c;
    870  1.1     lukem             if (*sp == '-') {
    871  1.1     lukem                 sp++;
    872  1.1     lukem                 range_end = 1;
    873  1.1     lukem             } else
    874  1.1     lukem               _ure_add_range(cclp, &range, b);
    875  1.1     lukem         }
    876  1.1     lukem     }
    877  1.1     lukem 
    878  1.1     lukem     if (sp < ep && *sp == ']')
    879  1.1     lukem       sp++;
    880  1.1     lukem     else
    881  1.1     lukem       /*
    882  1.1     lukem        * The parse was not terminated by the character class close symbol
    883  1.1     lukem        * (']'), so set an error code.
    884  1.1     lukem        */
    885  1.1     lukem       b->error = _URE_CCLASS_OPEN;
    886  1.1     lukem 
    887  1.1     lukem     return sp - cp;
    888  1.1     lukem }
    889  1.1     lukem 
    890  1.1     lukem /*
    891  1.1     lukem  * Probe for a low surrogate hex code.
    892  1.1     lukem  */
    893  1.1     lukem static unsigned long
    894  1.1     lukem _ure_probe_ls(ucs2_t *ls, unsigned long limit, ucs4_t *c)
    895  1.1     lukem {
    896  1.1     lukem     ucs4_t i, code;
    897  1.1     lukem     ucs2_t *sp, *ep;
    898  1.1     lukem 
    899  1.1     lukem     for (i = code = 0, sp = ls, ep = sp + limit; i < 4 && sp < ep; sp++) {
    900  1.1     lukem         if (*sp >= '0' && *sp <= '9')
    901  1.1     lukem           code = (code << 4) + (*sp - '0');
    902  1.1     lukem         else if (*sp >= 'A' && *sp <= 'F')
    903  1.1     lukem           code = (code << 4) + ((*sp - 'A') + 10);
    904  1.1     lukem         else if (*sp >= 'a' && *sp <= 'f')
    905  1.1     lukem           code = (code << 4) + ((*sp - 'a') + 10);
    906  1.1     lukem         else
    907  1.1     lukem           break;
    908  1.1     lukem     }
    909  1.1     lukem 
    910  1.1     lukem     *c = code;
    911  1.1     lukem     return (0xdc00 <= code && code <= 0xdfff) ? sp - ls : 0;
    912  1.1     lukem }
    913  1.1     lukem 
    914  1.1     lukem static unsigned long
    915  1.1     lukem _ure_compile_symbol(ucs2_t *sym, unsigned long limit, _ure_symtab_t *symp,
    916  1.1     lukem                     _ure_buffer_t *b)
    917  1.1     lukem {
    918  1.1     lukem     ucs4_t c;
    919  1.1     lukem     ucs2_t *sp, *ep;
    920  1.1     lukem 
    921  1.1     lukem     sp = sym;
    922  1.1     lukem     ep = sym + limit;
    923  1.1     lukem 
    924  1.1     lukem     if ((c = *sp++) == '\\') {
    925  1.1     lukem 
    926  1.1     lukem         if (sp == ep) {
    927  1.1     lukem             /*
    928  1.1     lukem              * The EOS was encountered when expecting the reverse solidus to
    929  1.1     lukem              * be followed by the character it is escaping.  Set an error code
    930  1.1     lukem              * and return the number of characters consumed up to this point.
    931  1.1     lukem              */
    932  1.1     lukem             b->error = _URE_UNEXPECTED_EOS;
    933  1.1     lukem             return sp - sym;
    934  1.1     lukem         }
    935  1.1     lukem 
    936  1.1     lukem         c = *sp++;
    937  1.1     lukem         switch (c) {
    938  1.1     lukem           case 'p':
    939  1.1     lukem           case 'P':
    940  1.1     lukem             symp->type = (c == 'p') ? _URE_CCLASS : _URE_NCCLASS;
    941  1.1     lukem             sp += _ure_prop_list(sp, ep - sp, &symp->props, b);
    942  1.1     lukem             break;
    943  1.1     lukem           case 'a':
    944  1.1     lukem             symp->type = _URE_CHAR;
    945  1.1     lukem             symp->sym.chr = 0x07;
    946  1.1     lukem             break;
    947  1.1     lukem           case 'b':
    948  1.1     lukem             symp->type = _URE_CHAR;
    949  1.1     lukem             symp->sym.chr = 0x08;
    950  1.1     lukem             break;
    951  1.1     lukem           case 'f':
    952  1.1     lukem             symp->type = _URE_CHAR;
    953  1.1     lukem             symp->sym.chr = 0x0c;
    954  1.1     lukem             break;
    955  1.1     lukem           case 'n':
    956  1.1     lukem             symp->type = _URE_CHAR;
    957  1.1     lukem             symp->sym.chr = 0x0a;
    958  1.1     lukem             break;
    959  1.1     lukem           case 'r':
    960  1.1     lukem             symp->type = _URE_CHAR;
    961  1.1     lukem             symp->sym.chr = 0x0d;
    962  1.1     lukem             break;
    963  1.1     lukem           case 't':
    964  1.1     lukem             symp->type = _URE_CHAR;
    965  1.1     lukem             symp->sym.chr = 0x09;
    966  1.1     lukem             break;
    967  1.1     lukem           case 'v':
    968  1.1     lukem             symp->type = _URE_CHAR;
    969  1.1     lukem             symp->sym.chr = 0x0b;
    970  1.1     lukem             break;
    971  1.1     lukem           case 'x':
    972  1.1     lukem           case 'X':
    973  1.1     lukem           case 'u':
    974  1.1     lukem           case 'U':
    975  1.1     lukem             /*
    976  1.1     lukem              * Collect between 1 and 4 digits representing a UCS2 code.  Fall
    977  1.1     lukem              * through to the next case.
    978  1.1     lukem              */
    979  1.1     lukem             if (sp < ep &&
    980  1.1     lukem                 ((*sp >= '0' && *sp <= '9') ||
    981  1.1     lukem                  (*sp >= 'A' && *sp <= 'F') ||
    982  1.1     lukem                  (*sp >= 'a' && *sp <= 'f')))
    983  1.1     lukem               sp += _ure_hex(sp, ep - sp, &c);
    984  1.1     lukem             /* FALLTHROUGH */
    985  1.1     lukem           default:
    986  1.1     lukem             /*
    987  1.1     lukem              * Simply add an escaped character here.
    988  1.1     lukem              */
    989  1.1     lukem             symp->type = _URE_CHAR;
    990  1.1     lukem             symp->sym.chr = c;
    991  1.1     lukem         }
    992  1.1     lukem     } else if (c == '^' || c == '$')
    993  1.1     lukem       /*
    994  1.1     lukem        * Handle the BOL and EOL anchors.  This actually consists simply of
    995  1.1     lukem        * setting a flag that indicates that the user supplied anchor match
    996  1.1     lukem        * function should be called.  This needs to be done instead of simply
    997  1.1     lukem        * matching line/paragraph separators because beginning-of-text and
    998  1.1     lukem        * end-of-text tests are needed as well.
    999  1.1     lukem        */
   1000  1.1     lukem       symp->type = (c == '^') ? _URE_BOL_ANCHOR : _URE_EOL_ANCHOR;
   1001  1.1     lukem     else if (c == '[')
   1002  1.1     lukem       /*
   1003  1.1     lukem        * Construct a character class.
   1004  1.1     lukem        */
   1005  1.1     lukem       sp += _ure_cclass(sp, ep - sp, symp, b);
   1006  1.1     lukem     else if (c == '.')
   1007  1.1     lukem       symp->type = _URE_ANY_CHAR;
   1008  1.1     lukem     else {
   1009  1.1     lukem         symp->type = _URE_CHAR;
   1010  1.1     lukem         symp->sym.chr = c;
   1011  1.1     lukem     }
   1012  1.1     lukem 
   1013  1.1     lukem     /*
   1014  1.1     lukem      * If the symbol type happens to be a character and is a high surrogate,
   1015  1.1     lukem      * then probe forward to see if it is followed by a low surrogate that
   1016  1.1     lukem      * needs to be added.
   1017  1.1     lukem      */
   1018  1.1     lukem     if (sp < ep && symp->type == _URE_CHAR &&
   1019  1.1     lukem         0xd800 <= symp->sym.chr && symp->sym.chr <= 0xdbff) {
   1020  1.1     lukem 
   1021  1.1     lukem         if (0xdc00 <= *sp && *sp <= 0xdfff) {
   1022  1.1     lukem             symp->sym.chr = 0x10000 + (((symp->sym.chr & 0x03ff) << 10) |
   1023  1.1     lukem                                        (*sp & 0x03ff));
   1024  1.1     lukem             sp++;
   1025  1.1     lukem         } else if (*sp == '\\' && (*(sp + 1) == 'x' || *(sp + 1) == 'X' ||
   1026  1.1     lukem                                  *(sp + 1) == 'u' || *(sp + 1) == 'U')) {
   1027  1.1     lukem             sp += _ure_probe_ls(sp + 2, ep - (sp + 2), &c);
   1028  1.1     lukem             if (0xdc00 <= c && c <= 0xdfff) {
   1029  1.1     lukem                 /*
   1030  1.1     lukem                  * Take into account the \[xu] in front of the hex code.
   1031  1.1     lukem                  */
   1032  1.1     lukem                 sp += 2;
   1033  1.1     lukem                 symp->sym.chr = 0x10000 + (((symp->sym.chr & 0x03ff) << 10) |
   1034  1.1     lukem                                            (c & 0x03ff));
   1035  1.1     lukem             }
   1036  1.1     lukem         }
   1037  1.1     lukem     }
   1038  1.1     lukem 
   1039  1.1     lukem     /*
   1040  1.1     lukem      * Last, make sure any _URE_CHAR type symbols are changed to lower case if
   1041  1.1     lukem      * the `casefold' flag is set.
   1042  1.1     lukem      */
   1043  1.1     lukem     if ((b->flags & _URE_DFA_CASEFOLD) && symp->type == _URE_CHAR)
   1044  1.1     lukem       symp->sym.chr = _ure_tolower(symp->sym.chr);
   1045  1.1     lukem 
   1046  1.1     lukem     /*
   1047  1.1     lukem      * If the symbol constructed is anything other than one of the anchors,
   1048  1.1     lukem      * make sure the _URE_DFA_BLANKLINE flag is removed.
   1049  1.1     lukem      */
   1050  1.1     lukem     if (symp->type != _URE_BOL_ANCHOR && symp->type != _URE_EOL_ANCHOR)
   1051  1.1     lukem       b->flags &= ~_URE_DFA_BLANKLINE;
   1052  1.1     lukem 
   1053  1.1     lukem     /*
   1054  1.1     lukem      * Return the number of characters consumed.
   1055  1.1     lukem      */
   1056  1.1     lukem     return sp - sym;
   1057  1.1     lukem }
   1058  1.1     lukem 
   1059  1.1     lukem static int
   1060  1.1     lukem _ure_sym_neq(_ure_symtab_t *a, _ure_symtab_t *b)
   1061  1.1     lukem {
   1062  1.1     lukem     if (a->type != b->type || a->mods != b->mods || a->props != b->props)
   1063  1.1     lukem       return 1;
   1064  1.1     lukem 
   1065  1.1     lukem     if (a->type == _URE_CCLASS || a->type == _URE_NCCLASS) {
   1066  1.1     lukem         if (a->sym.ccl.ranges_used != b->sym.ccl.ranges_used)
   1067  1.1     lukem           return 1;
   1068  1.1     lukem         if (a->sym.ccl.ranges_used > 0 &&
   1069  1.1     lukem             memcmp((char *) a->sym.ccl.ranges, (char *) b->sym.ccl.ranges,
   1070  1.1     lukem                    sizeof(_ure_range_t) * a->sym.ccl.ranges_used) != 0)
   1071  1.1     lukem           return 1;
   1072  1.1     lukem     } else if (a->type == _URE_CHAR && a->sym.chr != b->sym.chr)
   1073  1.1     lukem       return 1;
   1074  1.1     lukem     return 0;
   1075  1.1     lukem }
   1076  1.1     lukem 
   1077  1.1     lukem /*
   1078  1.1     lukem  * Construct a symbol, but only keep unique symbols.
   1079  1.1     lukem  */
   1080  1.1     lukem static ucs2_t
   1081  1.1     lukem _ure_make_symbol(ucs2_t *sym, unsigned long limit, unsigned long *consumed,
   1082  1.1     lukem                  _ure_buffer_t *b)
   1083  1.1     lukem {
   1084  1.1     lukem     ucs2_t i;
   1085  1.1     lukem     _ure_symtab_t *sp, symbol;
   1086  1.1     lukem 
   1087  1.1     lukem     /*
   1088  1.1     lukem      * Build the next symbol so we can test to see if it is already in the
   1089  1.1     lukem      * symbol table.
   1090  1.1     lukem      */
   1091  1.1     lukem     (void) memset((char *) &symbol, '\0', sizeof(_ure_symtab_t));
   1092  1.1     lukem     *consumed = _ure_compile_symbol(sym, limit, &symbol, b);
   1093  1.1     lukem 
   1094  1.1     lukem     /*
   1095  1.1     lukem      * Check to see if the symbol exists.
   1096  1.1     lukem      */
   1097  1.1     lukem     for (i = 0, sp = b->symtab;
   1098  1.1     lukem          i < b->symtab_used && _ure_sym_neq(&symbol, sp); i++, sp++) ;
   1099  1.1     lukem 
   1100  1.1     lukem     if (i < b->symtab_used) {
   1101  1.1     lukem         /*
   1102  1.1     lukem          * Free up any ranges used for the symbol.
   1103  1.1     lukem          */
   1104  1.1     lukem         if ((symbol.type == _URE_CCLASS || symbol.type == _URE_NCCLASS) &&
   1105  1.1     lukem             symbol.sym.ccl.ranges_size > 0)
   1106  1.1     lukem           free((char *) symbol.sym.ccl.ranges);
   1107  1.1     lukem 
   1108  1.1     lukem         return b->symtab[i].id;
   1109  1.1     lukem     }
   1110  1.1     lukem 
   1111  1.1     lukem     /*
   1112  1.1     lukem      * Need to add the new symbol.
   1113  1.1     lukem      */
   1114  1.1     lukem     if (b->symtab_used == b->symtab_size) {
   1115  1.1     lukem         if (b->symtab_size == 0)
   1116  1.1     lukem           b->symtab = (_ure_symtab_t *) malloc(sizeof(_ure_symtab_t) << 3);
   1117  1.1     lukem         else
   1118  1.1     lukem           b->symtab = (_ure_symtab_t *)
   1119  1.1     lukem               realloc((char *) b->symtab,
   1120  1.1     lukem                       sizeof(_ure_symtab_t) * (b->symtab_size + 8));
   1121  1.1     lukem         sp = b->symtab + b->symtab_size;
   1122  1.1     lukem         (void) memset((char *) sp, '\0', sizeof(_ure_symtab_t) << 3);
   1123  1.1     lukem         b->symtab_size += 8;
   1124  1.1     lukem     }
   1125  1.1     lukem 
   1126  1.1     lukem     symbol.id = b->symtab_used++;
   1127  1.1     lukem     (void) AC_MEMCPY((char *) &b->symtab[symbol.id], (char *) &symbol,
   1128  1.1     lukem                   sizeof(_ure_symtab_t));
   1129  1.1     lukem 
   1130  1.1     lukem     return symbol.id;
   1131  1.1     lukem }
   1132  1.1     lukem 
   1133  1.1     lukem /*************************************************************************
   1134  1.1     lukem  *
   1135  1.1     lukem  * End symbol parse functions.
   1136  1.1     lukem  *
   1137  1.1     lukem  *************************************************************************/
   1138  1.1     lukem 
   1139  1.1     lukem static ucs2_t
   1140  1.1     lukem _ure_make_expr(ucs2_t type, ucs2_t lhs, ucs2_t rhs, _ure_buffer_t *b)
   1141  1.1     lukem {
   1142  1.1     lukem     ucs2_t i;
   1143  1.1     lukem 
   1144  1.1     lukem     if (b == 0)
   1145  1.1     lukem       return _URE_NOOP;
   1146  1.1     lukem 
   1147  1.1     lukem     /*
   1148  1.1     lukem      * Determine if the expression already exists or not.
   1149  1.1     lukem      */
   1150  1.1     lukem     for (i = 0; i < b->expr_used; i++) {
   1151  1.1     lukem         if (b->expr[i].type == type && b->expr[i].lhs == lhs &&
   1152  1.1     lukem             b->expr[i].rhs == rhs)
   1153  1.1     lukem           break;
   1154  1.1     lukem     }
   1155  1.1     lukem     if (i < b->expr_used)
   1156  1.1     lukem       return i;
   1157  1.1     lukem 
   1158  1.1     lukem     /*
   1159  1.1     lukem      * Need to add a new expression.
   1160  1.1     lukem      */
   1161  1.1     lukem     if (b->expr_used == b->expr_size) {
   1162  1.1     lukem         if (b->expr_size == 0)
   1163  1.1     lukem           b->expr = (_ure_elt_t *) malloc(sizeof(_ure_elt_t) << 3);
   1164  1.1     lukem         else
   1165  1.1     lukem           b->expr = (_ure_elt_t *)
   1166  1.1     lukem               realloc((char *) b->expr,
   1167  1.1     lukem                       sizeof(_ure_elt_t) * (b->expr_size + 8));
   1168  1.1     lukem         b->expr_size += 8;
   1169  1.1     lukem     }
   1170  1.1     lukem 
   1171  1.1     lukem     b->expr[b->expr_used].onstack = 0;
   1172  1.1     lukem     b->expr[b->expr_used].type = type;
   1173  1.1     lukem     b->expr[b->expr_used].lhs = lhs;
   1174  1.1     lukem     b->expr[b->expr_used].rhs = rhs;
   1175  1.1     lukem 
   1176  1.1     lukem     return b->expr_used++;
   1177  1.1     lukem }
   1178  1.1     lukem 
   1179  1.1     lukem static unsigned char spmap[] = {
   1180  1.1     lukem     0x00, 0x00, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00,
   1181  1.1     lukem     0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   1182  1.1     lukem     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   1183  1.1     lukem };
   1184  1.1     lukem 
   1185  1.1     lukem #define _ure_isspecial(cc) ((cc) > 0x20 && (cc) < 0x7f && \
   1186  1.1     lukem                             (spmap[(cc) >> 3] & (1 << ((cc) & 7))))
   1187  1.1     lukem 
   1188  1.1     lukem /*
   1189  1.1     lukem  * Convert the regular expression into an NFA in a form that will be easy to
   1190  1.1     lukem  * reduce to a DFA.  The starting state for the reduction will be returned.
   1191  1.1     lukem  */
   1192  1.1     lukem static ucs2_t
   1193  1.1     lukem _ure_re2nfa(ucs2_t *re, unsigned long relen, _ure_buffer_t *b)
   1194  1.1     lukem {
   1195  1.1     lukem     ucs2_t c, state, top, sym, *sp, *ep;
   1196  1.1     lukem     unsigned long used;
   1197  1.1     lukem 
   1198  1.1     lukem     state = _URE_NOOP;
   1199  1.1     lukem 
   1200  1.1     lukem     sp = re;
   1201  1.1     lukem     ep = sp + relen;
   1202  1.1     lukem     while (b->error == _URE_OK && sp < ep) {
   1203  1.1     lukem         c = *sp++;
   1204  1.1     lukem         switch (c) {
   1205  1.1     lukem           case '(':
   1206  1.1     lukem             _ure_push(_URE_PAREN, b);
   1207  1.1     lukem             break;
   1208  1.1     lukem           case ')':
   1209  1.1     lukem             /*
   1210  1.1     lukem              * Check for the case of too many close parentheses.
   1211  1.1     lukem              */
   1212  1.1     lukem             if (_ure_peek(b) == _URE_NOOP) {
   1213  1.1     lukem                 b->error = _URE_UNBALANCED_GROUP;
   1214  1.1     lukem                 break;
   1215  1.1     lukem             }
   1216  1.1     lukem 
   1217  1.1     lukem             while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR)
   1218  1.1     lukem               /*
   1219  1.1     lukem                * Make an expression with the AND or OR operator and its right
   1220  1.1     lukem                * hand side.
   1221  1.1     lukem                */
   1222  1.1     lukem               state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b);
   1223  1.1     lukem 
   1224  1.1     lukem             /*
   1225  1.1     lukem              * Remove the _URE_PAREN off the stack.
   1226  1.1     lukem              */
   1227  1.1     lukem             (void) _ure_pop(b);
   1228  1.1     lukem             break;
   1229  1.1     lukem           case '*':
   1230  1.1     lukem             state = _ure_make_expr(_URE_STAR, state, _URE_NOOP, b);
   1231  1.1     lukem             break;
   1232  1.1     lukem           case '+':
   1233  1.1     lukem             state = _ure_make_expr(_URE_PLUS, state, _URE_NOOP, b);
   1234  1.1     lukem             break;
   1235  1.1     lukem           case '?':
   1236  1.1     lukem             state = _ure_make_expr(_URE_QUEST, state, _URE_NOOP, b);
   1237  1.1     lukem             break;
   1238  1.1     lukem           case '|':
   1239  1.1     lukem             while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR)
   1240  1.1     lukem               /*
   1241  1.1     lukem                * Make an expression with the AND or OR operator and its right
   1242  1.1     lukem                * hand side.
   1243  1.1     lukem                */
   1244  1.1     lukem               state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b);
   1245  1.1     lukem 
   1246  1.1     lukem             _ure_push(state, b);
   1247  1.1     lukem             _ure_push(_URE_OR, b);
   1248  1.1     lukem             break;
   1249  1.1     lukem           default:
   1250  1.1     lukem             sp--;
   1251  1.1     lukem             sym = _ure_make_symbol(sp, ep - sp, &used, b);
   1252  1.1     lukem             sp += used;
   1253  1.1     lukem             state = _ure_make_expr(_URE_SYMBOL, sym, _URE_NOOP, b);
   1254  1.1     lukem             break;
   1255  1.1     lukem         }
   1256  1.1     lukem 
   1257  1.1     lukem         if (c != '(' && c != '|' && sp < ep &&
   1258  1.1     lukem             (!_ure_isspecial(*sp) || *sp == '(')) {
   1259  1.1     lukem             _ure_push(state, b);
   1260  1.1     lukem             _ure_push(_URE_AND, b);
   1261  1.1     lukem         }
   1262  1.1     lukem     }
   1263  1.1     lukem     while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR)
   1264  1.1     lukem       /*
   1265  1.1     lukem        * Make an expression with the AND or OR operator and its right
   1266  1.1     lukem        * hand side.
   1267  1.1     lukem        */
   1268  1.1     lukem       state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b);
   1269  1.1     lukem 
   1270  1.1     lukem     if (b->stack.slist_used > 0)
   1271  1.1     lukem       b->error = _URE_UNBALANCED_GROUP;
   1272  1.1     lukem 
   1273  1.1     lukem     return (b->error == _URE_OK) ? state : _URE_NOOP;
   1274  1.1     lukem }
   1275  1.1     lukem 
   1276  1.1     lukem static void
   1277  1.1     lukem _ure_add_symstate(ucs2_t sym, ucs2_t state, _ure_buffer_t *b)
   1278  1.1     lukem {
   1279  1.1     lukem     ucs2_t i, *stp;
   1280  1.1     lukem     _ure_symtab_t *sp;
   1281  1.1     lukem 
   1282  1.1     lukem     /*
   1283  1.1     lukem      * Locate the symbol in the symbol table so the state can be added.
   1284  1.1     lukem      * If the symbol doesn't exist, then a real problem exists.
   1285  1.1     lukem      */
   1286  1.1     lukem     for (i = 0, sp = b->symtab; i < b->symtab_used && sym != sp->id;
   1287  1.1     lukem          i++, sp++) ;
   1288  1.1     lukem 
   1289  1.1     lukem     /*
   1290  1.1     lukem      * Now find out if the state exists in the symbol's state list.
   1291  1.1     lukem      */
   1292  1.1     lukem     for (i = 0, stp = sp->states.slist;
   1293  1.1     lukem          i < sp->states.slist_used && state > *stp; i++, stp++) ;
   1294  1.1     lukem 
   1295  1.1     lukem     if (i == sp->states.slist_used || state < *stp) {
   1296  1.1     lukem         /*
   1297  1.1     lukem          * Need to add the state in order.
   1298  1.1     lukem          */
   1299  1.1     lukem         if (sp->states.slist_used == sp->states.slist_size) {
   1300  1.1     lukem             if (sp->states.slist_size == 0)
   1301  1.1     lukem               sp->states.slist = (ucs2_t *) malloc(sizeof(ucs2_t) << 3);
   1302  1.1     lukem             else
   1303  1.1     lukem               sp->states.slist = (ucs2_t *)
   1304  1.1     lukem                   realloc((char *) sp->states.slist,
   1305  1.1     lukem                           sizeof(ucs2_t) * (sp->states.slist_size + 8));
   1306  1.1     lukem             sp->states.slist_size += 8;
   1307  1.1     lukem         }
   1308  1.1     lukem         if (i < sp->states.slist_used)
   1309  1.1     lukem           (void) _ure_memmove((char *) (sp->states.slist + i + 1),
   1310  1.1     lukem                               (char *) (sp->states.slist + i),
   1311  1.1     lukem                               sizeof(ucs2_t) * (sp->states.slist_used - i));
   1312  1.1     lukem         sp->states.slist[i] = state;
   1313  1.1     lukem         sp->states.slist_used++;
   1314  1.1     lukem     }
   1315  1.1     lukem }
   1316  1.1     lukem 
   1317  1.1     lukem static ucs2_t
   1318  1.1     lukem _ure_add_state(ucs2_t nstates, ucs2_t *states, _ure_buffer_t *b)
   1319  1.1     lukem {
   1320  1.1     lukem     ucs2_t i;
   1321  1.1     lukem     _ure_state_t *sp;
   1322  1.1     lukem 
   1323  1.1     lukem     for (i = 0, sp = b->states.states; i < b->states.states_used; i++, sp++) {
   1324  1.1     lukem         if (sp->st.slist_used == nstates &&
   1325  1.1     lukem             memcmp((char *) states, (char *) sp->st.slist,
   1326  1.1     lukem                    sizeof(ucs2_t) * nstates) == 0)
   1327  1.1     lukem           break;
   1328  1.1     lukem     }
   1329  1.1     lukem 
   1330  1.1     lukem     if (i == b->states.states_used) {
   1331  1.1     lukem         /*
   1332  1.1     lukem          * Need to add a new DFA state (set of NFA states).
   1333  1.1     lukem          */
   1334  1.1     lukem         if (b->states.states_used == b->states.states_size) {
   1335  1.1     lukem             if (b->states.states_size == 0)
   1336  1.1     lukem               b->states.states = (_ure_state_t *)
   1337  1.1     lukem                   malloc(sizeof(_ure_state_t) << 3);
   1338  1.1     lukem             else
   1339  1.1     lukem               b->states.states = (_ure_state_t *)
   1340  1.1     lukem                   realloc((char *) b->states.states,
   1341  1.1     lukem                           sizeof(_ure_state_t) * (b->states.states_size + 8));
   1342  1.1     lukem             sp = b->states.states + b->states.states_size;
   1343  1.1     lukem             (void) memset((char *) sp, '\0', sizeof(_ure_state_t) << 3);
   1344  1.1     lukem             b->states.states_size += 8;
   1345  1.1     lukem         }
   1346  1.1     lukem 
   1347  1.1     lukem         sp = b->states.states + b->states.states_used++;
   1348  1.1     lukem         sp->id = i;
   1349  1.1     lukem 
   1350  1.1     lukem         if (sp->st.slist_used + nstates > sp->st.slist_size) {
   1351  1.1     lukem             if (sp->st.slist_size == 0)
   1352  1.1     lukem               sp->st.slist = (ucs2_t *)
   1353  1.1     lukem                   malloc(sizeof(ucs2_t) * (sp->st.slist_used + nstates));
   1354  1.1     lukem             else
   1355  1.1     lukem               sp->st.slist = (ucs2_t *)
   1356  1.1     lukem                   realloc((char *) sp->st.slist,
   1357  1.1     lukem                           sizeof(ucs2_t) * (sp->st.slist_used + nstates));
   1358  1.1     lukem             sp->st.slist_size = sp->st.slist_used + nstates;
   1359  1.1     lukem         }
   1360  1.1     lukem         sp->st.slist_used = nstates;
   1361  1.1     lukem         (void) AC_MEMCPY((char *) sp->st.slist, (char *) states,
   1362  1.1     lukem                       sizeof(ucs2_t) * nstates);
   1363  1.1     lukem     }
   1364  1.1     lukem 
   1365  1.1     lukem     /*
   1366  1.1     lukem      * Return the ID of the DFA state representing a group of NFA states.
   1367  1.1     lukem      */
   1368  1.1     lukem     return i;
   1369  1.1     lukem }
   1370  1.1     lukem 
   1371  1.1     lukem static void
   1372  1.1     lukem _ure_reduce(ucs2_t start, _ure_buffer_t *b)
   1373  1.1     lukem {
   1374  1.1     lukem     ucs2_t i, j, state, eval, syms, rhs;
   1375  1.1     lukem     ucs2_t s1, s2, ns1, ns2;
   1376  1.1     lukem     _ure_state_t *sp;
   1377  1.1     lukem     _ure_symtab_t *smp;
   1378  1.1     lukem 
   1379  1.1     lukem     b->reducing = 1;
   1380  1.1     lukem 
   1381  1.1     lukem     /*
   1382  1.1     lukem      * Add the starting state for the reduction.
   1383  1.1     lukem      */
   1384  1.1     lukem     _ure_add_state(1, &start, b);
   1385  1.1     lukem 
   1386  1.1     lukem     /*
   1387  1.1     lukem      * Process each set of NFA states that get created.
   1388  1.1     lukem      */
   1389  1.1     lukem     for (i = 0; i < b->states.states_used; i++) {
   1390  1.1     lukem         sp = b->states.states + i;
   1391  1.1     lukem 
   1392  1.1     lukem         /*
   1393  1.1     lukem          * Push the current states on the stack.
   1394  1.1     lukem          */
   1395  1.1     lukem         for (j = 0; j < sp->st.slist_used; j++)
   1396  1.1     lukem           _ure_push(sp->st.slist[j], b);
   1397  1.1     lukem 
   1398  1.1     lukem         /*
   1399  1.1     lukem          * Reduce the NFA states.
   1400  1.1     lukem          */
   1401  1.1     lukem         for (j = sp->accepting = syms = 0; j < b->stack.slist_used; j++) {
   1402  1.1     lukem             state = b->stack.slist[j];
   1403  1.1     lukem             eval = 1;
   1404  1.1     lukem 
   1405  1.1     lukem             /*
   1406  1.1     lukem              * This inner loop is the iterative equivalent of recursively
   1407  1.1     lukem              * reducing subexpressions generated as a result of a reduction.
   1408  1.1     lukem              */
   1409  1.1     lukem             while (eval) {
   1410  1.1     lukem                 switch (b->expr[state].type) {
   1411  1.1     lukem                   case _URE_SYMBOL:
   1412  1.1     lukem                     ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b);
   1413  1.1     lukem                     _ure_add_symstate(b->expr[state].lhs, ns1, b);
   1414  1.1     lukem                     syms++;
   1415  1.1     lukem                     eval = 0;
   1416  1.1     lukem                     break;
   1417  1.1     lukem                   case _URE_ONE:
   1418  1.1     lukem                     sp->accepting = 1;
   1419  1.1     lukem                     eval = 0;
   1420  1.1     lukem                     break;
   1421  1.1     lukem                   case _URE_QUEST:
   1422  1.1     lukem                     s1 = b->expr[state].lhs;
   1423  1.1     lukem                     ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b);
   1424  1.1     lukem                     state = _ure_make_expr(_URE_OR, ns1, s1, b);
   1425  1.1     lukem                     break;
   1426  1.1     lukem                   case _URE_PLUS:
   1427  1.1     lukem                     s1 = b->expr[state].lhs;
   1428  1.1     lukem                     ns1 = _ure_make_expr(_URE_STAR, s1, _URE_NOOP, b);
   1429  1.1     lukem                     state = _ure_make_expr(_URE_AND, s1, ns1, b);
   1430  1.1     lukem                     break;
   1431  1.1     lukem                   case _URE_STAR:
   1432  1.1     lukem                     s1 = b->expr[state].lhs;
   1433  1.1     lukem                     ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b);
   1434  1.1     lukem                     ns2 = _ure_make_expr(_URE_PLUS, s1, _URE_NOOP, b);
   1435  1.1     lukem                     state = _ure_make_expr(_URE_OR, ns1, ns2, b);
   1436  1.1     lukem                     break;
   1437  1.1     lukem                   case _URE_OR:
   1438  1.1     lukem                     s1 = b->expr[state].lhs;
   1439  1.1     lukem                     s2 = b->expr[state].rhs;
   1440  1.1     lukem                     _ure_push(s1, b);
   1441  1.1     lukem                     _ure_push(s2, b);
   1442  1.1     lukem                     eval = 0;
   1443  1.1     lukem                     break;
   1444  1.1     lukem                   case _URE_AND:
   1445  1.1     lukem                     s1 = b->expr[state].lhs;
   1446  1.1     lukem                     s2 = b->expr[state].rhs;
   1447  1.1     lukem                     switch (b->expr[s1].type) {
   1448  1.1     lukem                       case _URE_SYMBOL:
   1449  1.1     lukem                         _ure_add_symstate(b->expr[s1].lhs, s2, b);
   1450  1.1     lukem                         syms++;
   1451  1.1     lukem                         eval = 0;
   1452  1.1     lukem                         break;
   1453  1.1     lukem                       case _URE_ONE:
   1454  1.1     lukem                         state = s2;
   1455  1.1     lukem                         break;
   1456  1.1     lukem                       case _URE_QUEST:
   1457  1.1     lukem                         ns1 = b->expr[s1].lhs;
   1458  1.1     lukem                         ns2 = _ure_make_expr(_URE_AND, ns1, s2, b);
   1459  1.1     lukem                         state = _ure_make_expr(_URE_OR, s2, ns2, b);
   1460  1.1     lukem                         break;
   1461  1.1     lukem                       case _URE_PLUS:
   1462  1.1     lukem                         ns1 = b->expr[s1].lhs;
   1463  1.1     lukem                         ns2 = _ure_make_expr(_URE_OR, s2, state, b);
   1464  1.1     lukem                         state = _ure_make_expr(_URE_AND, ns1, ns2, b);
   1465  1.1     lukem                         break;
   1466  1.1     lukem                       case _URE_STAR:
   1467  1.1     lukem                         ns1 = b->expr[s1].lhs;
   1468  1.1     lukem                         ns2 = _ure_make_expr(_URE_AND, ns1, state, b);
   1469  1.1     lukem                         state = _ure_make_expr(_URE_OR, s2, ns2, b);
   1470  1.1     lukem                         break;
   1471  1.1     lukem                       case _URE_OR:
   1472  1.1     lukem                         ns1 = b->expr[s1].lhs;
   1473  1.1     lukem                         ns2 = b->expr[s1].rhs;
   1474  1.1     lukem                         ns1 = _ure_make_expr(_URE_AND, ns1, s2, b);
   1475  1.1     lukem                         ns2 = _ure_make_expr(_URE_AND, ns2, s2, b);
   1476  1.1     lukem                         state = _ure_make_expr(_URE_OR, ns1, ns2, b);
   1477  1.1     lukem                         break;
   1478  1.1     lukem                       case _URE_AND:
   1479  1.1     lukem                         ns1 = b->expr[s1].lhs;
   1480  1.1     lukem                         ns2 = b->expr[s1].rhs;
   1481  1.1     lukem                         ns2 = _ure_make_expr(_URE_AND, ns2, s2, b);
   1482  1.1     lukem                         state = _ure_make_expr(_URE_AND, ns1, ns2, b);
   1483  1.1     lukem                         break;
   1484  1.1     lukem                     }
   1485  1.1     lukem                 }
   1486  1.1     lukem             }
   1487  1.1     lukem         }
   1488  1.1     lukem 
   1489  1.1     lukem         /*
   1490  1.1     lukem          * Clear the state stack.
   1491  1.1     lukem          */
   1492  1.1     lukem         while (_ure_pop(b) != _URE_NOOP) ;
   1493  1.1     lukem 
   1494  1.1     lukem         /*
   1495  1.1     lukem          * Reset the state pointer because the reduction may have moved it
   1496  1.1     lukem          * during a reallocation.
   1497  1.1     lukem          */
   1498  1.1     lukem         sp = b->states.states + i;
   1499  1.1     lukem 
   1500  1.1     lukem         /*
   1501  1.1     lukem          * Generate the DFA states for the symbols collected during the
   1502  1.1     lukem          * current reduction.
   1503  1.1     lukem          */
   1504  1.1     lukem         if (sp->trans_used + syms > sp->trans_size) {
   1505  1.1     lukem             if (sp->trans_size == 0)
   1506  1.1     lukem               sp->trans = (_ure_elt_t *)
   1507  1.1     lukem                   malloc(sizeof(_ure_elt_t) * (sp->trans_used + syms));
   1508  1.1     lukem             else
   1509  1.1     lukem               sp->trans = (_ure_elt_t *)
   1510  1.1     lukem                   realloc((char *) sp->trans,
   1511  1.1     lukem                           sizeof(_ure_elt_t) * (sp->trans_used + syms));
   1512  1.1     lukem             sp->trans_size = sp->trans_used + syms;
   1513  1.1     lukem         }
   1514  1.1     lukem 
   1515  1.1     lukem         /*
   1516  1.1     lukem          * Go through the symbol table and generate the DFA state transitions
   1517  1.1     lukem          * for each symbol that has collected NFA states.
   1518  1.1     lukem          */
   1519  1.1     lukem         for (j = syms = 0, smp = b->symtab; j < b->symtab_used; j++, smp++) {
   1520  1.1     lukem             sp = b->states.states + i;
   1521  1.1     lukem 
   1522  1.1     lukem             if (smp->states.slist_used > 0) {
   1523  1.1     lukem                 sp->trans[syms].lhs = smp->id;
   1524  1.1     lukem                 rhs = _ure_add_state(smp->states.slist_used,
   1525  1.1     lukem                                      smp->states.slist, b);
   1526  1.1     lukem                 /*
   1527  1.1     lukem                  * Reset the state pointer in case the reallocation moves it
   1528  1.1     lukem                  * in memory.
   1529  1.1     lukem                  */
   1530  1.1     lukem                 sp = b->states.states + i;
   1531  1.1     lukem                 sp->trans[syms].rhs = rhs;
   1532  1.1     lukem 
   1533  1.1     lukem                 smp->states.slist_used = 0;
   1534  1.1     lukem                 syms++;
   1535  1.1     lukem             }
   1536  1.1     lukem         }
   1537  1.1     lukem 
   1538  1.1     lukem         /*
   1539  1.1     lukem          * Set the number of transitions actually used.
   1540  1.1     lukem          */
   1541  1.1     lukem         sp->trans_used = syms;
   1542  1.1     lukem     }
   1543  1.1     lukem     b->reducing = 0;
   1544  1.1     lukem }
   1545  1.1     lukem 
   1546  1.1     lukem static void
   1547  1.1     lukem _ure_add_equiv(ucs2_t l, ucs2_t r, _ure_buffer_t *b)
   1548  1.1     lukem {
   1549  1.1     lukem     ucs2_t tmp;
   1550  1.1     lukem 
   1551  1.1     lukem     l = b->states.states[l].id;
   1552  1.1     lukem     r = b->states.states[r].id;
   1553  1.1     lukem 
   1554  1.1     lukem     if (l == r)
   1555  1.1     lukem       return;
   1556  1.1     lukem 
   1557  1.1     lukem     if (l > r) {
   1558  1.1     lukem         tmp = l;
   1559  1.1     lukem         l = r;
   1560  1.1     lukem         r = tmp;
   1561  1.1     lukem     }
   1562  1.1     lukem 
   1563  1.1     lukem     /*
   1564  1.1     lukem      * Check to see if the equivalence pair already exists.
   1565  1.1     lukem      */
   1566  1.1     lukem     for (tmp = 0; tmp < b->equiv_used &&
   1567  1.1     lukem              (b->equiv[tmp].l != l || b->equiv[tmp].r != r);
   1568  1.1     lukem          tmp++) ;
   1569  1.1     lukem 
   1570  1.1     lukem     if (tmp < b->equiv_used)
   1571  1.1     lukem       return;
   1572  1.1     lukem 
   1573  1.1     lukem     if (b->equiv_used == b->equiv_size) {
   1574  1.1     lukem         if (b->equiv_size == 0)
   1575  1.1     lukem           b->equiv = (_ure_equiv_t *) malloc(sizeof(_ure_equiv_t) << 3);
   1576  1.1     lukem         else
   1577  1.1     lukem           b->equiv = (_ure_equiv_t *) realloc((char *) b->equiv,
   1578  1.1     lukem                                               sizeof(_ure_equiv_t) *
   1579  1.1     lukem                                               (b->equiv_size + 8));
   1580  1.1     lukem         b->equiv_size += 8;
   1581  1.1     lukem     }
   1582  1.1     lukem     b->equiv[b->equiv_used].l = l;
   1583  1.1     lukem     b->equiv[b->equiv_used].r = r;
   1584  1.1     lukem     b->equiv_used++;
   1585  1.1     lukem }
   1586  1.1     lukem 
   1587  1.1     lukem /*
   1588  1.1     lukem  * Merge the DFA states that are equivalent.
   1589  1.1     lukem  */
   1590  1.1     lukem static void
   1591  1.1     lukem _ure_merge_equiv(_ure_buffer_t *b)
   1592  1.1     lukem {
   1593  1.1     lukem     ucs2_t i, j, k, eq, done;
   1594  1.1     lukem     _ure_state_t *sp1, *sp2, *ls, *rs;
   1595  1.1     lukem 
   1596  1.1     lukem     for (i = 0; i < b->states.states_used; i++) {
   1597  1.1     lukem         sp1 = b->states.states + i;
   1598  1.1     lukem         if (sp1->id != i)
   1599  1.1     lukem           continue;
   1600  1.1     lukem         for (j = 0; j < i; j++) {
   1601  1.1     lukem             sp2 = b->states.states + j;
   1602  1.1     lukem             if (sp2->id != j)
   1603  1.1     lukem               continue;
   1604  1.1     lukem             b->equiv_used = 0;
   1605  1.1     lukem             _ure_add_equiv(i, j, b);
   1606  1.1     lukem             for (eq = 0, done = 0; eq < b->equiv_used; eq++) {
   1607  1.1     lukem                 ls = b->states.states + b->equiv[eq].l;
   1608  1.1     lukem                 rs = b->states.states + b->equiv[eq].r;
   1609  1.1     lukem                 if (ls->accepting != rs->accepting ||
   1610  1.1     lukem                     ls->trans_used != rs->trans_used) {
   1611  1.1     lukem                     done = 1;
   1612  1.1     lukem                     break;
   1613  1.1     lukem                 }
   1614  1.1     lukem                 for (k = 0; k < ls->trans_used &&
   1615  1.1     lukem                          ls->trans[k].lhs == rs->trans[k].lhs; k++) ;
   1616  1.1     lukem                 if (k < ls->trans_used) {
   1617  1.1     lukem                     done = 1;
   1618  1.1     lukem                     break;
   1619  1.1     lukem                 }
   1620  1.1     lukem 
   1621  1.1     lukem                 for (k = 0; k < ls->trans_used; k++)
   1622  1.1     lukem                   _ure_add_equiv(ls->trans[k].rhs, rs->trans[k].rhs, b);
   1623  1.1     lukem             }
   1624  1.1     lukem             if (done == 0)
   1625  1.1     lukem               break;
   1626  1.1     lukem         }
   1627  1.1     lukem         for (eq = 0; j < i && eq < b->equiv_used; eq++)
   1628  1.1     lukem           b->states.states[b->equiv[eq].r].id =
   1629  1.1     lukem               b->states.states[b->equiv[eq].l].id;
   1630  1.1     lukem     }
   1631  1.1     lukem 
   1632  1.1     lukem     /*
   1633  1.1     lukem      * Renumber the states appropriately.
   1634  1.1     lukem      */
   1635  1.1     lukem     for (i = eq = 0, sp1 = b->states.states; i < b->states.states_used;
   1636  1.1     lukem          sp1++, i++)
   1637  1.1     lukem       sp1->id = (sp1->id == i) ? eq++ : b->states.states[sp1->id].id;
   1638  1.1     lukem }
   1639  1.1     lukem 
   1640  1.1     lukem /*************************************************************************
   1641  1.1     lukem  *
   1642  1.1     lukem  * API.
   1643  1.1     lukem  *
   1644  1.1     lukem  *************************************************************************/
   1645  1.1     lukem 
   1646  1.1     lukem ure_buffer_t
   1647  1.1     lukem ure_buffer_create(void)
   1648  1.1     lukem {
   1649  1.1     lukem     ure_buffer_t b;
   1650  1.1     lukem 
   1651  1.1     lukem     b = (ure_buffer_t) calloc(1, sizeof(_ure_buffer_t));
   1652  1.1     lukem 
   1653  1.1     lukem     return b;
   1654  1.1     lukem }
   1655  1.1     lukem 
   1656  1.1     lukem void
   1657  1.1     lukem ure_buffer_free(ure_buffer_t buf)
   1658  1.1     lukem {
   1659  1.1     lukem     unsigned long i;
   1660  1.1     lukem 
   1661  1.1     lukem     if (buf == 0)
   1662  1.1     lukem       return;
   1663  1.1     lukem 
   1664  1.1     lukem     if (buf->stack.slist_size > 0)
   1665  1.1     lukem       free((char *) buf->stack.slist);
   1666  1.1     lukem 
   1667  1.1     lukem     if (buf->expr_size > 0)
   1668  1.1     lukem       free((char *) buf->expr);
   1669  1.1     lukem 
   1670  1.1     lukem     for (i = 0; i < buf->symtab_size; i++) {
   1671  1.1     lukem         if (buf->symtab[i].states.slist_size > 0)
   1672  1.1     lukem           free((char *) buf->symtab[i].states.slist);
   1673  1.1     lukem     }
   1674  1.1     lukem 
   1675  1.1     lukem     if (buf->symtab_size > 0)
   1676  1.1     lukem       free((char *) buf->symtab);
   1677  1.1     lukem 
   1678  1.1     lukem     for (i = 0; i < buf->states.states_size; i++) {
   1679  1.1     lukem         if (buf->states.states[i].trans_size > 0)
   1680  1.1     lukem           free((char *) buf->states.states[i].trans);
   1681  1.1     lukem         if (buf->states.states[i].st.slist_size > 0)
   1682  1.1     lukem           free((char *) buf->states.states[i].st.slist);
   1683  1.1     lukem     }
   1684  1.1     lukem 
   1685  1.1     lukem     if (buf->states.states_size > 0)
   1686  1.1     lukem       free((char *) buf->states.states);
   1687  1.1     lukem 
   1688  1.1     lukem     if (buf->equiv_size > 0)
   1689  1.1     lukem       free((char *) buf->equiv);
   1690  1.1     lukem 
   1691  1.1     lukem     free((char *) buf);
   1692  1.1     lukem }
   1693  1.1     lukem 
   1694  1.1     lukem ure_dfa_t
   1695  1.1     lukem ure_compile(ucs2_t *re, unsigned long relen, int casefold, ure_buffer_t buf)
   1696  1.1     lukem {
   1697  1.1     lukem     ucs2_t i, j, state;
   1698  1.1     lukem     _ure_state_t *sp;
   1699  1.1     lukem     _ure_dstate_t *dsp;
   1700  1.1     lukem     _ure_trans_t *tp;
   1701  1.1     lukem     ure_dfa_t dfa;
   1702  1.1     lukem 
   1703  1.1     lukem     if (re == 0 || *re == 0 || relen == 0 || buf == 0)
   1704  1.1     lukem       return 0;
   1705  1.1     lukem 
   1706  1.1     lukem     /*
   1707  1.1     lukem      * Reset the various fields of the compilation buffer.  Default the flags
   1708  1.3  christos      * to indicate the presence of the "^$" pattern.  If any other pattern
   1709  1.1     lukem      * occurs, then this flag will be removed.  This is done to catch this
   1710  1.1     lukem      * special pattern and handle it specially when matching.
   1711  1.1     lukem      */
   1712  1.1     lukem     buf->flags = _URE_DFA_BLANKLINE | ((casefold) ? _URE_DFA_CASEFOLD : 0);
   1713  1.1     lukem     buf->reducing = 0;
   1714  1.1     lukem     buf->stack.slist_used = 0;
   1715  1.1     lukem     buf->expr_used = 0;
   1716  1.1     lukem 
   1717  1.1     lukem     for (i = 0; i < buf->symtab_used; i++)
   1718  1.1     lukem       buf->symtab[i].states.slist_used = 0;
   1719  1.1     lukem     buf->symtab_used = 0;
   1720  1.1     lukem 
   1721  1.1     lukem     for (i = 0; i < buf->states.states_used; i++) {
   1722  1.1     lukem         buf->states.states[i].st.slist_used = 0;
   1723  1.1     lukem         buf->states.states[i].trans_used = 0;
   1724  1.1     lukem     }
   1725  1.1     lukem     buf->states.states_used = 0;
   1726  1.1     lukem 
   1727  1.1     lukem     /*
   1728  1.3  christos      * Construct the NFA.  If this stage returns a 0, then an error occurred or
   1729  1.1     lukem      * an empty expression was passed.
   1730  1.1     lukem      */
   1731  1.1     lukem     if ((state = _ure_re2nfa(re, relen, buf)) == _URE_NOOP)
   1732  1.1     lukem       return 0;
   1733  1.1     lukem 
   1734  1.1     lukem     /*
   1735  1.1     lukem      * Do the expression reduction to get the initial DFA.
   1736  1.1     lukem      */
   1737  1.1     lukem     _ure_reduce(state, buf);
   1738  1.1     lukem 
   1739  1.1     lukem     /*
   1740  1.1     lukem      * Merge all the equivalent DFA states.
   1741  1.1     lukem      */
   1742  1.1     lukem     _ure_merge_equiv(buf);
   1743  1.1     lukem 
   1744  1.1     lukem     /*
   1745  1.1     lukem      * Construct the minimal DFA.
   1746  1.1     lukem      */
   1747  1.1     lukem     dfa = (ure_dfa_t) malloc(sizeof(_ure_dfa_t));
   1748  1.1     lukem     (void) memset((char *) dfa, '\0', sizeof(_ure_dfa_t));
   1749  1.1     lukem 
   1750  1.1     lukem     dfa->flags = buf->flags & (_URE_DFA_CASEFOLD|_URE_DFA_BLANKLINE);
   1751  1.1     lukem 
   1752  1.1     lukem     /*
   1753  1.1     lukem      * Free up the NFA state groups and transfer the symbols from the buffer
   1754  1.1     lukem      * to the DFA.
   1755  1.1     lukem      */
   1756  1.1     lukem     for (i = 0; i < buf->symtab_size; i++) {
   1757  1.1     lukem         if (buf->symtab[i].states.slist_size > 0)
   1758  1.1     lukem           free((char *) buf->symtab[i].states.slist);
   1759  1.1     lukem     }
   1760  1.1     lukem     dfa->syms = buf->symtab;
   1761  1.1     lukem     dfa->nsyms = buf->symtab_used;
   1762  1.1     lukem 
   1763  1.1     lukem     buf->symtab_used = buf->symtab_size = 0;
   1764  1.1     lukem 
   1765  1.1     lukem     /*
   1766  1.1     lukem      * Collect the total number of states and transitions needed for the DFA.
   1767  1.1     lukem      */
   1768  1.1     lukem     for (i = state = 0, sp = buf->states.states; i < buf->states.states_used;
   1769  1.1     lukem          i++, sp++) {
   1770  1.1     lukem         if (sp->id == state) {
   1771  1.1     lukem             dfa->nstates++;
   1772  1.1     lukem             dfa->ntrans += sp->trans_used;
   1773  1.1     lukem             state++;
   1774  1.1     lukem         }
   1775  1.1     lukem     }
   1776  1.1     lukem 
   1777  1.1     lukem     /*
   1778  1.1     lukem      * Allocate enough space for the states and transitions.
   1779  1.1     lukem      */
   1780  1.1     lukem     dfa->states = (_ure_dstate_t *) malloc(sizeof(_ure_dstate_t) *
   1781  1.1     lukem                                            dfa->nstates);
   1782  1.1     lukem     dfa->trans = (_ure_trans_t *) malloc(sizeof(_ure_trans_t) * dfa->ntrans);
   1783  1.1     lukem 
   1784  1.1     lukem     /*
   1785  1.1     lukem      * Actually transfer the DFA states from the buffer.
   1786  1.1     lukem      */
   1787  1.1     lukem     dsp = dfa->states;
   1788  1.1     lukem     tp = dfa->trans;
   1789  1.1     lukem     for (i = state = 0, sp = buf->states.states; i < buf->states.states_used;
   1790  1.1     lukem          i++, sp++) {
   1791  1.1     lukem         if (sp->id == state) {
   1792  1.1     lukem             dsp->trans = tp;
   1793  1.1     lukem             dsp->ntrans = sp->trans_used;
   1794  1.1     lukem             dsp->accepting = sp->accepting;
   1795  1.1     lukem 
   1796  1.1     lukem             /*
   1797  1.1     lukem              * Add the transitions for the state.
   1798  1.1     lukem              */
   1799  1.1     lukem             for (j = 0; j < dsp->ntrans; j++, tp++) {
   1800  1.1     lukem                 tp->symbol = sp->trans[j].lhs;
   1801  1.1     lukem                 tp->next_state = buf->states.states[sp->trans[j].rhs].id;
   1802  1.1     lukem             }
   1803  1.1     lukem 
   1804  1.1     lukem             dsp++;
   1805  1.1     lukem             state++;
   1806  1.1     lukem         }
   1807  1.1     lukem     }
   1808  1.1     lukem 
   1809  1.1     lukem     return dfa;
   1810  1.1     lukem }
   1811  1.1     lukem 
   1812  1.1     lukem void
   1813  1.1     lukem ure_dfa_free(ure_dfa_t dfa)
   1814  1.1     lukem {
   1815  1.1     lukem     ucs2_t i;
   1816  1.1     lukem 
   1817  1.1     lukem     if (dfa == 0)
   1818  1.1     lukem       return;
   1819  1.1     lukem 
   1820  1.1     lukem     for (i = 0; i < dfa->nsyms; i++) {
   1821  1.1     lukem         if ((dfa->syms[i].type == _URE_CCLASS ||
   1822  1.1     lukem              dfa->syms[i].type == _URE_NCCLASS) &&
   1823  1.1     lukem             dfa->syms[i].sym.ccl.ranges_size > 0)
   1824  1.1     lukem           free((char *) dfa->syms[i].sym.ccl.ranges);
   1825  1.1     lukem     }
   1826  1.1     lukem     if (dfa->nsyms > 0)
   1827  1.1     lukem       free((char *) dfa->syms);
   1828  1.1     lukem 
   1829  1.1     lukem     if (dfa->nstates > 0)
   1830  1.1     lukem       free((char *) dfa->states);
   1831  1.1     lukem     if (dfa->ntrans > 0)
   1832  1.1     lukem       free((char *) dfa->trans);
   1833  1.1     lukem     free((char *) dfa);
   1834  1.1     lukem }
   1835  1.1     lukem 
   1836  1.1     lukem void
   1837  1.1     lukem ure_write_dfa(ure_dfa_t dfa, FILE *out)
   1838  1.1     lukem {
   1839  1.1     lukem     ucs2_t i, j, k, h, l;
   1840  1.1     lukem     _ure_dstate_t *sp;
   1841  1.1     lukem     _ure_symtab_t *sym;
   1842  1.1     lukem     _ure_range_t *rp;
   1843  1.1     lukem 
   1844  1.1     lukem     if (dfa == 0 || out == 0)
   1845  1.1     lukem       return;
   1846  1.1     lukem 
   1847  1.1     lukem     /*
   1848  1.1     lukem      * Write all the different character classes.
   1849  1.1     lukem      */
   1850  1.1     lukem     for (i = 0, sym = dfa->syms; i < dfa->nsyms; i++, sym++) {
   1851  1.1     lukem         if (sym->type == _URE_CCLASS || sym->type == _URE_NCCLASS) {
   1852  1.1     lukem             fprintf(out, "C%hd = ", sym->id);
   1853  1.1     lukem             if (sym->sym.ccl.ranges_used > 0) {
   1854  1.1     lukem                 putc('[', out);
   1855  1.1     lukem                 if (sym->type == _URE_NCCLASS)
   1856  1.1     lukem                   putc('^', out);
   1857  1.1     lukem             }
   1858  1.1     lukem             if (sym->props != 0) {
   1859  1.1     lukem                 if (sym->type == _URE_NCCLASS)
   1860  1.1     lukem                   fprintf(out, "\\P");
   1861  1.1     lukem                 else
   1862  1.1     lukem                   fprintf(out, "\\p");
   1863  1.1     lukem                 for (k = h = 0; k < 32; k++) {
   1864  1.1     lukem                     if (sym->props & (1 << k)) {
   1865  1.1     lukem                         if (h != 0)
   1866  1.1     lukem                           putc(',', out);
   1867  1.3  christos                         fprintf(out, "%d", k + 1);
   1868  1.1     lukem                         h = 1;
   1869  1.1     lukem                     }
   1870  1.1     lukem                 }
   1871  1.1     lukem             }
   1872  1.1     lukem             /*
   1873  1.1     lukem              * Dump the ranges.
   1874  1.1     lukem              */
   1875  1.1     lukem             for (k = 0, rp = sym->sym.ccl.ranges;
   1876  1.1     lukem                  k < sym->sym.ccl.ranges_used; k++, rp++) {
   1877  1.1     lukem                 /*
   1878  1.1     lukem                  * Check for UTF16 characters.
   1879  1.1     lukem                  */
   1880  1.1     lukem                 if (0x10000 <= rp->min_code &&
   1881  1.1     lukem                     rp->min_code <= 0x10ffff) {
   1882  1.1     lukem                     h = (ucs2_t) (((rp->min_code - 0x10000) >> 10) + 0xd800);
   1883  1.1     lukem                     l = (ucs2_t) (((rp->min_code - 0x10000) & 1023) + 0xdc00);
   1884  1.1     lukem                     fprintf(out, "\\x%04hX\\x%04hX", h, l);
   1885  1.1     lukem                 } else
   1886  1.1     lukem                   fprintf(out, "\\x%04lX", rp->min_code & 0xffff);
   1887  1.1     lukem                 if (rp->max_code != rp->min_code) {
   1888  1.1     lukem                     putc('-', out);
   1889  1.1     lukem                     if (rp->max_code >= 0x10000 &&
   1890  1.1     lukem                         rp->max_code <= 0x10ffff) {
   1891  1.1     lukem                         h = (ucs2_t) (((rp->max_code - 0x10000) >> 10) + 0xd800);
   1892  1.1     lukem                         l = (ucs2_t) (((rp->max_code - 0x10000) & 1023) + 0xdc00);
   1893  1.1     lukem                         fprintf(out, "\\x%04hX\\x%04hX", h, l);
   1894  1.1     lukem                     } else
   1895  1.1     lukem                       fprintf(out, "\\x%04lX", rp->max_code & 0xffff);
   1896  1.1     lukem                 }
   1897  1.1     lukem             }
   1898  1.1     lukem             if (sym->sym.ccl.ranges_used > 0)
   1899  1.1     lukem               putc(']', out);
   1900  1.1     lukem             putc('\n', out);
   1901  1.1     lukem         }
   1902  1.1     lukem     }
   1903  1.1     lukem 
   1904  1.1     lukem     for (i = 0, sp = dfa->states; i < dfa->nstates; i++, sp++) {
   1905  1.1     lukem         fprintf(out, "S%hd = ", i);
   1906  1.1     lukem         if (sp->accepting) {
   1907  1.1     lukem             fprintf(out, "1 ");
   1908  1.1     lukem             if (sp->ntrans)
   1909  1.1     lukem               fprintf(out, "| ");
   1910  1.1     lukem         }
   1911  1.1     lukem         for (j = 0; j < sp->ntrans; j++) {
   1912  1.1     lukem             if (j > 0)
   1913  1.1     lukem               fprintf(out, "| ");
   1914  1.1     lukem 
   1915  1.1     lukem             sym = dfa->syms + sp->trans[j].symbol;
   1916  1.1     lukem             switch (sym->type) {
   1917  1.1     lukem               case _URE_CHAR:
   1918  1.1     lukem                 if (0x10000 <= sym->sym.chr && sym->sym.chr <= 0x10ffff) {
   1919  1.1     lukem                     /*
   1920  1.1     lukem                      * Take care of UTF16 characters.
   1921  1.1     lukem                      */
   1922  1.1     lukem                     h = (ucs2_t) (((sym->sym.chr - 0x10000) >> 10) + 0xd800);
   1923  1.1     lukem                     l = (ucs2_t) (((sym->sym.chr - 0x10000) & 1023) + 0xdc00);
   1924  1.1     lukem                     fprintf(out, "\\x%04hX\\x%04hX ", h, l);
   1925  1.1     lukem                 } else
   1926  1.1     lukem                   fprintf(out, "\\x%04lX ", sym->sym.chr & 0xffff);
   1927  1.1     lukem                 break;
   1928  1.1     lukem               case _URE_ANY_CHAR:
   1929  1.1     lukem                 fprintf(out, "<any> ");
   1930  1.1     lukem                 break;
   1931  1.1     lukem               case _URE_BOL_ANCHOR:
   1932  1.1     lukem                 fprintf(out, "<bol-anchor> ");
   1933  1.1     lukem                 break;
   1934  1.1     lukem               case _URE_EOL_ANCHOR:
   1935  1.1     lukem                 fprintf(out, "<eol-anchor> ");
   1936  1.1     lukem                 break;
   1937  1.1     lukem               case _URE_CCLASS:
   1938  1.1     lukem               case _URE_NCCLASS:
   1939  1.1     lukem                 fprintf(out, "[C%hd] ", sym->id);
   1940  1.1     lukem                 break;
   1941  1.1     lukem             }
   1942  1.1     lukem             fprintf(out, "S%hd", sp->trans[j].next_state);
   1943  1.1     lukem             if (j + 1 < sp->ntrans)
   1944  1.1     lukem               putc(' ', out);
   1945  1.1     lukem         }
   1946  1.1     lukem         putc('\n', out);
   1947  1.1     lukem     }
   1948  1.1     lukem }
   1949  1.1     lukem 
   1950  1.1     lukem #define _ure_issep(cc) ((cc) == '\n' || (cc) == '\r' || (cc) == 0x2028 ||\
   1951  1.1     lukem                         (cc) == 0x2029)
   1952  1.1     lukem 
   1953  1.1     lukem int
   1954  1.1     lukem ure_exec(ure_dfa_t dfa, int flags, ucs2_t *text, unsigned long textlen,
   1955  1.1     lukem          unsigned long *match_start, unsigned long *match_end)
   1956  1.1     lukem {
   1957  1.1     lukem     int i, j, matched, found, skip;
   1958  1.1     lukem     unsigned long ms, me;
   1959  1.1     lukem     ucs4_t c;
   1960  1.1     lukem     ucs2_t *sp, *ep, *lp;
   1961  1.1     lukem     _ure_dstate_t *stp;
   1962  1.1     lukem     _ure_symtab_t *sym;
   1963  1.1     lukem     _ure_range_t *rp;
   1964  1.1     lukem 
   1965  1.1     lukem     if (dfa == 0 || text == 0)
   1966  1.1     lukem       return 0;
   1967  1.1     lukem 
   1968  1.1     lukem     /*
   1969  1.1     lukem      * Handle the special case of an empty string matching the "^$" pattern.
   1970  1.1     lukem      */
   1971  1.1     lukem     if (textlen == 0 && (dfa->flags & _URE_DFA_BLANKLINE)) {
   1972  1.1     lukem         *match_start = *match_end = 0;
   1973  1.1     lukem         return 1;
   1974  1.1     lukem     }
   1975  1.1     lukem 
   1976  1.1     lukem     sp = text;
   1977  1.1     lukem     ep = sp + textlen;
   1978  1.1     lukem 
   1979  1.1     lukem     ms = me = ~0;
   1980  1.1     lukem 
   1981  1.1     lukem     stp = dfa->states;
   1982  1.1     lukem 
   1983  1.1     lukem     for (found = skip = 0; found == 0 && sp < ep; ) {
   1984  1.1     lukem         lp = sp;
   1985  1.1     lukem         c = *sp++;
   1986  1.1     lukem 
   1987  1.1     lukem         /*
   1988  1.1     lukem          * Check to see if this is a high surrogate that should be
   1989  1.1     lukem          * combined with a following low surrogate.
   1990  1.1     lukem          */
   1991  1.1     lukem         if (sp < ep && 0xd800 <= c && c <= 0xdbff &&
   1992  1.1     lukem             0xdc00 <= *sp && *sp <= 0xdfff)
   1993  1.1     lukem           c = 0x10000 + (((c & 0x03ff) << 10) | (*sp++ & 0x03ff));
   1994  1.1     lukem 
   1995  1.1     lukem         /*
   1996  1.1     lukem          * Determine if the character is non-spacing and should be skipped.
   1997  1.1     lukem          */
   1998  1.1     lukem         if (_ure_matches_properties(_URE_NONSPACING, c) &&
   1999  1.1     lukem             (flags & URE_IGNORE_NONSPACING)) {
   2000  1.1     lukem             sp++;
   2001  1.1     lukem             continue;
   2002  1.1     lukem         }
   2003  1.1     lukem 
   2004  1.1     lukem         if (dfa->flags & _URE_DFA_CASEFOLD)
   2005  1.1     lukem           c = _ure_tolower(c);
   2006  1.1     lukem 
   2007  1.1     lukem         /*
   2008  1.1     lukem          * See if one of the transitions matches.
   2009  1.1     lukem          */
   2010  1.1     lukem         for (i = 0, matched = 0; matched == 0 && i < stp->ntrans; i++) {
   2011  1.1     lukem             sym = dfa->syms + stp->trans[i].symbol;
   2012  1.1     lukem             switch (sym->type) {
   2013  1.1     lukem               case _URE_ANY_CHAR:
   2014  1.1     lukem                 if ((flags & URE_DOT_MATCHES_SEPARATORS) ||
   2015  1.1     lukem                     !_ure_issep(c))
   2016  1.1     lukem                   matched = 1;
   2017  1.1     lukem                 break;
   2018  1.1     lukem               case _URE_CHAR:
   2019  1.1     lukem                 if (c == sym->sym.chr)
   2020  1.1     lukem                   matched = 1;
   2021  1.1     lukem                 break;
   2022  1.1     lukem               case _URE_BOL_ANCHOR:
   2023  1.1     lukem                 if (lp == text) {
   2024  1.1     lukem                     sp = lp;
   2025  1.1     lukem                     matched = 1;
   2026  1.1     lukem                 } else if (_ure_issep(c)) {
   2027  1.1     lukem                     if (c == '\r' && sp < ep && *sp == '\n')
   2028  1.1     lukem                       sp++;
   2029  1.1     lukem                     lp = sp;
   2030  1.1     lukem                     matched = 1;
   2031  1.1     lukem                 }
   2032  1.1     lukem                 break;
   2033  1.1     lukem               case _URE_EOL_ANCHOR:
   2034  1.1     lukem                 if (_ure_issep(c)) {
   2035  1.1     lukem                     /*
   2036  1.1     lukem                      * Put the pointer back before the separator so the match
   2037  1.1     lukem                      * end position will be correct.  This case will also
   2038  1.1     lukem                      * cause the `sp' pointer to be advanced over the current
   2039  1.1     lukem                      * separator once the match end point has been recorded.
   2040  1.1     lukem                      */
   2041  1.1     lukem                     sp = lp;
   2042  1.1     lukem                     matched = 1;
   2043  1.1     lukem                 }
   2044  1.1     lukem                 break;
   2045  1.1     lukem               case _URE_CCLASS:
   2046  1.1     lukem               case _URE_NCCLASS:
   2047  1.1     lukem                 if (sym->props != 0)
   2048  1.1     lukem                   matched = _ure_matches_properties(sym->props, c);
   2049  1.1     lukem                 for (j = 0, rp = sym->sym.ccl.ranges;
   2050  1.1     lukem                      j < sym->sym.ccl.ranges_used; j++, rp++) {
   2051  1.1     lukem                     if (rp->min_code <= c && c <= rp->max_code)
   2052  1.1     lukem                       matched = 1;
   2053  1.1     lukem                 }
   2054  1.1     lukem                 if (sym->type == _URE_NCCLASS)
   2055  1.1     lukem                   matched = !matched;
   2056  1.1     lukem                 break;
   2057  1.1     lukem             }
   2058  1.1     lukem 
   2059  1.1     lukem             if (matched) {
   2060  1.1     lukem                 if (ms == ~0UL)
   2061  1.1     lukem                   ms = lp - text;
   2062  1.1     lukem                 else
   2063  1.1     lukem                   me = sp - text;
   2064  1.1     lukem                 stp = dfa->states + stp->trans[i].next_state;
   2065  1.1     lukem 
   2066  1.1     lukem                 /*
   2067  1.1     lukem                  * If the match was an EOL anchor, adjust the pointer past the
   2068  1.1     lukem                  * separator that caused the match.  The correct match
   2069  1.1     lukem                  * position has been recorded already.
   2070  1.1     lukem                  */
   2071  1.1     lukem                 if (sym->type == _URE_EOL_ANCHOR) {
   2072  1.1     lukem                     /*
   2073  1.1     lukem                      * Skip the character that caused the match.
   2074  1.1     lukem                      */
   2075  1.1     lukem                     sp++;
   2076  1.1     lukem 
   2077  1.1     lukem                     /*
   2078  1.1     lukem                      * Handle the infamous CRLF situation.
   2079  1.1     lukem                      */
   2080  1.1     lukem                     if (sp < ep && c == '\r' && *sp == '\n')
   2081  1.1     lukem                       sp++;
   2082  1.1     lukem                 }
   2083  1.1     lukem             }
   2084  1.1     lukem         }
   2085  1.1     lukem 
   2086  1.1     lukem         if (matched == 0) {
   2087  1.1     lukem             if (stp->accepting == 0) {
   2088  1.1     lukem                 /*
   2089  1.1     lukem                  * If the last state was not accepting, then reset
   2090  1.1     lukem                  * and start over.
   2091  1.1     lukem                  */
   2092  1.1     lukem                 stp = dfa->states;
   2093  1.1     lukem                 ms = me = ~0;
   2094  1.1     lukem             } else
   2095  1.1     lukem               /*
   2096  1.1     lukem                * The last state was accepting, so terminate the matching
   2097  1.1     lukem                * loop to avoid more work.
   2098  1.1     lukem                */
   2099  1.1     lukem               found = 1;
   2100  1.1     lukem         } else if (sp == ep) {
   2101  1.1     lukem             if (!stp->accepting) {
   2102  1.1     lukem                 /*
   2103  1.1     lukem                  * This ugly hack is to make sure the end-of-line anchors
   2104  1.1     lukem                  * match when the source text hits the end.  This is only done
   2105  1.1     lukem                  * if the last subexpression matches.
   2106  1.1     lukem                  */
   2107  1.1     lukem                 for (i = 0; found == 0 && i < stp->ntrans; i++) {
   2108  1.1     lukem                     sym = dfa->syms + stp->trans[i].symbol;
   2109  1.1     lukem                     if (sym->type ==_URE_EOL_ANCHOR) {
   2110  1.1     lukem                         stp = dfa->states + stp->trans[i].next_state;
   2111  1.1     lukem                         if (stp->accepting) {
   2112  1.1     lukem                             me = sp - text;
   2113  1.1     lukem                             found = 1;
   2114  1.1     lukem                         } else
   2115  1.1     lukem                           break;
   2116  1.1     lukem                     }
   2117  1.1     lukem                 }
   2118  1.1     lukem             } else {
   2119  1.1     lukem                 /*
   2120  1.1     lukem                  * Make sure any conditions that match all the way to the end
   2121  1.1     lukem                  * of the string match.
   2122  1.1     lukem                  */
   2123  1.1     lukem                 found = 1;
   2124  1.1     lukem                 me = sp - text;
   2125  1.1     lukem             }
   2126  1.1     lukem         }
   2127  1.1     lukem     }
   2128  1.1     lukem 
   2129  1.1     lukem     if (found == 0)
   2130  1.1     lukem       ms = me = ~0;
   2131  1.1     lukem 
   2132  1.1     lukem     *match_start = ms;
   2133  1.1     lukem     *match_end = me;
   2134  1.1     lukem 
   2135  1.1     lukem     return (ms != ~0UL) ? 1 : 0;
   2136  1.1     lukem }
   2137