Home | History | Annotate | Line # | Download | only in ure
ure.c revision 1.1
      1  1.1  lukem /* $OpenLDAP: pkg/ldap/libraries/liblunicode/ure/ure.c,v 1.17.2.3 2008/02/11 23:26:42 kurt Exp $ */
      2  1.1  lukem /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
      3  1.1  lukem  *
      4  1.1  lukem  * Copyright 1998-2008 The OpenLDAP Foundation.
      5  1.1  lukem  * All rights reserved.
      6  1.1  lukem  *
      7  1.1  lukem  * Redistribution and use in source and binary forms, with or without
      8  1.1  lukem  * modification, are permitted only as authorized by the OpenLDAP
      9  1.1  lukem  * Public License.
     10  1.1  lukem  *
     11  1.1  lukem  * A copy of this license is available in file LICENSE in the
     12  1.1  lukem  * top-level directory of the distribution or, alternatively, at
     13  1.1  lukem  * <http://www.OpenLDAP.org/license.html>.
     14  1.1  lukem  */
     15  1.1  lukem /* Copyright 1997, 1998, 1999 Computing Research Labs,
     16  1.1  lukem  * New Mexico State University
     17  1.1  lukem  *
     18  1.1  lukem  * Permission is hereby granted, free of charge, to any person obtaining a
     19  1.1  lukem  * copy of this software and associated documentation files (the "Software"),
     20  1.1  lukem  * to deal in the Software without restriction, including without limitation
     21  1.1  lukem  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
     22  1.1  lukem  * and/or sell copies of the Software, and to permit persons to whom the
     23  1.1  lukem  * Software is furnished to do so, subject to the following conditions:
     24  1.1  lukem  *
     25  1.1  lukem  * The above copyright notice and this permission notice shall be included in
     26  1.1  lukem  * all copies or substantial portions of the Software.
     27  1.1  lukem  *
     28  1.1  lukem  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     29  1.1  lukem  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     30  1.1  lukem  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     31  1.1  lukem  * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
     32  1.1  lukem  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
     33  1.1  lukem  * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
     34  1.1  lukem  * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
     35  1.1  lukem  */
     36  1.1  lukem /* $Id: ure.c,v 1.1 2008/05/22 14:20:36 lukem Exp $" */
     37  1.1  lukem 
     38  1.1  lukem #include "portable.h"
     39  1.1  lukem 
     40  1.1  lukem #include <ac/stdlib.h>
     41  1.1  lukem #include <ac/string.h>
     42  1.1  lukem #include <ac/unistd.h>
     43  1.1  lukem 
     44  1.1  lukem #include "ure.h"
     45  1.1  lukem 
     46  1.1  lukem /*
     47  1.1  lukem  * Flags used internally in the DFA.
     48  1.1  lukem  */
     49  1.1  lukem #define _URE_DFA_CASEFOLD  0x01
     50  1.1  lukem #define _URE_DFA_BLANKLINE 0x02
     51  1.1  lukem 
     52  1.1  lukem static unsigned long cclass_flags[] = {
     53  1.1  lukem     0,
     54  1.1  lukem     _URE_NONSPACING,
     55  1.1  lukem     _URE_COMBINING,
     56  1.1  lukem     _URE_NUMDIGIT,
     57  1.1  lukem     _URE_NUMOTHER,
     58  1.1  lukem     _URE_SPACESEP,
     59  1.1  lukem     _URE_LINESEP,
     60  1.1  lukem     _URE_PARASEP,
     61  1.1  lukem     _URE_CNTRL,
     62  1.1  lukem     _URE_PUA,
     63  1.1  lukem     _URE_UPPER,
     64  1.1  lukem     _URE_LOWER,
     65  1.1  lukem     _URE_TITLE,
     66  1.1  lukem     _URE_MODIFIER,
     67  1.1  lukem     _URE_OTHERLETTER,
     68  1.1  lukem     _URE_DASHPUNCT,
     69  1.1  lukem     _URE_OPENPUNCT,
     70  1.1  lukem     _URE_CLOSEPUNCT,
     71  1.1  lukem     _URE_OTHERPUNCT,
     72  1.1  lukem     _URE_MATHSYM,
     73  1.1  lukem     _URE_CURRENCYSYM,
     74  1.1  lukem     _URE_OTHERSYM,
     75  1.1  lukem     _URE_LTR,
     76  1.1  lukem     _URE_RTL,
     77  1.1  lukem     _URE_EURONUM,
     78  1.1  lukem     _URE_EURONUMSEP,
     79  1.1  lukem     _URE_EURONUMTERM,
     80  1.1  lukem     _URE_ARABNUM,
     81  1.1  lukem     _URE_COMMONSEP,
     82  1.1  lukem     _URE_BLOCKSEP,
     83  1.1  lukem     _URE_SEGMENTSEP,
     84  1.1  lukem     _URE_WHITESPACE,
     85  1.1  lukem     _URE_OTHERNEUT,
     86  1.1  lukem };
     87  1.1  lukem 
     88  1.1  lukem /*
     89  1.1  lukem  * Symbol types for the DFA.
     90  1.1  lukem  */
     91  1.1  lukem #define _URE_ANY_CHAR   1
     92  1.1  lukem #define _URE_CHAR       2
     93  1.1  lukem #define _URE_CCLASS     3
     94  1.1  lukem #define _URE_NCCLASS    4
     95  1.1  lukem #define _URE_BOL_ANCHOR 5
     96  1.1  lukem #define _URE_EOL_ANCHOR 6
     97  1.1  lukem 
     98  1.1  lukem /*
     99  1.1  lukem  * Op codes for converting the NFA to a DFA.
    100  1.1  lukem  */
    101  1.1  lukem #define _URE_SYMBOL     10
    102  1.1  lukem #define _URE_PAREN      11
    103  1.1  lukem #define _URE_QUEST      12
    104  1.1  lukem #define _URE_STAR       13
    105  1.1  lukem #define _URE_PLUS       14
    106  1.1  lukem #define _URE_ONE        15
    107  1.1  lukem #define _URE_AND        16
    108  1.1  lukem #define _URE_OR         17
    109  1.1  lukem 
    110  1.1  lukem #define _URE_NOOP       0xffff
    111  1.1  lukem 
    112  1.1  lukem #define _URE_REGSTART 0x8000
    113  1.1  lukem #define _URE_REGEND   0x4000
    114  1.1  lukem 
    115  1.1  lukem /*
    116  1.1  lukem  * Structure used to handle a compacted range of characters.
    117  1.1  lukem  */
    118  1.1  lukem typedef struct {
    119  1.1  lukem     ucs4_t min_code;
    120  1.1  lukem     ucs4_t max_code;
    121  1.1  lukem } _ure_range_t;
    122  1.1  lukem 
    123  1.1  lukem typedef struct {
    124  1.1  lukem     _ure_range_t *ranges;
    125  1.1  lukem     ucs2_t ranges_used;
    126  1.1  lukem     ucs2_t ranges_size;
    127  1.1  lukem } _ure_ccl_t;
    128  1.1  lukem 
    129  1.1  lukem typedef union {
    130  1.1  lukem     ucs4_t chr;
    131  1.1  lukem     _ure_ccl_t ccl;
    132  1.1  lukem } _ure_sym_t;
    133  1.1  lukem 
    134  1.1  lukem /*
    135  1.1  lukem  * This is a general element structure used for expressions and stack
    136  1.1  lukem  * elements.
    137  1.1  lukem  */
    138  1.1  lukem typedef struct {
    139  1.1  lukem     ucs2_t reg;
    140  1.1  lukem     ucs2_t onstack;
    141  1.1  lukem     ucs2_t type;
    142  1.1  lukem     ucs2_t lhs;
    143  1.1  lukem     ucs2_t rhs;
    144  1.1  lukem } _ure_elt_t;
    145  1.1  lukem 
    146  1.1  lukem /*
    147  1.1  lukem  * This is a structure used to track a list or a stack of states.
    148  1.1  lukem  */
    149  1.1  lukem typedef struct {
    150  1.1  lukem     ucs2_t *slist;
    151  1.1  lukem     ucs2_t slist_size;
    152  1.1  lukem     ucs2_t slist_used;
    153  1.1  lukem } _ure_stlist_t;
    154  1.1  lukem 
    155  1.1  lukem /*
    156  1.1  lukem  * Structure to track the list of unique states for a symbol
    157  1.1  lukem  * during reduction.
    158  1.1  lukem  */
    159  1.1  lukem typedef struct {
    160  1.1  lukem     ucs2_t id;
    161  1.1  lukem     ucs2_t type;
    162  1.1  lukem     unsigned long mods;
    163  1.1  lukem     unsigned long props;
    164  1.1  lukem     _ure_sym_t sym;
    165  1.1  lukem     _ure_stlist_t states;
    166  1.1  lukem } _ure_symtab_t;
    167  1.1  lukem 
    168  1.1  lukem /*
    169  1.1  lukem  * Structure to hold a single state.
    170  1.1  lukem  */
    171  1.1  lukem typedef struct {
    172  1.1  lukem     ucs2_t id;
    173  1.1  lukem     ucs2_t accepting;
    174  1.1  lukem     ucs2_t pad;
    175  1.1  lukem     _ure_stlist_t st;
    176  1.1  lukem     _ure_elt_t *trans;
    177  1.1  lukem     ucs2_t trans_size;
    178  1.1  lukem     ucs2_t trans_used;
    179  1.1  lukem } _ure_state_t;
    180  1.1  lukem 
    181  1.1  lukem /*
    182  1.1  lukem  * Structure used for keeping lists of states.
    183  1.1  lukem  */
    184  1.1  lukem typedef struct {
    185  1.1  lukem     _ure_state_t *states;
    186  1.1  lukem     ucs2_t states_size;
    187  1.1  lukem     ucs2_t states_used;
    188  1.1  lukem } _ure_statetable_t;
    189  1.1  lukem 
    190  1.1  lukem /*
    191  1.1  lukem  * Structure to track pairs of DFA states when equivalent states are
    192  1.1  lukem  * merged.
    193  1.1  lukem  */
    194  1.1  lukem typedef struct {
    195  1.1  lukem     ucs2_t l;
    196  1.1  lukem     ucs2_t r;
    197  1.1  lukem } _ure_equiv_t;
    198  1.1  lukem 
    199  1.1  lukem /*
    200  1.1  lukem  * Structure used for constructing the NFA and reducing to a minimal DFA.
    201  1.1  lukem  */
    202  1.1  lukem typedef struct _ure_buffer_t {
    203  1.1  lukem     int reducing;
    204  1.1  lukem     int error;
    205  1.1  lukem     unsigned long flags;
    206  1.1  lukem 
    207  1.1  lukem     _ure_stlist_t stack;
    208  1.1  lukem 
    209  1.1  lukem     /*
    210  1.1  lukem      * Table of unique symbols encountered.
    211  1.1  lukem      */
    212  1.1  lukem     _ure_symtab_t *symtab;
    213  1.1  lukem     ucs2_t symtab_size;
    214  1.1  lukem     ucs2_t symtab_used;
    215  1.1  lukem 
    216  1.1  lukem     /*
    217  1.1  lukem      * Tracks the unique expressions generated for the NFA and when the NFA is
    218  1.1  lukem      * reduced.
    219  1.1  lukem      */
    220  1.1  lukem     _ure_elt_t *expr;
    221  1.1  lukem     ucs2_t expr_used;
    222  1.1  lukem     ucs2_t expr_size;
    223  1.1  lukem 
    224  1.1  lukem     /*
    225  1.1  lukem      * The reduced table of unique groups of NFA states.
    226  1.1  lukem      */
    227  1.1  lukem     _ure_statetable_t states;
    228  1.1  lukem 
    229  1.1  lukem     /*
    230  1.1  lukem      * Tracks states when equivalent states are merged.
    231  1.1  lukem      */
    232  1.1  lukem     _ure_equiv_t *equiv;
    233  1.1  lukem     ucs2_t equiv_used;
    234  1.1  lukem     ucs2_t equiv_size;
    235  1.1  lukem } _ure_buffer_t;
    236  1.1  lukem 
    237  1.1  lukem typedef struct {
    238  1.1  lukem     ucs2_t symbol;
    239  1.1  lukem     ucs2_t next_state;
    240  1.1  lukem } _ure_trans_t;
    241  1.1  lukem 
    242  1.1  lukem typedef struct {
    243  1.1  lukem     ucs2_t accepting;
    244  1.1  lukem     ucs2_t ntrans;
    245  1.1  lukem     _ure_trans_t *trans;
    246  1.1  lukem } _ure_dstate_t;
    247  1.1  lukem 
    248  1.1  lukem typedef struct _ure_dfa_t {
    249  1.1  lukem     unsigned long flags;
    250  1.1  lukem 
    251  1.1  lukem     _ure_symtab_t *syms;
    252  1.1  lukem     ucs2_t nsyms;
    253  1.1  lukem 
    254  1.1  lukem     _ure_dstate_t *states;
    255  1.1  lukem     ucs2_t nstates;
    256  1.1  lukem 
    257  1.1  lukem     _ure_trans_t *trans;
    258  1.1  lukem     ucs2_t ntrans;
    259  1.1  lukem } _ure_dfa_t;
    260  1.1  lukem 
    261  1.1  lukem /*************************************************************************
    262  1.1  lukem  *
    263  1.1  lukem  * Functions.
    264  1.1  lukem  *
    265  1.1  lukem  *************************************************************************/
    266  1.1  lukem 
    267  1.1  lukem static void
    268  1.1  lukem _ure_memmove(char *dest, char *src, unsigned long bytes)
    269  1.1  lukem {
    270  1.1  lukem     long i, j;
    271  1.1  lukem 
    272  1.1  lukem     i = (long) bytes;
    273  1.1  lukem     j = i & 7;
    274  1.1  lukem     i = (i + 7) >> 3;
    275  1.1  lukem 
    276  1.1  lukem     /*
    277  1.1  lukem      * Do a memmove using Ye Olde Duff's Device for efficiency.
    278  1.1  lukem      */
    279  1.1  lukem     if (src < dest) {
    280  1.1  lukem         src += bytes;
    281  1.1  lukem         dest += bytes;
    282  1.1  lukem 
    283  1.1  lukem         switch (j) {
    284  1.1  lukem           case 0: do {
    285  1.1  lukem               *--dest = *--src;
    286  1.1  lukem             case 7: *--dest = *--src;
    287  1.1  lukem             case 6: *--dest = *--src;
    288  1.1  lukem             case 5: *--dest = *--src;
    289  1.1  lukem             case 4: *--dest = *--src;
    290  1.1  lukem             case 3: *--dest = *--src;
    291  1.1  lukem             case 2: *--dest = *--src;
    292  1.1  lukem             case 1: *--dest = *--src;
    293  1.1  lukem           } while (--i > 0);
    294  1.1  lukem         }
    295  1.1  lukem     } else if (src > dest) {
    296  1.1  lukem         switch (j) {
    297  1.1  lukem           case 0: do {
    298  1.1  lukem               *dest++ = *src++;
    299  1.1  lukem             case 7: *dest++ = *src++;
    300  1.1  lukem             case 6: *dest++ = *src++;
    301  1.1  lukem             case 5: *dest++ = *src++;
    302  1.1  lukem             case 4: *dest++ = *src++;
    303  1.1  lukem             case 3: *dest++ = *src++;
    304  1.1  lukem             case 2: *dest++ = *src++;
    305  1.1  lukem             case 1: *dest++ = *src++;
    306  1.1  lukem           } while (--i > 0);
    307  1.1  lukem         }
    308  1.1  lukem     }
    309  1.1  lukem }
    310  1.1  lukem 
    311  1.1  lukem static void
    312  1.1  lukem _ure_push(ucs2_t v, _ure_buffer_t *b)
    313  1.1  lukem {
    314  1.1  lukem     _ure_stlist_t *s;
    315  1.1  lukem 
    316  1.1  lukem     if (b == 0)
    317  1.1  lukem       return;
    318  1.1  lukem 
    319  1.1  lukem     /*
    320  1.1  lukem      * If the `reducing' parameter is non-zero, check to see if the value
    321  1.1  lukem      * passed is already on the stack.
    322  1.1  lukem      */
    323  1.1  lukem     if (b->reducing != 0 && b->expr[v].onstack != 0)
    324  1.1  lukem       return;
    325  1.1  lukem 
    326  1.1  lukem     s = &b->stack;
    327  1.1  lukem     if (s->slist_used == s->slist_size) {
    328  1.1  lukem         if (s->slist_size == 0)
    329  1.1  lukem           s->slist = (ucs2_t *) malloc(sizeof(ucs2_t) << 3);
    330  1.1  lukem         else
    331  1.1  lukem           s->slist = (ucs2_t *) realloc((char *) s->slist,
    332  1.1  lukem                                         sizeof(ucs2_t) * (s->slist_size + 8));
    333  1.1  lukem         s->slist_size += 8;
    334  1.1  lukem     }
    335  1.1  lukem     s->slist[s->slist_used++] = v;
    336  1.1  lukem 
    337  1.1  lukem     /*
    338  1.1  lukem      * If the `reducing' parameter is non-zero, flag the element as being on
    339  1.1  lukem      * the stack.
    340  1.1  lukem      */
    341  1.1  lukem     if (b->reducing != 0)
    342  1.1  lukem       b->expr[v].onstack = 1;
    343  1.1  lukem }
    344  1.1  lukem 
    345  1.1  lukem static ucs2_t
    346  1.1  lukem _ure_peek(_ure_buffer_t *b)
    347  1.1  lukem {
    348  1.1  lukem     if (b == 0 || b->stack.slist_used == 0)
    349  1.1  lukem       return _URE_NOOP;
    350  1.1  lukem 
    351  1.1  lukem     return b->stack.slist[b->stack.slist_used - 1];
    352  1.1  lukem }
    353  1.1  lukem 
    354  1.1  lukem static ucs2_t
    355  1.1  lukem _ure_pop(_ure_buffer_t *b)
    356  1.1  lukem {
    357  1.1  lukem     ucs2_t v;
    358  1.1  lukem 
    359  1.1  lukem     if (b == 0 || b->stack.slist_used == 0)
    360  1.1  lukem       return _URE_NOOP;
    361  1.1  lukem 
    362  1.1  lukem     v = b->stack.slist[--b->stack.slist_used];
    363  1.1  lukem     if (b->reducing)
    364  1.1  lukem       b->expr[v].onstack = 0;
    365  1.1  lukem 
    366  1.1  lukem     return v;
    367  1.1  lukem }
    368  1.1  lukem 
    369  1.1  lukem /*************************************************************************
    370  1.1  lukem  *
    371  1.1  lukem  * Start symbol parse functions.
    372  1.1  lukem  *
    373  1.1  lukem  *************************************************************************/
    374  1.1  lukem 
    375  1.1  lukem /*
    376  1.1  lukem  * Parse a comma-separated list of integers that represent character
    377  1.1  lukem  * properties.  Combine them into a mask that is returned in the `mask'
    378  1.1  lukem  * variable, and return the number of characters consumed.
    379  1.1  lukem  */
    380  1.1  lukem static unsigned long
    381  1.1  lukem _ure_prop_list(ucs2_t *pp, unsigned long limit, unsigned long *mask,
    382  1.1  lukem                _ure_buffer_t *b)
    383  1.1  lukem {
    384  1.1  lukem     unsigned long n, m;
    385  1.1  lukem     ucs2_t *sp, *ep;
    386  1.1  lukem 
    387  1.1  lukem     sp = pp;
    388  1.1  lukem     ep = sp + limit;
    389  1.1  lukem 
    390  1.1  lukem     for (m = n = 0; b->error == _URE_OK && sp < ep; sp++) {
    391  1.1  lukem         if (*sp == ',') {
    392  1.1  lukem             /*
    393  1.1  lukem              * Encountered a comma, so select the next character property flag
    394  1.1  lukem              * and reset the number.
    395  1.1  lukem              */
    396  1.1  lukem             m |= cclass_flags[n];
    397  1.1  lukem             n = 0;
    398  1.1  lukem         } else if (*sp >= '0' && *sp <= '9')
    399  1.1  lukem           /*
    400  1.1  lukem            * Encountered a digit, so start or continue building the cardinal
    401  1.1  lukem            * that represents the character property flag.
    402  1.1  lukem            */
    403  1.1  lukem           n = (n * 10) + (*sp - '0');
    404  1.1  lukem         else
    405  1.1  lukem           /*
    406  1.1  lukem            * Encountered something that is not part of the property list.
    407  1.1  lukem            * Indicate that we are done.
    408  1.1  lukem            */
    409  1.1  lukem           break;
    410  1.1  lukem 
    411  1.1  lukem         /*
    412  1.1  lukem          * If a property number greater than 32 occurs, then there is a
    413  1.1  lukem          * problem.  Most likely a missing comma separator.
    414  1.1  lukem          */
    415  1.1  lukem         if (n > 32)
    416  1.1  lukem           b->error = _URE_INVALID_PROPERTY;
    417  1.1  lukem     }
    418  1.1  lukem 
    419  1.1  lukem     if (n != 0)
    420  1.1  lukem       m |= cclass_flags[n];
    421  1.1  lukem 
    422  1.1  lukem     /*
    423  1.1  lukem      * Set the mask that represents the group of character properties.
    424  1.1  lukem      */
    425  1.1  lukem     *mask = m;
    426  1.1  lukem 
    427  1.1  lukem     /*
    428  1.1  lukem      * Return the number of characters consumed.
    429  1.1  lukem      */
    430  1.1  lukem     return sp - pp;
    431  1.1  lukem }
    432  1.1  lukem 
    433  1.1  lukem /*
    434  1.1  lukem  * Collect a hex number with 1 to 4 digits and return the number
    435  1.1  lukem  * of characters used.
    436  1.1  lukem  */
    437  1.1  lukem static unsigned long
    438  1.1  lukem _ure_hex(ucs2_t *np, unsigned long limit, ucs4_t *n)
    439  1.1  lukem {
    440  1.1  lukem     ucs2_t i;
    441  1.1  lukem     ucs2_t *sp, *ep;
    442  1.1  lukem     ucs4_t nn;
    443  1.1  lukem 
    444  1.1  lukem     sp = np;
    445  1.1  lukem     ep = sp + limit;
    446  1.1  lukem 
    447  1.1  lukem     for (nn = 0, i = 0; i < 4 && sp < ep; i++, sp++) {
    448  1.1  lukem         if (*sp >= '0' && *sp <= '9')
    449  1.1  lukem           nn = (nn << 4) + (*sp - '0');
    450  1.1  lukem         else if (*sp >= 'A' && *sp <= 'F')
    451  1.1  lukem           nn = (nn << 4) + ((*sp - 'A') + 10);
    452  1.1  lukem         else if (*sp >= 'a' && *sp <= 'f')
    453  1.1  lukem           nn = (nn << 4) + ((*sp - 'a') + 10);
    454  1.1  lukem         else
    455  1.1  lukem           /*
    456  1.1  lukem            * Encountered something that is not a hex digit.
    457  1.1  lukem            */
    458  1.1  lukem           break;
    459  1.1  lukem     }
    460  1.1  lukem 
    461  1.1  lukem     /*
    462  1.1  lukem      * Assign the character code collected and return the number of
    463  1.1  lukem      * characters used.
    464  1.1  lukem      */
    465  1.1  lukem     *n = nn;
    466  1.1  lukem 
    467  1.1  lukem     return sp - np;
    468  1.1  lukem }
    469  1.1  lukem 
    470  1.1  lukem /*
    471  1.1  lukem  * Insert a range into a character class, removing duplicates and ordering
    472  1.1  lukem  * them in increasing range-start order.
    473  1.1  lukem  */
    474  1.1  lukem static void
    475  1.1  lukem _ure_add_range(_ure_ccl_t *ccl, _ure_range_t *r, _ure_buffer_t *b)
    476  1.1  lukem {
    477  1.1  lukem     ucs2_t i;
    478  1.1  lukem     ucs4_t tmp;
    479  1.1  lukem     _ure_range_t *rp;
    480  1.1  lukem 
    481  1.1  lukem     /*
    482  1.1  lukem      * If the `casefold' flag is set, then make sure both endpoints of the
    483  1.1  lukem      * range are converted to lower case.
    484  1.1  lukem      */
    485  1.1  lukem     if (b->flags & _URE_DFA_CASEFOLD) {
    486  1.1  lukem         r->min_code = _ure_tolower(r->min_code);
    487  1.1  lukem         r->max_code = _ure_tolower(r->max_code);
    488  1.1  lukem     }
    489  1.1  lukem 
    490  1.1  lukem     /*
    491  1.1  lukem      * Swap the range endpoints if they are not in increasing order.
    492  1.1  lukem      */
    493  1.1  lukem     if (r->min_code > r->max_code) {
    494  1.1  lukem         tmp = r->min_code;
    495  1.1  lukem         r->min_code = r->max_code;
    496  1.1  lukem         r->max_code = tmp;
    497  1.1  lukem     }
    498  1.1  lukem 
    499  1.1  lukem     for (i = 0, rp = ccl->ranges;
    500  1.1  lukem          i < ccl->ranges_used && r->min_code < rp->min_code; i++, rp++) ;
    501  1.1  lukem 
    502  1.1  lukem     /*
    503  1.1  lukem      * Check for a duplicate.
    504  1.1  lukem      */
    505  1.1  lukem     if (i < ccl->ranges_used &&
    506  1.1  lukem         r->min_code == rp->min_code && r->max_code == rp->max_code)
    507  1.1  lukem       return;
    508  1.1  lukem 
    509  1.1  lukem     if (ccl->ranges_used == ccl->ranges_size) {
    510  1.1  lukem         if (ccl->ranges_size == 0)
    511  1.1  lukem           ccl->ranges = (_ure_range_t *) malloc(sizeof(_ure_range_t) << 3);
    512  1.1  lukem         else
    513  1.1  lukem           ccl->ranges = (_ure_range_t *)
    514  1.1  lukem               realloc((char *) ccl->ranges,
    515  1.1  lukem                       sizeof(_ure_range_t) * (ccl->ranges_size + 8));
    516  1.1  lukem         ccl->ranges_size += 8;
    517  1.1  lukem     }
    518  1.1  lukem 
    519  1.1  lukem     rp = ccl->ranges + ccl->ranges_used;
    520  1.1  lukem 
    521  1.1  lukem     if (i < ccl->ranges_used)
    522  1.1  lukem       _ure_memmove((char *) (rp + 1), (char *) rp,
    523  1.1  lukem                    sizeof(_ure_range_t) * (ccl->ranges_used - i));
    524  1.1  lukem 
    525  1.1  lukem     ccl->ranges_used++;
    526  1.1  lukem     rp->min_code = r->min_code;
    527  1.1  lukem     rp->max_code = r->max_code;
    528  1.1  lukem }
    529  1.1  lukem 
    530  1.1  lukem #define _URE_ALPHA_MASK  (_URE_UPPER|_URE_LOWER|_URE_OTHERLETTER|\
    531  1.1  lukem _URE_MODIFIER|_URE_TITLE|_URE_NONSPACING|_URE_COMBINING)
    532  1.1  lukem #define _URE_ALNUM_MASK  (_URE_ALPHA_MASK|_URE_NUMDIGIT)
    533  1.1  lukem #define _URE_PUNCT_MASK  (_URE_DASHPUNCT|_URE_OPENPUNCT|_URE_CLOSEPUNCT|\
    534  1.1  lukem _URE_OTHERPUNCT)
    535  1.1  lukem #define _URE_GRAPH_MASK (_URE_NUMDIGIT|_URE_NUMOTHER|_URE_ALPHA_MASK|\
    536  1.1  lukem _URE_MATHSYM|_URE_CURRENCYSYM|_URE_OTHERSYM)
    537  1.1  lukem #define _URE_PRINT_MASK (_URE_GRAPH_MASK|_URE_SPACESEP)
    538  1.1  lukem #define _URE_SPACE_MASK  (_URE_SPACESEP|_URE_LINESEP|_URE_PARASEP)
    539  1.1  lukem 
    540  1.1  lukem typedef void (*_ure_cclsetup_t)(
    541  1.1  lukem     _ure_symtab_t *sym,
    542  1.1  lukem     unsigned long mask,
    543  1.1  lukem     _ure_buffer_t *b
    544  1.1  lukem );
    545  1.1  lukem 
    546  1.1  lukem typedef struct {
    547  1.1  lukem     ucs2_t key;
    548  1.1  lukem     unsigned long len;
    549  1.1  lukem     unsigned long next;
    550  1.1  lukem     _ure_cclsetup_t func;
    551  1.1  lukem     unsigned long mask;
    552  1.1  lukem } _ure_trie_t;
    553  1.1  lukem 
    554  1.1  lukem static void
    555  1.1  lukem _ure_ccl_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b)
    556  1.1  lukem {
    557  1.1  lukem     sym->props |= mask;
    558  1.1  lukem }
    559  1.1  lukem 
    560  1.1  lukem static void
    561  1.1  lukem _ure_space_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b)
    562  1.1  lukem {
    563  1.1  lukem     _ure_range_t range;
    564  1.1  lukem 
    565  1.1  lukem     sym->props |= mask;
    566  1.1  lukem 
    567  1.1  lukem     /*
    568  1.1  lukem      * Add the additional characters needed for handling isspace().
    569  1.1  lukem      */
    570  1.1  lukem     range.min_code = range.max_code = '\t';
    571  1.1  lukem     _ure_add_range(&sym->sym.ccl, &range, b);
    572  1.1  lukem     range.min_code = range.max_code = '\r';
    573  1.1  lukem     _ure_add_range(&sym->sym.ccl, &range, b);
    574  1.1  lukem     range.min_code = range.max_code = '\n';
    575  1.1  lukem     _ure_add_range(&sym->sym.ccl, &range, b);
    576  1.1  lukem     range.min_code = range.max_code = '\f';
    577  1.1  lukem     _ure_add_range(&sym->sym.ccl, &range, b);
    578  1.1  lukem     range.min_code = range.max_code = 0xfeff;
    579  1.1  lukem     _ure_add_range(&sym->sym.ccl, &range, b);
    580  1.1  lukem }
    581  1.1  lukem 
    582  1.1  lukem static void
    583  1.1  lukem _ure_xdigit_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b)
    584  1.1  lukem {
    585  1.1  lukem     _ure_range_t range;
    586  1.1  lukem 
    587  1.1  lukem     /*
    588  1.1  lukem      * Add the additional characters needed for handling isxdigit().
    589  1.1  lukem      */
    590  1.1  lukem     range.min_code = '0';
    591  1.1  lukem     range.max_code = '9';
    592  1.1  lukem     _ure_add_range(&sym->sym.ccl, &range, b);
    593  1.1  lukem     range.min_code = 'A';
    594  1.1  lukem     range.max_code = 'F';
    595  1.1  lukem     _ure_add_range(&sym->sym.ccl, &range, b);
    596  1.1  lukem     range.min_code = 'a';
    597  1.1  lukem     range.max_code = 'f';
    598  1.1  lukem     _ure_add_range(&sym->sym.ccl, &range, b);
    599  1.1  lukem }
    600  1.1  lukem 
    601  1.1  lukem static _ure_trie_t cclass_trie[] = {
    602  1.1  lukem     {0x003a, 1, 1, 0, 0},
    603  1.1  lukem     {0x0061, 9, 10, 0, 0},
    604  1.1  lukem     {0x0063, 8, 19, 0, 0},
    605  1.1  lukem     {0x0064, 7, 24, 0, 0},
    606  1.1  lukem     {0x0067, 6, 29, 0, 0},
    607  1.1  lukem     {0x006c, 5, 34, 0, 0},
    608  1.1  lukem     {0x0070, 4, 39, 0, 0},
    609  1.1  lukem     {0x0073, 3, 49, 0, 0},
    610  1.1  lukem     {0x0075, 2, 54, 0, 0},
    611  1.1  lukem     {0x0078, 1, 59, 0, 0},
    612  1.1  lukem     {0x006c, 1, 11, 0, 0},
    613  1.1  lukem     {0x006e, 2, 13, 0, 0},
    614  1.1  lukem     {0x0070, 1, 16, 0, 0},
    615  1.1  lukem     {0x0075, 1, 14, 0, 0},
    616  1.1  lukem     {0x006d, 1, 15, 0, 0},
    617  1.1  lukem     {0x003a, 1, 16, _ure_ccl_setup, _URE_ALNUM_MASK},
    618  1.1  lukem     {0x0068, 1, 17, 0, 0},
    619  1.1  lukem     {0x0061, 1, 18, 0, 0},
    620  1.1  lukem     {0x003a, 1, 19, _ure_ccl_setup, _URE_ALPHA_MASK},
    621  1.1  lukem     {0x006e, 1, 20, 0, 0},
    622  1.1  lukem     {0x0074, 1, 21, 0, 0},
    623  1.1  lukem     {0x0072, 1, 22, 0, 0},
    624  1.1  lukem     {0x006c, 1, 23, 0, 0},
    625  1.1  lukem     {0x003a, 1, 24, _ure_ccl_setup, _URE_CNTRL},
    626  1.1  lukem     {0x0069, 1, 25, 0, 0},
    627  1.1  lukem     {0x0067, 1, 26, 0, 0},
    628  1.1  lukem     {0x0069, 1, 27, 0, 0},
    629  1.1  lukem     {0x0074, 1, 28, 0, 0},
    630  1.1  lukem     {0x003a, 1, 29, _ure_ccl_setup, _URE_NUMDIGIT},
    631  1.1  lukem     {0x0072, 1, 30, 0, 0},
    632  1.1  lukem     {0x0061, 1, 31, 0, 0},
    633  1.1  lukem     {0x0070, 1, 32, 0, 0},
    634  1.1  lukem     {0x0068, 1, 33, 0, 0},
    635  1.1  lukem     {0x003a, 1, 34, _ure_ccl_setup, _URE_GRAPH_MASK},
    636  1.1  lukem     {0x006f, 1, 35, 0, 0},
    637  1.1  lukem     {0x0077, 1, 36, 0, 0},
    638  1.1  lukem     {0x0065, 1, 37, 0, 0},
    639  1.1  lukem     {0x0072, 1, 38, 0, 0},
    640  1.1  lukem     {0x003a, 1, 39, _ure_ccl_setup, _URE_LOWER},
    641  1.1  lukem     {0x0072, 2, 41, 0, 0},
    642  1.1  lukem     {0x0075, 1, 45, 0, 0},
    643  1.1  lukem     {0x0069, 1, 42, 0, 0},
    644  1.1  lukem     {0x006e, 1, 43, 0, 0},
    645  1.1  lukem     {0x0074, 1, 44, 0, 0},
    646  1.1  lukem     {0x003a, 1, 45, _ure_ccl_setup, _URE_PRINT_MASK},
    647  1.1  lukem     {0x006e, 1, 46, 0, 0},
    648  1.1  lukem     {0x0063, 1, 47, 0, 0},
    649  1.1  lukem     {0x0074, 1, 48, 0, 0},
    650  1.1  lukem     {0x003a, 1, 49, _ure_ccl_setup, _URE_PUNCT_MASK},
    651  1.1  lukem     {0x0070, 1, 50, 0, 0},
    652  1.1  lukem     {0x0061, 1, 51, 0, 0},
    653  1.1  lukem     {0x0063, 1, 52, 0, 0},
    654  1.1  lukem     {0x0065, 1, 53, 0, 0},
    655  1.1  lukem     {0x003a, 1, 54, _ure_space_setup, _URE_SPACE_MASK},
    656  1.1  lukem     {0x0070, 1, 55, 0, 0},
    657  1.1  lukem     {0x0070, 1, 56, 0, 0},
    658  1.1  lukem     {0x0065, 1, 57, 0, 0},
    659  1.1  lukem     {0x0072, 1, 58, 0, 0},
    660  1.1  lukem     {0x003a, 1, 59, _ure_ccl_setup, _URE_UPPER},
    661  1.1  lukem     {0x0064, 1, 60, 0, 0},
    662  1.1  lukem     {0x0069, 1, 61, 0, 0},
    663  1.1  lukem     {0x0067, 1, 62, 0, 0},
    664  1.1  lukem     {0x0069, 1, 63, 0, 0},
    665  1.1  lukem     {0x0074, 1, 64, 0, 0},
    666  1.1  lukem     {0x003a, 1, 65, _ure_xdigit_setup, 0},
    667  1.1  lukem };
    668  1.1  lukem 
    669  1.1  lukem /*
    670  1.1  lukem  * Probe for one of the POSIX colon delimited character classes in the static
    671  1.1  lukem  * trie.
    672  1.1  lukem  */
    673  1.1  lukem static unsigned long
    674  1.1  lukem _ure_posix_ccl(ucs2_t *cp, unsigned long limit, _ure_symtab_t *sym,
    675  1.1  lukem                _ure_buffer_t *b)
    676  1.1  lukem {
    677  1.1  lukem     int i;
    678  1.1  lukem     unsigned long n;
    679  1.1  lukem     _ure_trie_t *tp;
    680  1.1  lukem     ucs2_t *sp, *ep;
    681  1.1  lukem 
    682  1.1  lukem     /*
    683  1.1  lukem      * If the number of characters left is less than 7, then this cannot be
    684  1.1  lukem      * interpreted as one of the colon delimited classes.
    685  1.1  lukem      */
    686  1.1  lukem     if (limit < 7)
    687  1.1  lukem       return 0;
    688  1.1  lukem 
    689  1.1  lukem     sp = cp;
    690  1.1  lukem     ep = sp + limit;
    691  1.1  lukem     tp = cclass_trie;
    692  1.1  lukem     for (i = 0; sp < ep && i < 8; i++, sp++) {
    693  1.1  lukem         n = tp->len;
    694  1.1  lukem 
    695  1.1  lukem         for (; n > 0 && tp->key != *sp; tp++, n--) ;
    696  1.1  lukem 
    697  1.1  lukem         if (n == 0)
    698  1.1  lukem           return 0;
    699  1.1  lukem 
    700  1.1  lukem         if (*sp == ':' && (i == 6 || i == 7)) {
    701  1.1  lukem             sp++;
    702  1.1  lukem             break;
    703  1.1  lukem         }
    704  1.1  lukem         if (sp + 1 < ep)
    705  1.1  lukem           tp = cclass_trie + tp->next;
    706  1.1  lukem     }
    707  1.1  lukem     if (tp->func == 0)
    708  1.1  lukem       return 0;
    709  1.1  lukem 
    710  1.1  lukem     (*tp->func)(sym, tp->mask, b);
    711  1.1  lukem 
    712  1.1  lukem     return sp - cp;
    713  1.1  lukem }
    714  1.1  lukem 
    715  1.1  lukem /*
    716  1.1  lukem  * Construct a list of ranges and return the number of characters consumed.
    717  1.1  lukem  */
    718  1.1  lukem static unsigned long
    719  1.1  lukem _ure_cclass(ucs2_t *cp, unsigned long limit, _ure_symtab_t *symp,
    720  1.1  lukem             _ure_buffer_t *b)
    721  1.1  lukem {
    722  1.1  lukem     int range_end;
    723  1.1  lukem     unsigned long n;
    724  1.1  lukem     ucs2_t *sp, *ep;
    725  1.1  lukem     ucs4_t c, last;
    726  1.1  lukem     _ure_ccl_t *cclp;
    727  1.1  lukem     _ure_range_t range;
    728  1.1  lukem 
    729  1.1  lukem     sp = cp;
    730  1.1  lukem     ep = sp + limit;
    731  1.1  lukem 
    732  1.1  lukem     if (*sp == '^') {
    733  1.1  lukem       symp->type = _URE_NCCLASS;
    734  1.1  lukem       sp++;
    735  1.1  lukem     } else
    736  1.1  lukem       symp->type = _URE_CCLASS;
    737  1.1  lukem 
    738  1.1  lukem     for (last = 0, range_end = 0;
    739  1.1  lukem          b->error == _URE_OK && sp < ep && *sp != ']'; ) {
    740  1.1  lukem         c = *sp++;
    741  1.1  lukem         if (c == '\\') {
    742  1.1  lukem             if (sp == ep) {
    743  1.1  lukem                 /*
    744  1.1  lukem                  * The EOS was encountered when expecting the reverse solidus
    745  1.1  lukem                  * to be followed by the character it is escaping.  Set an
    746  1.1  lukem                  * error code and return the number of characters consumed up
    747  1.1  lukem                  * to this point.
    748  1.1  lukem                  */
    749  1.1  lukem                 b->error = _URE_UNEXPECTED_EOS;
    750  1.1  lukem                 return sp - cp;
    751  1.1  lukem             }
    752  1.1  lukem 
    753  1.1  lukem             c = *sp++;
    754  1.1  lukem             switch (c) {
    755  1.1  lukem               case 'a':
    756  1.1  lukem                 c = 0x07;
    757  1.1  lukem                 break;
    758  1.1  lukem               case 'b':
    759  1.1  lukem                 c = 0x08;
    760  1.1  lukem                 break;
    761  1.1  lukem               case 'f':
    762  1.1  lukem                 c = 0x0c;
    763  1.1  lukem                 break;
    764  1.1  lukem               case 'n':
    765  1.1  lukem                 c = 0x0a;
    766  1.1  lukem                 break;
    767  1.1  lukem               case 'r':
    768  1.1  lukem                 c = 0x0d;
    769  1.1  lukem                 break;
    770  1.1  lukem               case 't':
    771  1.1  lukem                 c = 0x09;
    772  1.1  lukem                 break;
    773  1.1  lukem               case 'v':
    774  1.1  lukem                 c = 0x0b;
    775  1.1  lukem                 break;
    776  1.1  lukem               case 'p':
    777  1.1  lukem               case 'P':
    778  1.1  lukem                 sp += _ure_prop_list(sp, ep - sp, &symp->props, b);
    779  1.1  lukem                 /*
    780  1.1  lukem                  * Invert the bit mask of the properties if this is a negated
    781  1.1  lukem                  * character class or if 'P' is used to specify a list of
    782  1.1  lukem                  * character properties that should *not* match in a
    783  1.1  lukem                  * character class.
    784  1.1  lukem                  */
    785  1.1  lukem                 if (c == 'P')
    786  1.1  lukem                   symp->props = ~symp->props;
    787  1.1  lukem                 continue;
    788  1.1  lukem                 break;
    789  1.1  lukem               case 'x':
    790  1.1  lukem               case 'X':
    791  1.1  lukem               case 'u':
    792  1.1  lukem               case 'U':
    793  1.1  lukem                 if (sp < ep &&
    794  1.1  lukem                     ((*sp >= '0' && *sp <= '9') ||
    795  1.1  lukem                      (*sp >= 'A' && *sp <= 'F') ||
    796  1.1  lukem                      (*sp >= 'a' && *sp <= 'f')))
    797  1.1  lukem                   sp += _ure_hex(sp, ep - sp, &c);
    798  1.1  lukem             }
    799  1.1  lukem         } else if (c == ':') {
    800  1.1  lukem             /*
    801  1.1  lukem              * Probe for a POSIX colon delimited character class.
    802  1.1  lukem              */
    803  1.1  lukem             sp--;
    804  1.1  lukem             if ((n = _ure_posix_ccl(sp, ep - sp, symp, b)) == 0)
    805  1.1  lukem               sp++;
    806  1.1  lukem             else {
    807  1.1  lukem                 sp += n;
    808  1.1  lukem                 continue;
    809  1.1  lukem             }
    810  1.1  lukem         }
    811  1.1  lukem 
    812  1.1  lukem         cclp = &symp->sym.ccl;
    813  1.1  lukem 
    814  1.1  lukem         /*
    815  1.1  lukem          * Check to see if the current character is a low surrogate that needs
    816  1.1  lukem          * to be combined with a preceding high surrogate.
    817  1.1  lukem          */
    818  1.1  lukem         if (last != 0) {
    819  1.1  lukem             if (c >= 0xdc00 && c <= 0xdfff)
    820  1.1  lukem               /*
    821  1.1  lukem                * Construct the UTF16 character code.
    822  1.1  lukem                */
    823  1.1  lukem               c = 0x10000 + (((last & 0x03ff) << 10) | (c & 0x03ff));
    824  1.1  lukem             else {
    825  1.1  lukem                 /*
    826  1.1  lukem                  * Add the isolated high surrogate to the range.
    827  1.1  lukem                  */
    828  1.1  lukem                 if (range_end == 1)
    829  1.1  lukem                   range.max_code = last & 0xffff;
    830  1.1  lukem                 else
    831  1.1  lukem                   range.min_code = range.max_code = last & 0xffff;
    832  1.1  lukem 
    833  1.1  lukem                 _ure_add_range(cclp, &range, b);
    834  1.1  lukem                 range_end = 0;
    835  1.1  lukem             }
    836  1.1  lukem         }
    837  1.1  lukem 
    838  1.1  lukem         /*
    839  1.1  lukem          * Clear the last character code.
    840  1.1  lukem          */
    841  1.1  lukem         last = 0;
    842  1.1  lukem 
    843  1.1  lukem         /*
    844  1.1  lukem          * This slightly awkward code handles the different cases needed to
    845  1.1  lukem          * construct a range.
    846  1.1  lukem          */
    847  1.1  lukem         if (c >= 0xd800 && c <= 0xdbff) {
    848  1.1  lukem             /*
    849  1.1  lukem              * If the high surrogate is followed by a range indicator, simply
    850  1.1  lukem              * add it as the range start.  Otherwise, save it in case the next
    851  1.1  lukem              * character is a low surrogate.
    852  1.1  lukem              */
    853  1.1  lukem             if (*sp == '-') {
    854  1.1  lukem                 sp++;
    855  1.1  lukem                 range.min_code = c;
    856  1.1  lukem                 range_end = 1;
    857  1.1  lukem             } else
    858  1.1  lukem               last = c;
    859  1.1  lukem         } else if (range_end == 1) {
    860  1.1  lukem             range.max_code = c;
    861  1.1  lukem             _ure_add_range(cclp, &range, b);
    862  1.1  lukem             range_end = 0;
    863  1.1  lukem         } else {
    864  1.1  lukem             range.min_code = range.max_code = c;
    865  1.1  lukem             if (*sp == '-') {
    866  1.1  lukem                 sp++;
    867  1.1  lukem                 range_end = 1;
    868  1.1  lukem             } else
    869  1.1  lukem               _ure_add_range(cclp, &range, b);
    870  1.1  lukem         }
    871  1.1  lukem     }
    872  1.1  lukem 
    873  1.1  lukem     if (sp < ep && *sp == ']')
    874  1.1  lukem       sp++;
    875  1.1  lukem     else
    876  1.1  lukem       /*
    877  1.1  lukem        * The parse was not terminated by the character class close symbol
    878  1.1  lukem        * (']'), so set an error code.
    879  1.1  lukem        */
    880  1.1  lukem       b->error = _URE_CCLASS_OPEN;
    881  1.1  lukem 
    882  1.1  lukem     return sp - cp;
    883  1.1  lukem }
    884  1.1  lukem 
    885  1.1  lukem /*
    886  1.1  lukem  * Probe for a low surrogate hex code.
    887  1.1  lukem  */
    888  1.1  lukem static unsigned long
    889  1.1  lukem _ure_probe_ls(ucs2_t *ls, unsigned long limit, ucs4_t *c)
    890  1.1  lukem {
    891  1.1  lukem     ucs4_t i, code;
    892  1.1  lukem     ucs2_t *sp, *ep;
    893  1.1  lukem 
    894  1.1  lukem     for (i = code = 0, sp = ls, ep = sp + limit; i < 4 && sp < ep; sp++) {
    895  1.1  lukem         if (*sp >= '0' && *sp <= '9')
    896  1.1  lukem           code = (code << 4) + (*sp - '0');
    897  1.1  lukem         else if (*sp >= 'A' && *sp <= 'F')
    898  1.1  lukem           code = (code << 4) + ((*sp - 'A') + 10);
    899  1.1  lukem         else if (*sp >= 'a' && *sp <= 'f')
    900  1.1  lukem           code = (code << 4) + ((*sp - 'a') + 10);
    901  1.1  lukem         else
    902  1.1  lukem           break;
    903  1.1  lukem     }
    904  1.1  lukem 
    905  1.1  lukem     *c = code;
    906  1.1  lukem     return (0xdc00 <= code && code <= 0xdfff) ? sp - ls : 0;
    907  1.1  lukem }
    908  1.1  lukem 
    909  1.1  lukem static unsigned long
    910  1.1  lukem _ure_compile_symbol(ucs2_t *sym, unsigned long limit, _ure_symtab_t *symp,
    911  1.1  lukem                     _ure_buffer_t *b)
    912  1.1  lukem {
    913  1.1  lukem     ucs4_t c;
    914  1.1  lukem     ucs2_t *sp, *ep;
    915  1.1  lukem 
    916  1.1  lukem     sp = sym;
    917  1.1  lukem     ep = sym + limit;
    918  1.1  lukem 
    919  1.1  lukem     if ((c = *sp++) == '\\') {
    920  1.1  lukem 
    921  1.1  lukem         if (sp == ep) {
    922  1.1  lukem             /*
    923  1.1  lukem              * The EOS was encountered when expecting the reverse solidus to
    924  1.1  lukem              * be followed by the character it is escaping.  Set an error code
    925  1.1  lukem              * and return the number of characters consumed up to this point.
    926  1.1  lukem              */
    927  1.1  lukem             b->error = _URE_UNEXPECTED_EOS;
    928  1.1  lukem             return sp - sym;
    929  1.1  lukem         }
    930  1.1  lukem 
    931  1.1  lukem         c = *sp++;
    932  1.1  lukem         switch (c) {
    933  1.1  lukem           case 'p':
    934  1.1  lukem           case 'P':
    935  1.1  lukem             symp->type = (c == 'p') ? _URE_CCLASS : _URE_NCCLASS;
    936  1.1  lukem             sp += _ure_prop_list(sp, ep - sp, &symp->props, b);
    937  1.1  lukem             break;
    938  1.1  lukem           case 'a':
    939  1.1  lukem             symp->type = _URE_CHAR;
    940  1.1  lukem             symp->sym.chr = 0x07;
    941  1.1  lukem             break;
    942  1.1  lukem           case 'b':
    943  1.1  lukem             symp->type = _URE_CHAR;
    944  1.1  lukem             symp->sym.chr = 0x08;
    945  1.1  lukem             break;
    946  1.1  lukem           case 'f':
    947  1.1  lukem             symp->type = _URE_CHAR;
    948  1.1  lukem             symp->sym.chr = 0x0c;
    949  1.1  lukem             break;
    950  1.1  lukem           case 'n':
    951  1.1  lukem             symp->type = _URE_CHAR;
    952  1.1  lukem             symp->sym.chr = 0x0a;
    953  1.1  lukem             break;
    954  1.1  lukem           case 'r':
    955  1.1  lukem             symp->type = _URE_CHAR;
    956  1.1  lukem             symp->sym.chr = 0x0d;
    957  1.1  lukem             break;
    958  1.1  lukem           case 't':
    959  1.1  lukem             symp->type = _URE_CHAR;
    960  1.1  lukem             symp->sym.chr = 0x09;
    961  1.1  lukem             break;
    962  1.1  lukem           case 'v':
    963  1.1  lukem             symp->type = _URE_CHAR;
    964  1.1  lukem             symp->sym.chr = 0x0b;
    965  1.1  lukem             break;
    966  1.1  lukem           case 'x':
    967  1.1  lukem           case 'X':
    968  1.1  lukem           case 'u':
    969  1.1  lukem           case 'U':
    970  1.1  lukem             /*
    971  1.1  lukem              * Collect between 1 and 4 digits representing a UCS2 code.  Fall
    972  1.1  lukem              * through to the next case.
    973  1.1  lukem              */
    974  1.1  lukem             if (sp < ep &&
    975  1.1  lukem                 ((*sp >= '0' && *sp <= '9') ||
    976  1.1  lukem                  (*sp >= 'A' && *sp <= 'F') ||
    977  1.1  lukem                  (*sp >= 'a' && *sp <= 'f')))
    978  1.1  lukem               sp += _ure_hex(sp, ep - sp, &c);
    979  1.1  lukem             /* FALLTHROUGH */
    980  1.1  lukem           default:
    981  1.1  lukem             /*
    982  1.1  lukem              * Simply add an escaped character here.
    983  1.1  lukem              */
    984  1.1  lukem             symp->type = _URE_CHAR;
    985  1.1  lukem             symp->sym.chr = c;
    986  1.1  lukem         }
    987  1.1  lukem     } else if (c == '^' || c == '$')
    988  1.1  lukem       /*
    989  1.1  lukem        * Handle the BOL and EOL anchors.  This actually consists simply of
    990  1.1  lukem        * setting a flag that indicates that the user supplied anchor match
    991  1.1  lukem        * function should be called.  This needs to be done instead of simply
    992  1.1  lukem        * matching line/paragraph separators because beginning-of-text and
    993  1.1  lukem        * end-of-text tests are needed as well.
    994  1.1  lukem        */
    995  1.1  lukem       symp->type = (c == '^') ? _URE_BOL_ANCHOR : _URE_EOL_ANCHOR;
    996  1.1  lukem     else if (c == '[')
    997  1.1  lukem       /*
    998  1.1  lukem        * Construct a character class.
    999  1.1  lukem        */
   1000  1.1  lukem       sp += _ure_cclass(sp, ep - sp, symp, b);
   1001  1.1  lukem     else if (c == '.')
   1002  1.1  lukem       symp->type = _URE_ANY_CHAR;
   1003  1.1  lukem     else {
   1004  1.1  lukem         symp->type = _URE_CHAR;
   1005  1.1  lukem         symp->sym.chr = c;
   1006  1.1  lukem     }
   1007  1.1  lukem 
   1008  1.1  lukem     /*
   1009  1.1  lukem      * If the symbol type happens to be a character and is a high surrogate,
   1010  1.1  lukem      * then probe forward to see if it is followed by a low surrogate that
   1011  1.1  lukem      * needs to be added.
   1012  1.1  lukem      */
   1013  1.1  lukem     if (sp < ep && symp->type == _URE_CHAR &&
   1014  1.1  lukem         0xd800 <= symp->sym.chr && symp->sym.chr <= 0xdbff) {
   1015  1.1  lukem 
   1016  1.1  lukem         if (0xdc00 <= *sp && *sp <= 0xdfff) {
   1017  1.1  lukem             symp->sym.chr = 0x10000 + (((symp->sym.chr & 0x03ff) << 10) |
   1018  1.1  lukem                                        (*sp & 0x03ff));
   1019  1.1  lukem             sp++;
   1020  1.1  lukem         } else if (*sp == '\\' && (*(sp + 1) == 'x' || *(sp + 1) == 'X' ||
   1021  1.1  lukem                                  *(sp + 1) == 'u' || *(sp + 1) == 'U')) {
   1022  1.1  lukem             sp += _ure_probe_ls(sp + 2, ep - (sp + 2), &c);
   1023  1.1  lukem             if (0xdc00 <= c && c <= 0xdfff) {
   1024  1.1  lukem                 /*
   1025  1.1  lukem                  * Take into account the \[xu] in front of the hex code.
   1026  1.1  lukem                  */
   1027  1.1  lukem                 sp += 2;
   1028  1.1  lukem                 symp->sym.chr = 0x10000 + (((symp->sym.chr & 0x03ff) << 10) |
   1029  1.1  lukem                                            (c & 0x03ff));
   1030  1.1  lukem             }
   1031  1.1  lukem         }
   1032  1.1  lukem     }
   1033  1.1  lukem 
   1034  1.1  lukem     /*
   1035  1.1  lukem      * Last, make sure any _URE_CHAR type symbols are changed to lower case if
   1036  1.1  lukem      * the `casefold' flag is set.
   1037  1.1  lukem      */
   1038  1.1  lukem     if ((b->flags & _URE_DFA_CASEFOLD) && symp->type == _URE_CHAR)
   1039  1.1  lukem       symp->sym.chr = _ure_tolower(symp->sym.chr);
   1040  1.1  lukem 
   1041  1.1  lukem     /*
   1042  1.1  lukem      * If the symbol constructed is anything other than one of the anchors,
   1043  1.1  lukem      * make sure the _URE_DFA_BLANKLINE flag is removed.
   1044  1.1  lukem      */
   1045  1.1  lukem     if (symp->type != _URE_BOL_ANCHOR && symp->type != _URE_EOL_ANCHOR)
   1046  1.1  lukem       b->flags &= ~_URE_DFA_BLANKLINE;
   1047  1.1  lukem 
   1048  1.1  lukem     /*
   1049  1.1  lukem      * Return the number of characters consumed.
   1050  1.1  lukem      */
   1051  1.1  lukem     return sp - sym;
   1052  1.1  lukem }
   1053  1.1  lukem 
   1054  1.1  lukem static int
   1055  1.1  lukem _ure_sym_neq(_ure_symtab_t *a, _ure_symtab_t *b)
   1056  1.1  lukem {
   1057  1.1  lukem     if (a->type != b->type || a->mods != b->mods || a->props != b->props)
   1058  1.1  lukem       return 1;
   1059  1.1  lukem 
   1060  1.1  lukem     if (a->type == _URE_CCLASS || a->type == _URE_NCCLASS) {
   1061  1.1  lukem         if (a->sym.ccl.ranges_used != b->sym.ccl.ranges_used)
   1062  1.1  lukem           return 1;
   1063  1.1  lukem         if (a->sym.ccl.ranges_used > 0 &&
   1064  1.1  lukem             memcmp((char *) a->sym.ccl.ranges, (char *) b->sym.ccl.ranges,
   1065  1.1  lukem                    sizeof(_ure_range_t) * a->sym.ccl.ranges_used) != 0)
   1066  1.1  lukem           return 1;
   1067  1.1  lukem     } else if (a->type == _URE_CHAR && a->sym.chr != b->sym.chr)
   1068  1.1  lukem       return 1;
   1069  1.1  lukem     return 0;
   1070  1.1  lukem }
   1071  1.1  lukem 
   1072  1.1  lukem /*
   1073  1.1  lukem  * Construct a symbol, but only keep unique symbols.
   1074  1.1  lukem  */
   1075  1.1  lukem static ucs2_t
   1076  1.1  lukem _ure_make_symbol(ucs2_t *sym, unsigned long limit, unsigned long *consumed,
   1077  1.1  lukem                  _ure_buffer_t *b)
   1078  1.1  lukem {
   1079  1.1  lukem     ucs2_t i;
   1080  1.1  lukem     _ure_symtab_t *sp, symbol;
   1081  1.1  lukem 
   1082  1.1  lukem     /*
   1083  1.1  lukem      * Build the next symbol so we can test to see if it is already in the
   1084  1.1  lukem      * symbol table.
   1085  1.1  lukem      */
   1086  1.1  lukem     (void) memset((char *) &symbol, '\0', sizeof(_ure_symtab_t));
   1087  1.1  lukem     *consumed = _ure_compile_symbol(sym, limit, &symbol, b);
   1088  1.1  lukem 
   1089  1.1  lukem     /*
   1090  1.1  lukem      * Check to see if the symbol exists.
   1091  1.1  lukem      */
   1092  1.1  lukem     for (i = 0, sp = b->symtab;
   1093  1.1  lukem          i < b->symtab_used && _ure_sym_neq(&symbol, sp); i++, sp++) ;
   1094  1.1  lukem 
   1095  1.1  lukem     if (i < b->symtab_used) {
   1096  1.1  lukem         /*
   1097  1.1  lukem          * Free up any ranges used for the symbol.
   1098  1.1  lukem          */
   1099  1.1  lukem         if ((symbol.type == _URE_CCLASS || symbol.type == _URE_NCCLASS) &&
   1100  1.1  lukem             symbol.sym.ccl.ranges_size > 0)
   1101  1.1  lukem           free((char *) symbol.sym.ccl.ranges);
   1102  1.1  lukem 
   1103  1.1  lukem         return b->symtab[i].id;
   1104  1.1  lukem     }
   1105  1.1  lukem 
   1106  1.1  lukem     /*
   1107  1.1  lukem      * Need to add the new symbol.
   1108  1.1  lukem      */
   1109  1.1  lukem     if (b->symtab_used == b->symtab_size) {
   1110  1.1  lukem         if (b->symtab_size == 0)
   1111  1.1  lukem           b->symtab = (_ure_symtab_t *) malloc(sizeof(_ure_symtab_t) << 3);
   1112  1.1  lukem         else
   1113  1.1  lukem           b->symtab = (_ure_symtab_t *)
   1114  1.1  lukem               realloc((char *) b->symtab,
   1115  1.1  lukem                       sizeof(_ure_symtab_t) * (b->symtab_size + 8));
   1116  1.1  lukem         sp = b->symtab + b->symtab_size;
   1117  1.1  lukem         (void) memset((char *) sp, '\0', sizeof(_ure_symtab_t) << 3);
   1118  1.1  lukem         b->symtab_size += 8;
   1119  1.1  lukem     }
   1120  1.1  lukem 
   1121  1.1  lukem     symbol.id = b->symtab_used++;
   1122  1.1  lukem     (void) AC_MEMCPY((char *) &b->symtab[symbol.id], (char *) &symbol,
   1123  1.1  lukem                   sizeof(_ure_symtab_t));
   1124  1.1  lukem 
   1125  1.1  lukem     return symbol.id;
   1126  1.1  lukem }
   1127  1.1  lukem 
   1128  1.1  lukem /*************************************************************************
   1129  1.1  lukem  *
   1130  1.1  lukem  * End symbol parse functions.
   1131  1.1  lukem  *
   1132  1.1  lukem  *************************************************************************/
   1133  1.1  lukem 
   1134  1.1  lukem static ucs2_t
   1135  1.1  lukem _ure_make_expr(ucs2_t type, ucs2_t lhs, ucs2_t rhs, _ure_buffer_t *b)
   1136  1.1  lukem {
   1137  1.1  lukem     ucs2_t i;
   1138  1.1  lukem 
   1139  1.1  lukem     if (b == 0)
   1140  1.1  lukem       return _URE_NOOP;
   1141  1.1  lukem 
   1142  1.1  lukem     /*
   1143  1.1  lukem      * Determine if the expression already exists or not.
   1144  1.1  lukem      */
   1145  1.1  lukem     for (i = 0; i < b->expr_used; i++) {
   1146  1.1  lukem         if (b->expr[i].type == type && b->expr[i].lhs == lhs &&
   1147  1.1  lukem             b->expr[i].rhs == rhs)
   1148  1.1  lukem           break;
   1149  1.1  lukem     }
   1150  1.1  lukem     if (i < b->expr_used)
   1151  1.1  lukem       return i;
   1152  1.1  lukem 
   1153  1.1  lukem     /*
   1154  1.1  lukem      * Need to add a new expression.
   1155  1.1  lukem      */
   1156  1.1  lukem     if (b->expr_used == b->expr_size) {
   1157  1.1  lukem         if (b->expr_size == 0)
   1158  1.1  lukem           b->expr = (_ure_elt_t *) malloc(sizeof(_ure_elt_t) << 3);
   1159  1.1  lukem         else
   1160  1.1  lukem           b->expr = (_ure_elt_t *)
   1161  1.1  lukem               realloc((char *) b->expr,
   1162  1.1  lukem                       sizeof(_ure_elt_t) * (b->expr_size + 8));
   1163  1.1  lukem         b->expr_size += 8;
   1164  1.1  lukem     }
   1165  1.1  lukem 
   1166  1.1  lukem     b->expr[b->expr_used].onstack = 0;
   1167  1.1  lukem     b->expr[b->expr_used].type = type;
   1168  1.1  lukem     b->expr[b->expr_used].lhs = lhs;
   1169  1.1  lukem     b->expr[b->expr_used].rhs = rhs;
   1170  1.1  lukem 
   1171  1.1  lukem     return b->expr_used++;
   1172  1.1  lukem }
   1173  1.1  lukem 
   1174  1.1  lukem static unsigned char spmap[] = {
   1175  1.1  lukem     0x00, 0x00, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00,
   1176  1.1  lukem     0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   1177  1.1  lukem     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   1178  1.1  lukem };
   1179  1.1  lukem 
   1180  1.1  lukem #define _ure_isspecial(cc) ((cc) > 0x20 && (cc) < 0x7f && \
   1181  1.1  lukem                             (spmap[(cc) >> 3] & (1 << ((cc) & 7))))
   1182  1.1  lukem 
   1183  1.1  lukem /*
   1184  1.1  lukem  * Convert the regular expression into an NFA in a form that will be easy to
   1185  1.1  lukem  * reduce to a DFA.  The starting state for the reduction will be returned.
   1186  1.1  lukem  */
   1187  1.1  lukem static ucs2_t
   1188  1.1  lukem _ure_re2nfa(ucs2_t *re, unsigned long relen, _ure_buffer_t *b)
   1189  1.1  lukem {
   1190  1.1  lukem     ucs2_t c, state, top, sym, *sp, *ep;
   1191  1.1  lukem     unsigned long used;
   1192  1.1  lukem 
   1193  1.1  lukem     state = _URE_NOOP;
   1194  1.1  lukem 
   1195  1.1  lukem     sp = re;
   1196  1.1  lukem     ep = sp + relen;
   1197  1.1  lukem     while (b->error == _URE_OK && sp < ep) {
   1198  1.1  lukem         c = *sp++;
   1199  1.1  lukem         switch (c) {
   1200  1.1  lukem           case '(':
   1201  1.1  lukem             _ure_push(_URE_PAREN, b);
   1202  1.1  lukem             break;
   1203  1.1  lukem           case ')':
   1204  1.1  lukem             /*
   1205  1.1  lukem              * Check for the case of too many close parentheses.
   1206  1.1  lukem              */
   1207  1.1  lukem             if (_ure_peek(b) == _URE_NOOP) {
   1208  1.1  lukem                 b->error = _URE_UNBALANCED_GROUP;
   1209  1.1  lukem                 break;
   1210  1.1  lukem             }
   1211  1.1  lukem 
   1212  1.1  lukem             while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR)
   1213  1.1  lukem               /*
   1214  1.1  lukem                * Make an expression with the AND or OR operator and its right
   1215  1.1  lukem                * hand side.
   1216  1.1  lukem                */
   1217  1.1  lukem               state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b);
   1218  1.1  lukem 
   1219  1.1  lukem             /*
   1220  1.1  lukem              * Remove the _URE_PAREN off the stack.
   1221  1.1  lukem              */
   1222  1.1  lukem             (void) _ure_pop(b);
   1223  1.1  lukem             break;
   1224  1.1  lukem           case '*':
   1225  1.1  lukem             state = _ure_make_expr(_URE_STAR, state, _URE_NOOP, b);
   1226  1.1  lukem             break;
   1227  1.1  lukem           case '+':
   1228  1.1  lukem             state = _ure_make_expr(_URE_PLUS, state, _URE_NOOP, b);
   1229  1.1  lukem             break;
   1230  1.1  lukem           case '?':
   1231  1.1  lukem             state = _ure_make_expr(_URE_QUEST, state, _URE_NOOP, b);
   1232  1.1  lukem             break;
   1233  1.1  lukem           case '|':
   1234  1.1  lukem             while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR)
   1235  1.1  lukem               /*
   1236  1.1  lukem                * Make an expression with the AND or OR operator and its right
   1237  1.1  lukem                * hand side.
   1238  1.1  lukem                */
   1239  1.1  lukem               state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b);
   1240  1.1  lukem 
   1241  1.1  lukem             _ure_push(state, b);
   1242  1.1  lukem             _ure_push(_URE_OR, b);
   1243  1.1  lukem             break;
   1244  1.1  lukem           default:
   1245  1.1  lukem             sp--;
   1246  1.1  lukem             sym = _ure_make_symbol(sp, ep - sp, &used, b);
   1247  1.1  lukem             sp += used;
   1248  1.1  lukem             state = _ure_make_expr(_URE_SYMBOL, sym, _URE_NOOP, b);
   1249  1.1  lukem             break;
   1250  1.1  lukem         }
   1251  1.1  lukem 
   1252  1.1  lukem         if (c != '(' && c != '|' && sp < ep &&
   1253  1.1  lukem             (!_ure_isspecial(*sp) || *sp == '(')) {
   1254  1.1  lukem             _ure_push(state, b);
   1255  1.1  lukem             _ure_push(_URE_AND, b);
   1256  1.1  lukem         }
   1257  1.1  lukem     }
   1258  1.1  lukem     while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR)
   1259  1.1  lukem       /*
   1260  1.1  lukem        * Make an expression with the AND or OR operator and its right
   1261  1.1  lukem        * hand side.
   1262  1.1  lukem        */
   1263  1.1  lukem       state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b);
   1264  1.1  lukem 
   1265  1.1  lukem     if (b->stack.slist_used > 0)
   1266  1.1  lukem       b->error = _URE_UNBALANCED_GROUP;
   1267  1.1  lukem 
   1268  1.1  lukem     return (b->error == _URE_OK) ? state : _URE_NOOP;
   1269  1.1  lukem }
   1270  1.1  lukem 
   1271  1.1  lukem static void
   1272  1.1  lukem _ure_add_symstate(ucs2_t sym, ucs2_t state, _ure_buffer_t *b)
   1273  1.1  lukem {
   1274  1.1  lukem     ucs2_t i, *stp;
   1275  1.1  lukem     _ure_symtab_t *sp;
   1276  1.1  lukem 
   1277  1.1  lukem     /*
   1278  1.1  lukem      * Locate the symbol in the symbol table so the state can be added.
   1279  1.1  lukem      * If the symbol doesn't exist, then a real problem exists.
   1280  1.1  lukem      */
   1281  1.1  lukem     for (i = 0, sp = b->symtab; i < b->symtab_used && sym != sp->id;
   1282  1.1  lukem          i++, sp++) ;
   1283  1.1  lukem 
   1284  1.1  lukem     /*
   1285  1.1  lukem      * Now find out if the state exists in the symbol's state list.
   1286  1.1  lukem      */
   1287  1.1  lukem     for (i = 0, stp = sp->states.slist;
   1288  1.1  lukem          i < sp->states.slist_used && state > *stp; i++, stp++) ;
   1289  1.1  lukem 
   1290  1.1  lukem     if (i == sp->states.slist_used || state < *stp) {
   1291  1.1  lukem         /*
   1292  1.1  lukem          * Need to add the state in order.
   1293  1.1  lukem          */
   1294  1.1  lukem         if (sp->states.slist_used == sp->states.slist_size) {
   1295  1.1  lukem             if (sp->states.slist_size == 0)
   1296  1.1  lukem               sp->states.slist = (ucs2_t *) malloc(sizeof(ucs2_t) << 3);
   1297  1.1  lukem             else
   1298  1.1  lukem               sp->states.slist = (ucs2_t *)
   1299  1.1  lukem                   realloc((char *) sp->states.slist,
   1300  1.1  lukem                           sizeof(ucs2_t) * (sp->states.slist_size + 8));
   1301  1.1  lukem             sp->states.slist_size += 8;
   1302  1.1  lukem         }
   1303  1.1  lukem         if (i < sp->states.slist_used)
   1304  1.1  lukem           (void) _ure_memmove((char *) (sp->states.slist + i + 1),
   1305  1.1  lukem                               (char *) (sp->states.slist + i),
   1306  1.1  lukem                               sizeof(ucs2_t) * (sp->states.slist_used - i));
   1307  1.1  lukem         sp->states.slist[i] = state;
   1308  1.1  lukem         sp->states.slist_used++;
   1309  1.1  lukem     }
   1310  1.1  lukem }
   1311  1.1  lukem 
   1312  1.1  lukem static ucs2_t
   1313  1.1  lukem _ure_add_state(ucs2_t nstates, ucs2_t *states, _ure_buffer_t *b)
   1314  1.1  lukem {
   1315  1.1  lukem     ucs2_t i;
   1316  1.1  lukem     _ure_state_t *sp;
   1317  1.1  lukem 
   1318  1.1  lukem     for (i = 0, sp = b->states.states; i < b->states.states_used; i++, sp++) {
   1319  1.1  lukem         if (sp->st.slist_used == nstates &&
   1320  1.1  lukem             memcmp((char *) states, (char *) sp->st.slist,
   1321  1.1  lukem                    sizeof(ucs2_t) * nstates) == 0)
   1322  1.1  lukem           break;
   1323  1.1  lukem     }
   1324  1.1  lukem 
   1325  1.1  lukem     if (i == b->states.states_used) {
   1326  1.1  lukem         /*
   1327  1.1  lukem          * Need to add a new DFA state (set of NFA states).
   1328  1.1  lukem          */
   1329  1.1  lukem         if (b->states.states_used == b->states.states_size) {
   1330  1.1  lukem             if (b->states.states_size == 0)
   1331  1.1  lukem               b->states.states = (_ure_state_t *)
   1332  1.1  lukem                   malloc(sizeof(_ure_state_t) << 3);
   1333  1.1  lukem             else
   1334  1.1  lukem               b->states.states = (_ure_state_t *)
   1335  1.1  lukem                   realloc((char *) b->states.states,
   1336  1.1  lukem                           sizeof(_ure_state_t) * (b->states.states_size + 8));
   1337  1.1  lukem             sp = b->states.states + b->states.states_size;
   1338  1.1  lukem             (void) memset((char *) sp, '\0', sizeof(_ure_state_t) << 3);
   1339  1.1  lukem             b->states.states_size += 8;
   1340  1.1  lukem         }
   1341  1.1  lukem 
   1342  1.1  lukem         sp = b->states.states + b->states.states_used++;
   1343  1.1  lukem         sp->id = i;
   1344  1.1  lukem 
   1345  1.1  lukem         if (sp->st.slist_used + nstates > sp->st.slist_size) {
   1346  1.1  lukem             if (sp->st.slist_size == 0)
   1347  1.1  lukem               sp->st.slist = (ucs2_t *)
   1348  1.1  lukem                   malloc(sizeof(ucs2_t) * (sp->st.slist_used + nstates));
   1349  1.1  lukem             else
   1350  1.1  lukem               sp->st.slist = (ucs2_t *)
   1351  1.1  lukem                   realloc((char *) sp->st.slist,
   1352  1.1  lukem                           sizeof(ucs2_t) * (sp->st.slist_used + nstates));
   1353  1.1  lukem             sp->st.slist_size = sp->st.slist_used + nstates;
   1354  1.1  lukem         }
   1355  1.1  lukem         sp->st.slist_used = nstates;
   1356  1.1  lukem         (void) AC_MEMCPY((char *) sp->st.slist, (char *) states,
   1357  1.1  lukem                       sizeof(ucs2_t) * nstates);
   1358  1.1  lukem     }
   1359  1.1  lukem 
   1360  1.1  lukem     /*
   1361  1.1  lukem      * Return the ID of the DFA state representing a group of NFA states.
   1362  1.1  lukem      */
   1363  1.1  lukem     return i;
   1364  1.1  lukem }
   1365  1.1  lukem 
   1366  1.1  lukem static void
   1367  1.1  lukem _ure_reduce(ucs2_t start, _ure_buffer_t *b)
   1368  1.1  lukem {
   1369  1.1  lukem     ucs2_t i, j, state, eval, syms, rhs;
   1370  1.1  lukem     ucs2_t s1, s2, ns1, ns2;
   1371  1.1  lukem     _ure_state_t *sp;
   1372  1.1  lukem     _ure_symtab_t *smp;
   1373  1.1  lukem 
   1374  1.1  lukem     b->reducing = 1;
   1375  1.1  lukem 
   1376  1.1  lukem     /*
   1377  1.1  lukem      * Add the starting state for the reduction.
   1378  1.1  lukem      */
   1379  1.1  lukem     _ure_add_state(1, &start, b);
   1380  1.1  lukem 
   1381  1.1  lukem     /*
   1382  1.1  lukem      * Process each set of NFA states that get created.
   1383  1.1  lukem      */
   1384  1.1  lukem     for (i = 0; i < b->states.states_used; i++) {
   1385  1.1  lukem         sp = b->states.states + i;
   1386  1.1  lukem 
   1387  1.1  lukem         /*
   1388  1.1  lukem          * Push the current states on the stack.
   1389  1.1  lukem          */
   1390  1.1  lukem         for (j = 0; j < sp->st.slist_used; j++)
   1391  1.1  lukem           _ure_push(sp->st.slist[j], b);
   1392  1.1  lukem 
   1393  1.1  lukem         /*
   1394  1.1  lukem          * Reduce the NFA states.
   1395  1.1  lukem          */
   1396  1.1  lukem         for (j = sp->accepting = syms = 0; j < b->stack.slist_used; j++) {
   1397  1.1  lukem             state = b->stack.slist[j];
   1398  1.1  lukem             eval = 1;
   1399  1.1  lukem 
   1400  1.1  lukem             /*
   1401  1.1  lukem              * This inner loop is the iterative equivalent of recursively
   1402  1.1  lukem              * reducing subexpressions generated as a result of a reduction.
   1403  1.1  lukem              */
   1404  1.1  lukem             while (eval) {
   1405  1.1  lukem                 switch (b->expr[state].type) {
   1406  1.1  lukem                   case _URE_SYMBOL:
   1407  1.1  lukem                     ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b);
   1408  1.1  lukem                     _ure_add_symstate(b->expr[state].lhs, ns1, b);
   1409  1.1  lukem                     syms++;
   1410  1.1  lukem                     eval = 0;
   1411  1.1  lukem                     break;
   1412  1.1  lukem                   case _URE_ONE:
   1413  1.1  lukem                     sp->accepting = 1;
   1414  1.1  lukem                     eval = 0;
   1415  1.1  lukem                     break;
   1416  1.1  lukem                   case _URE_QUEST:
   1417  1.1  lukem                     s1 = b->expr[state].lhs;
   1418  1.1  lukem                     ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b);
   1419  1.1  lukem                     state = _ure_make_expr(_URE_OR, ns1, s1, b);
   1420  1.1  lukem                     break;
   1421  1.1  lukem                   case _URE_PLUS:
   1422  1.1  lukem                     s1 = b->expr[state].lhs;
   1423  1.1  lukem                     ns1 = _ure_make_expr(_URE_STAR, s1, _URE_NOOP, b);
   1424  1.1  lukem                     state = _ure_make_expr(_URE_AND, s1, ns1, b);
   1425  1.1  lukem                     break;
   1426  1.1  lukem                   case _URE_STAR:
   1427  1.1  lukem                     s1 = b->expr[state].lhs;
   1428  1.1  lukem                     ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b);
   1429  1.1  lukem                     ns2 = _ure_make_expr(_URE_PLUS, s1, _URE_NOOP, b);
   1430  1.1  lukem                     state = _ure_make_expr(_URE_OR, ns1, ns2, b);
   1431  1.1  lukem                     break;
   1432  1.1  lukem                   case _URE_OR:
   1433  1.1  lukem                     s1 = b->expr[state].lhs;
   1434  1.1  lukem                     s2 = b->expr[state].rhs;
   1435  1.1  lukem                     _ure_push(s1, b);
   1436  1.1  lukem                     _ure_push(s2, b);
   1437  1.1  lukem                     eval = 0;
   1438  1.1  lukem                     break;
   1439  1.1  lukem                   case _URE_AND:
   1440  1.1  lukem                     s1 = b->expr[state].lhs;
   1441  1.1  lukem                     s2 = b->expr[state].rhs;
   1442  1.1  lukem                     switch (b->expr[s1].type) {
   1443  1.1  lukem                       case _URE_SYMBOL:
   1444  1.1  lukem                         _ure_add_symstate(b->expr[s1].lhs, s2, b);
   1445  1.1  lukem                         syms++;
   1446  1.1  lukem                         eval = 0;
   1447  1.1  lukem                         break;
   1448  1.1  lukem                       case _URE_ONE:
   1449  1.1  lukem                         state = s2;
   1450  1.1  lukem                         break;
   1451  1.1  lukem                       case _URE_QUEST:
   1452  1.1  lukem                         ns1 = b->expr[s1].lhs;
   1453  1.1  lukem                         ns2 = _ure_make_expr(_URE_AND, ns1, s2, b);
   1454  1.1  lukem                         state = _ure_make_expr(_URE_OR, s2, ns2, b);
   1455  1.1  lukem                         break;
   1456  1.1  lukem                       case _URE_PLUS:
   1457  1.1  lukem                         ns1 = b->expr[s1].lhs;
   1458  1.1  lukem                         ns2 = _ure_make_expr(_URE_OR, s2, state, b);
   1459  1.1  lukem                         state = _ure_make_expr(_URE_AND, ns1, ns2, b);
   1460  1.1  lukem                         break;
   1461  1.1  lukem                       case _URE_STAR:
   1462  1.1  lukem                         ns1 = b->expr[s1].lhs;
   1463  1.1  lukem                         ns2 = _ure_make_expr(_URE_AND, ns1, state, b);
   1464  1.1  lukem                         state = _ure_make_expr(_URE_OR, s2, ns2, b);
   1465  1.1  lukem                         break;
   1466  1.1  lukem                       case _URE_OR:
   1467  1.1  lukem                         ns1 = b->expr[s1].lhs;
   1468  1.1  lukem                         ns2 = b->expr[s1].rhs;
   1469  1.1  lukem                         ns1 = _ure_make_expr(_URE_AND, ns1, s2, b);
   1470  1.1  lukem                         ns2 = _ure_make_expr(_URE_AND, ns2, s2, b);
   1471  1.1  lukem                         state = _ure_make_expr(_URE_OR, ns1, ns2, b);
   1472  1.1  lukem                         break;
   1473  1.1  lukem                       case _URE_AND:
   1474  1.1  lukem                         ns1 = b->expr[s1].lhs;
   1475  1.1  lukem                         ns2 = b->expr[s1].rhs;
   1476  1.1  lukem                         ns2 = _ure_make_expr(_URE_AND, ns2, s2, b);
   1477  1.1  lukem                         state = _ure_make_expr(_URE_AND, ns1, ns2, b);
   1478  1.1  lukem                         break;
   1479  1.1  lukem                     }
   1480  1.1  lukem                 }
   1481  1.1  lukem             }
   1482  1.1  lukem         }
   1483  1.1  lukem 
   1484  1.1  lukem         /*
   1485  1.1  lukem          * Clear the state stack.
   1486  1.1  lukem          */
   1487  1.1  lukem         while (_ure_pop(b) != _URE_NOOP) ;
   1488  1.1  lukem 
   1489  1.1  lukem         /*
   1490  1.1  lukem          * Reset the state pointer because the reduction may have moved it
   1491  1.1  lukem          * during a reallocation.
   1492  1.1  lukem          */
   1493  1.1  lukem         sp = b->states.states + i;
   1494  1.1  lukem 
   1495  1.1  lukem         /*
   1496  1.1  lukem          * Generate the DFA states for the symbols collected during the
   1497  1.1  lukem          * current reduction.
   1498  1.1  lukem          */
   1499  1.1  lukem         if (sp->trans_used + syms > sp->trans_size) {
   1500  1.1  lukem             if (sp->trans_size == 0)
   1501  1.1  lukem               sp->trans = (_ure_elt_t *)
   1502  1.1  lukem                   malloc(sizeof(_ure_elt_t) * (sp->trans_used + syms));
   1503  1.1  lukem             else
   1504  1.1  lukem               sp->trans = (_ure_elt_t *)
   1505  1.1  lukem                   realloc((char *) sp->trans,
   1506  1.1  lukem                           sizeof(_ure_elt_t) * (sp->trans_used + syms));
   1507  1.1  lukem             sp->trans_size = sp->trans_used + syms;
   1508  1.1  lukem         }
   1509  1.1  lukem 
   1510  1.1  lukem         /*
   1511  1.1  lukem          * Go through the symbol table and generate the DFA state transitions
   1512  1.1  lukem          * for each symbol that has collected NFA states.
   1513  1.1  lukem          */
   1514  1.1  lukem         for (j = syms = 0, smp = b->symtab; j < b->symtab_used; j++, smp++) {
   1515  1.1  lukem             sp = b->states.states + i;
   1516  1.1  lukem 
   1517  1.1  lukem             if (smp->states.slist_used > 0) {
   1518  1.1  lukem                 sp->trans[syms].lhs = smp->id;
   1519  1.1  lukem                 rhs = _ure_add_state(smp->states.slist_used,
   1520  1.1  lukem                                      smp->states.slist, b);
   1521  1.1  lukem                 /*
   1522  1.1  lukem                  * Reset the state pointer in case the reallocation moves it
   1523  1.1  lukem                  * in memory.
   1524  1.1  lukem                  */
   1525  1.1  lukem                 sp = b->states.states + i;
   1526  1.1  lukem                 sp->trans[syms].rhs = rhs;
   1527  1.1  lukem 
   1528  1.1  lukem                 smp->states.slist_used = 0;
   1529  1.1  lukem                 syms++;
   1530  1.1  lukem             }
   1531  1.1  lukem         }
   1532  1.1  lukem 
   1533  1.1  lukem         /*
   1534  1.1  lukem          * Set the number of transitions actually used.
   1535  1.1  lukem          */
   1536  1.1  lukem         sp->trans_used = syms;
   1537  1.1  lukem     }
   1538  1.1  lukem     b->reducing = 0;
   1539  1.1  lukem }
   1540  1.1  lukem 
   1541  1.1  lukem static void
   1542  1.1  lukem _ure_add_equiv(ucs2_t l, ucs2_t r, _ure_buffer_t *b)
   1543  1.1  lukem {
   1544  1.1  lukem     ucs2_t tmp;
   1545  1.1  lukem 
   1546  1.1  lukem     l = b->states.states[l].id;
   1547  1.1  lukem     r = b->states.states[r].id;
   1548  1.1  lukem 
   1549  1.1  lukem     if (l == r)
   1550  1.1  lukem       return;
   1551  1.1  lukem 
   1552  1.1  lukem     if (l > r) {
   1553  1.1  lukem         tmp = l;
   1554  1.1  lukem         l = r;
   1555  1.1  lukem         r = tmp;
   1556  1.1  lukem     }
   1557  1.1  lukem 
   1558  1.1  lukem     /*
   1559  1.1  lukem      * Check to see if the equivalence pair already exists.
   1560  1.1  lukem      */
   1561  1.1  lukem     for (tmp = 0; tmp < b->equiv_used &&
   1562  1.1  lukem              (b->equiv[tmp].l != l || b->equiv[tmp].r != r);
   1563  1.1  lukem          tmp++) ;
   1564  1.1  lukem 
   1565  1.1  lukem     if (tmp < b->equiv_used)
   1566  1.1  lukem       return;
   1567  1.1  lukem 
   1568  1.1  lukem     if (b->equiv_used == b->equiv_size) {
   1569  1.1  lukem         if (b->equiv_size == 0)
   1570  1.1  lukem           b->equiv = (_ure_equiv_t *) malloc(sizeof(_ure_equiv_t) << 3);
   1571  1.1  lukem         else
   1572  1.1  lukem           b->equiv = (_ure_equiv_t *) realloc((char *) b->equiv,
   1573  1.1  lukem                                               sizeof(_ure_equiv_t) *
   1574  1.1  lukem                                               (b->equiv_size + 8));
   1575  1.1  lukem         b->equiv_size += 8;
   1576  1.1  lukem     }
   1577  1.1  lukem     b->equiv[b->equiv_used].l = l;
   1578  1.1  lukem     b->equiv[b->equiv_used].r = r;
   1579  1.1  lukem     b->equiv_used++;
   1580  1.1  lukem }
   1581  1.1  lukem 
   1582  1.1  lukem /*
   1583  1.1  lukem  * Merge the DFA states that are equivalent.
   1584  1.1  lukem  */
   1585  1.1  lukem static void
   1586  1.1  lukem _ure_merge_equiv(_ure_buffer_t *b)
   1587  1.1  lukem {
   1588  1.1  lukem     ucs2_t i, j, k, eq, done;
   1589  1.1  lukem     _ure_state_t *sp1, *sp2, *ls, *rs;
   1590  1.1  lukem 
   1591  1.1  lukem     for (i = 0; i < b->states.states_used; i++) {
   1592  1.1  lukem         sp1 = b->states.states + i;
   1593  1.1  lukem         if (sp1->id != i)
   1594  1.1  lukem           continue;
   1595  1.1  lukem         for (j = 0; j < i; j++) {
   1596  1.1  lukem             sp2 = b->states.states + j;
   1597  1.1  lukem             if (sp2->id != j)
   1598  1.1  lukem               continue;
   1599  1.1  lukem             b->equiv_used = 0;
   1600  1.1  lukem             _ure_add_equiv(i, j, b);
   1601  1.1  lukem             for (eq = 0, done = 0; eq < b->equiv_used; eq++) {
   1602  1.1  lukem                 ls = b->states.states + b->equiv[eq].l;
   1603  1.1  lukem                 rs = b->states.states + b->equiv[eq].r;
   1604  1.1  lukem                 if (ls->accepting != rs->accepting ||
   1605  1.1  lukem                     ls->trans_used != rs->trans_used) {
   1606  1.1  lukem                     done = 1;
   1607  1.1  lukem                     break;
   1608  1.1  lukem                 }
   1609  1.1  lukem                 for (k = 0; k < ls->trans_used &&
   1610  1.1  lukem                          ls->trans[k].lhs == rs->trans[k].lhs; k++) ;
   1611  1.1  lukem                 if (k < ls->trans_used) {
   1612  1.1  lukem                     done = 1;
   1613  1.1  lukem                     break;
   1614  1.1  lukem                 }
   1615  1.1  lukem 
   1616  1.1  lukem                 for (k = 0; k < ls->trans_used; k++)
   1617  1.1  lukem                   _ure_add_equiv(ls->trans[k].rhs, rs->trans[k].rhs, b);
   1618  1.1  lukem             }
   1619  1.1  lukem             if (done == 0)
   1620  1.1  lukem               break;
   1621  1.1  lukem         }
   1622  1.1  lukem         for (eq = 0; j < i && eq < b->equiv_used; eq++)
   1623  1.1  lukem           b->states.states[b->equiv[eq].r].id =
   1624  1.1  lukem               b->states.states[b->equiv[eq].l].id;
   1625  1.1  lukem     }
   1626  1.1  lukem 
   1627  1.1  lukem     /*
   1628  1.1  lukem      * Renumber the states appropriately.
   1629  1.1  lukem      */
   1630  1.1  lukem     for (i = eq = 0, sp1 = b->states.states; i < b->states.states_used;
   1631  1.1  lukem          sp1++, i++)
   1632  1.1  lukem       sp1->id = (sp1->id == i) ? eq++ : b->states.states[sp1->id].id;
   1633  1.1  lukem }
   1634  1.1  lukem 
   1635  1.1  lukem /*************************************************************************
   1636  1.1  lukem  *
   1637  1.1  lukem  * API.
   1638  1.1  lukem  *
   1639  1.1  lukem  *************************************************************************/
   1640  1.1  lukem 
   1641  1.1  lukem ure_buffer_t
   1642  1.1  lukem ure_buffer_create(void)
   1643  1.1  lukem {
   1644  1.1  lukem     ure_buffer_t b;
   1645  1.1  lukem 
   1646  1.1  lukem     b = (ure_buffer_t) calloc(1, sizeof(_ure_buffer_t));
   1647  1.1  lukem 
   1648  1.1  lukem     return b;
   1649  1.1  lukem }
   1650  1.1  lukem 
   1651  1.1  lukem void
   1652  1.1  lukem ure_buffer_free(ure_buffer_t buf)
   1653  1.1  lukem {
   1654  1.1  lukem     unsigned long i;
   1655  1.1  lukem 
   1656  1.1  lukem     if (buf == 0)
   1657  1.1  lukem       return;
   1658  1.1  lukem 
   1659  1.1  lukem     if (buf->stack.slist_size > 0)
   1660  1.1  lukem       free((char *) buf->stack.slist);
   1661  1.1  lukem 
   1662  1.1  lukem     if (buf->expr_size > 0)
   1663  1.1  lukem       free((char *) buf->expr);
   1664  1.1  lukem 
   1665  1.1  lukem     for (i = 0; i < buf->symtab_size; i++) {
   1666  1.1  lukem         if (buf->symtab[i].states.slist_size > 0)
   1667  1.1  lukem           free((char *) buf->symtab[i].states.slist);
   1668  1.1  lukem     }
   1669  1.1  lukem 
   1670  1.1  lukem     if (buf->symtab_size > 0)
   1671  1.1  lukem       free((char *) buf->symtab);
   1672  1.1  lukem 
   1673  1.1  lukem     for (i = 0; i < buf->states.states_size; i++) {
   1674  1.1  lukem         if (buf->states.states[i].trans_size > 0)
   1675  1.1  lukem           free((char *) buf->states.states[i].trans);
   1676  1.1  lukem         if (buf->states.states[i].st.slist_size > 0)
   1677  1.1  lukem           free((char *) buf->states.states[i].st.slist);
   1678  1.1  lukem     }
   1679  1.1  lukem 
   1680  1.1  lukem     if (buf->states.states_size > 0)
   1681  1.1  lukem       free((char *) buf->states.states);
   1682  1.1  lukem 
   1683  1.1  lukem     if (buf->equiv_size > 0)
   1684  1.1  lukem       free((char *) buf->equiv);
   1685  1.1  lukem 
   1686  1.1  lukem     free((char *) buf);
   1687  1.1  lukem }
   1688  1.1  lukem 
   1689  1.1  lukem ure_dfa_t
   1690  1.1  lukem ure_compile(ucs2_t *re, unsigned long relen, int casefold, ure_buffer_t buf)
   1691  1.1  lukem {
   1692  1.1  lukem     ucs2_t i, j, state;
   1693  1.1  lukem     _ure_state_t *sp;
   1694  1.1  lukem     _ure_dstate_t *dsp;
   1695  1.1  lukem     _ure_trans_t *tp;
   1696  1.1  lukem     ure_dfa_t dfa;
   1697  1.1  lukem 
   1698  1.1  lukem     if (re == 0 || *re == 0 || relen == 0 || buf == 0)
   1699  1.1  lukem       return 0;
   1700  1.1  lukem 
   1701  1.1  lukem     /*
   1702  1.1  lukem      * Reset the various fields of the compilation buffer.  Default the flags
   1703  1.1  lukem      * to indicate the presense of the "^$" pattern.  If any other pattern
   1704  1.1  lukem      * occurs, then this flag will be removed.  This is done to catch this
   1705  1.1  lukem      * special pattern and handle it specially when matching.
   1706  1.1  lukem      */
   1707  1.1  lukem     buf->flags = _URE_DFA_BLANKLINE | ((casefold) ? _URE_DFA_CASEFOLD : 0);
   1708  1.1  lukem     buf->reducing = 0;
   1709  1.1  lukem     buf->stack.slist_used = 0;
   1710  1.1  lukem     buf->expr_used = 0;
   1711  1.1  lukem 
   1712  1.1  lukem     for (i = 0; i < buf->symtab_used; i++)
   1713  1.1  lukem       buf->symtab[i].states.slist_used = 0;
   1714  1.1  lukem     buf->symtab_used = 0;
   1715  1.1  lukem 
   1716  1.1  lukem     for (i = 0; i < buf->states.states_used; i++) {
   1717  1.1  lukem         buf->states.states[i].st.slist_used = 0;
   1718  1.1  lukem         buf->states.states[i].trans_used = 0;
   1719  1.1  lukem     }
   1720  1.1  lukem     buf->states.states_used = 0;
   1721  1.1  lukem 
   1722  1.1  lukem     /*
   1723  1.1  lukem      * Construct the NFA.  If this stage returns a 0, then an error occured or
   1724  1.1  lukem      * an empty expression was passed.
   1725  1.1  lukem      */
   1726  1.1  lukem     if ((state = _ure_re2nfa(re, relen, buf)) == _URE_NOOP)
   1727  1.1  lukem       return 0;
   1728  1.1  lukem 
   1729  1.1  lukem     /*
   1730  1.1  lukem      * Do the expression reduction to get the initial DFA.
   1731  1.1  lukem      */
   1732  1.1  lukem     _ure_reduce(state, buf);
   1733  1.1  lukem 
   1734  1.1  lukem     /*
   1735  1.1  lukem      * Merge all the equivalent DFA states.
   1736  1.1  lukem      */
   1737  1.1  lukem     _ure_merge_equiv(buf);
   1738  1.1  lukem 
   1739  1.1  lukem     /*
   1740  1.1  lukem      * Construct the minimal DFA.
   1741  1.1  lukem      */
   1742  1.1  lukem     dfa = (ure_dfa_t) malloc(sizeof(_ure_dfa_t));
   1743  1.1  lukem     (void) memset((char *) dfa, '\0', sizeof(_ure_dfa_t));
   1744  1.1  lukem 
   1745  1.1  lukem     dfa->flags = buf->flags & (_URE_DFA_CASEFOLD|_URE_DFA_BLANKLINE);
   1746  1.1  lukem 
   1747  1.1  lukem     /*
   1748  1.1  lukem      * Free up the NFA state groups and transfer the symbols from the buffer
   1749  1.1  lukem      * to the DFA.
   1750  1.1  lukem      */
   1751  1.1  lukem     for (i = 0; i < buf->symtab_size; i++) {
   1752  1.1  lukem         if (buf->symtab[i].states.slist_size > 0)
   1753  1.1  lukem           free((char *) buf->symtab[i].states.slist);
   1754  1.1  lukem     }
   1755  1.1  lukem     dfa->syms = buf->symtab;
   1756  1.1  lukem     dfa->nsyms = buf->symtab_used;
   1757  1.1  lukem 
   1758  1.1  lukem     buf->symtab_used = buf->symtab_size = 0;
   1759  1.1  lukem 
   1760  1.1  lukem     /*
   1761  1.1  lukem      * Collect the total number of states and transitions needed for the DFA.
   1762  1.1  lukem      */
   1763  1.1  lukem     for (i = state = 0, sp = buf->states.states; i < buf->states.states_used;
   1764  1.1  lukem          i++, sp++) {
   1765  1.1  lukem         if (sp->id == state) {
   1766  1.1  lukem             dfa->nstates++;
   1767  1.1  lukem             dfa->ntrans += sp->trans_used;
   1768  1.1  lukem             state++;
   1769  1.1  lukem         }
   1770  1.1  lukem     }
   1771  1.1  lukem 
   1772  1.1  lukem     /*
   1773  1.1  lukem      * Allocate enough space for the states and transitions.
   1774  1.1  lukem      */
   1775  1.1  lukem     dfa->states = (_ure_dstate_t *) malloc(sizeof(_ure_dstate_t) *
   1776  1.1  lukem                                            dfa->nstates);
   1777  1.1  lukem     dfa->trans = (_ure_trans_t *) malloc(sizeof(_ure_trans_t) * dfa->ntrans);
   1778  1.1  lukem 
   1779  1.1  lukem     /*
   1780  1.1  lukem      * Actually transfer the DFA states from the buffer.
   1781  1.1  lukem      */
   1782  1.1  lukem     dsp = dfa->states;
   1783  1.1  lukem     tp = dfa->trans;
   1784  1.1  lukem     for (i = state = 0, sp = buf->states.states; i < buf->states.states_used;
   1785  1.1  lukem          i++, sp++) {
   1786  1.1  lukem         if (sp->id == state) {
   1787  1.1  lukem             dsp->trans = tp;
   1788  1.1  lukem             dsp->ntrans = sp->trans_used;
   1789  1.1  lukem             dsp->accepting = sp->accepting;
   1790  1.1  lukem 
   1791  1.1  lukem             /*
   1792  1.1  lukem              * Add the transitions for the state.
   1793  1.1  lukem              */
   1794  1.1  lukem             for (j = 0; j < dsp->ntrans; j++, tp++) {
   1795  1.1  lukem                 tp->symbol = sp->trans[j].lhs;
   1796  1.1  lukem                 tp->next_state = buf->states.states[sp->trans[j].rhs].id;
   1797  1.1  lukem             }
   1798  1.1  lukem 
   1799  1.1  lukem             dsp++;
   1800  1.1  lukem             state++;
   1801  1.1  lukem         }
   1802  1.1  lukem     }
   1803  1.1  lukem 
   1804  1.1  lukem     return dfa;
   1805  1.1  lukem }
   1806  1.1  lukem 
   1807  1.1  lukem void
   1808  1.1  lukem ure_dfa_free(ure_dfa_t dfa)
   1809  1.1  lukem {
   1810  1.1  lukem     ucs2_t i;
   1811  1.1  lukem 
   1812  1.1  lukem     if (dfa == 0)
   1813  1.1  lukem       return;
   1814  1.1  lukem 
   1815  1.1  lukem     for (i = 0; i < dfa->nsyms; i++) {
   1816  1.1  lukem         if ((dfa->syms[i].type == _URE_CCLASS ||
   1817  1.1  lukem              dfa->syms[i].type == _URE_NCCLASS) &&
   1818  1.1  lukem             dfa->syms[i].sym.ccl.ranges_size > 0)
   1819  1.1  lukem           free((char *) dfa->syms[i].sym.ccl.ranges);
   1820  1.1  lukem     }
   1821  1.1  lukem     if (dfa->nsyms > 0)
   1822  1.1  lukem       free((char *) dfa->syms);
   1823  1.1  lukem 
   1824  1.1  lukem     if (dfa->nstates > 0)
   1825  1.1  lukem       free((char *) dfa->states);
   1826  1.1  lukem     if (dfa->ntrans > 0)
   1827  1.1  lukem       free((char *) dfa->trans);
   1828  1.1  lukem     free((char *) dfa);
   1829  1.1  lukem }
   1830  1.1  lukem 
   1831  1.1  lukem void
   1832  1.1  lukem ure_write_dfa(ure_dfa_t dfa, FILE *out)
   1833  1.1  lukem {
   1834  1.1  lukem     ucs2_t i, j, k, h, l;
   1835  1.1  lukem     _ure_dstate_t *sp;
   1836  1.1  lukem     _ure_symtab_t *sym;
   1837  1.1  lukem     _ure_range_t *rp;
   1838  1.1  lukem 
   1839  1.1  lukem     if (dfa == 0 || out == 0)
   1840  1.1  lukem       return;
   1841  1.1  lukem 
   1842  1.1  lukem     /*
   1843  1.1  lukem      * Write all the different character classes.
   1844  1.1  lukem      */
   1845  1.1  lukem     for (i = 0, sym = dfa->syms; i < dfa->nsyms; i++, sym++) {
   1846  1.1  lukem         if (sym->type == _URE_CCLASS || sym->type == _URE_NCCLASS) {
   1847  1.1  lukem             fprintf(out, "C%hd = ", sym->id);
   1848  1.1  lukem             if (sym->sym.ccl.ranges_used > 0) {
   1849  1.1  lukem                 putc('[', out);
   1850  1.1  lukem                 if (sym->type == _URE_NCCLASS)
   1851  1.1  lukem                   putc('^', out);
   1852  1.1  lukem             }
   1853  1.1  lukem             if (sym->props != 0) {
   1854  1.1  lukem                 if (sym->type == _URE_NCCLASS)
   1855  1.1  lukem                   fprintf(out, "\\P");
   1856  1.1  lukem                 else
   1857  1.1  lukem                   fprintf(out, "\\p");
   1858  1.1  lukem                 for (k = h = 0; k < 32; k++) {
   1859  1.1  lukem                     if (sym->props & (1 << k)) {
   1860  1.1  lukem                         if (h != 0)
   1861  1.1  lukem                           putc(',', out);
   1862  1.1  lukem                         fprintf(out, "%hd", k + 1);
   1863  1.1  lukem                         h = 1;
   1864  1.1  lukem                     }
   1865  1.1  lukem                 }
   1866  1.1  lukem             }
   1867  1.1  lukem             /*
   1868  1.1  lukem              * Dump the ranges.
   1869  1.1  lukem              */
   1870  1.1  lukem             for (k = 0, rp = sym->sym.ccl.ranges;
   1871  1.1  lukem                  k < sym->sym.ccl.ranges_used; k++, rp++) {
   1872  1.1  lukem                 /*
   1873  1.1  lukem                  * Check for UTF16 characters.
   1874  1.1  lukem                  */
   1875  1.1  lukem                 if (0x10000 <= rp->min_code &&
   1876  1.1  lukem                     rp->min_code <= 0x10ffff) {
   1877  1.1  lukem                     h = (ucs2_t) (((rp->min_code - 0x10000) >> 10) + 0xd800);
   1878  1.1  lukem                     l = (ucs2_t) (((rp->min_code - 0x10000) & 1023) + 0xdc00);
   1879  1.1  lukem                     fprintf(out, "\\x%04hX\\x%04hX", h, l);
   1880  1.1  lukem                 } else
   1881  1.1  lukem                   fprintf(out, "\\x%04lX", rp->min_code & 0xffff);
   1882  1.1  lukem                 if (rp->max_code != rp->min_code) {
   1883  1.1  lukem                     putc('-', out);
   1884  1.1  lukem                     if (rp->max_code >= 0x10000 &&
   1885  1.1  lukem                         rp->max_code <= 0x10ffff) {
   1886  1.1  lukem                         h = (ucs2_t) (((rp->max_code - 0x10000) >> 10) + 0xd800);
   1887  1.1  lukem                         l = (ucs2_t) (((rp->max_code - 0x10000) & 1023) + 0xdc00);
   1888  1.1  lukem                         fprintf(out, "\\x%04hX\\x%04hX", h, l);
   1889  1.1  lukem                     } else
   1890  1.1  lukem                       fprintf(out, "\\x%04lX", rp->max_code & 0xffff);
   1891  1.1  lukem                 }
   1892  1.1  lukem             }
   1893  1.1  lukem             if (sym->sym.ccl.ranges_used > 0)
   1894  1.1  lukem               putc(']', out);
   1895  1.1  lukem             putc('\n', out);
   1896  1.1  lukem         }
   1897  1.1  lukem     }
   1898  1.1  lukem 
   1899  1.1  lukem     for (i = 0, sp = dfa->states; i < dfa->nstates; i++, sp++) {
   1900  1.1  lukem         fprintf(out, "S%hd = ", i);
   1901  1.1  lukem         if (sp->accepting) {
   1902  1.1  lukem             fprintf(out, "1 ");
   1903  1.1  lukem             if (sp->ntrans)
   1904  1.1  lukem               fprintf(out, "| ");
   1905  1.1  lukem         }
   1906  1.1  lukem         for (j = 0; j < sp->ntrans; j++) {
   1907  1.1  lukem             if (j > 0)
   1908  1.1  lukem               fprintf(out, "| ");
   1909  1.1  lukem 
   1910  1.1  lukem             sym = dfa->syms + sp->trans[j].symbol;
   1911  1.1  lukem             switch (sym->type) {
   1912  1.1  lukem               case _URE_CHAR:
   1913  1.1  lukem                 if (0x10000 <= sym->sym.chr && sym->sym.chr <= 0x10ffff) {
   1914  1.1  lukem                     /*
   1915  1.1  lukem                      * Take care of UTF16 characters.
   1916  1.1  lukem                      */
   1917  1.1  lukem                     h = (ucs2_t) (((sym->sym.chr - 0x10000) >> 10) + 0xd800);
   1918  1.1  lukem                     l = (ucs2_t) (((sym->sym.chr - 0x10000) & 1023) + 0xdc00);
   1919  1.1  lukem                     fprintf(out, "\\x%04hX\\x%04hX ", h, l);
   1920  1.1  lukem                 } else
   1921  1.1  lukem                   fprintf(out, "\\x%04lX ", sym->sym.chr & 0xffff);
   1922  1.1  lukem                 break;
   1923  1.1  lukem               case _URE_ANY_CHAR:
   1924  1.1  lukem                 fprintf(out, "<any> ");
   1925  1.1  lukem                 break;
   1926  1.1  lukem               case _URE_BOL_ANCHOR:
   1927  1.1  lukem                 fprintf(out, "<bol-anchor> ");
   1928  1.1  lukem                 break;
   1929  1.1  lukem               case _URE_EOL_ANCHOR:
   1930  1.1  lukem                 fprintf(out, "<eol-anchor> ");
   1931  1.1  lukem                 break;
   1932  1.1  lukem               case _URE_CCLASS:
   1933  1.1  lukem               case _URE_NCCLASS:
   1934  1.1  lukem                 fprintf(out, "[C%hd] ", sym->id);
   1935  1.1  lukem                 break;
   1936  1.1  lukem             }
   1937  1.1  lukem             fprintf(out, "S%hd", sp->trans[j].next_state);
   1938  1.1  lukem             if (j + 1 < sp->ntrans)
   1939  1.1  lukem               putc(' ', out);
   1940  1.1  lukem         }
   1941  1.1  lukem         putc('\n', out);
   1942  1.1  lukem     }
   1943  1.1  lukem }
   1944  1.1  lukem 
   1945  1.1  lukem #define _ure_issep(cc) ((cc) == '\n' || (cc) == '\r' || (cc) == 0x2028 ||\
   1946  1.1  lukem                         (cc) == 0x2029)
   1947  1.1  lukem 
   1948  1.1  lukem int
   1949  1.1  lukem ure_exec(ure_dfa_t dfa, int flags, ucs2_t *text, unsigned long textlen,
   1950  1.1  lukem          unsigned long *match_start, unsigned long *match_end)
   1951  1.1  lukem {
   1952  1.1  lukem     int i, j, matched, found, skip;
   1953  1.1  lukem     unsigned long ms, me;
   1954  1.1  lukem     ucs4_t c;
   1955  1.1  lukem     ucs2_t *sp, *ep, *lp;
   1956  1.1  lukem     _ure_dstate_t *stp;
   1957  1.1  lukem     _ure_symtab_t *sym;
   1958  1.1  lukem     _ure_range_t *rp;
   1959  1.1  lukem 
   1960  1.1  lukem     if (dfa == 0 || text == 0)
   1961  1.1  lukem       return 0;
   1962  1.1  lukem 
   1963  1.1  lukem     /*
   1964  1.1  lukem      * Handle the special case of an empty string matching the "^$" pattern.
   1965  1.1  lukem      */
   1966  1.1  lukem     if (textlen == 0 && (dfa->flags & _URE_DFA_BLANKLINE)) {
   1967  1.1  lukem         *match_start = *match_end = 0;
   1968  1.1  lukem         return 1;
   1969  1.1  lukem     }
   1970  1.1  lukem 
   1971  1.1  lukem     sp = text;
   1972  1.1  lukem     ep = sp + textlen;
   1973  1.1  lukem 
   1974  1.1  lukem     ms = me = ~0;
   1975  1.1  lukem 
   1976  1.1  lukem     stp = dfa->states;
   1977  1.1  lukem 
   1978  1.1  lukem     for (found = skip = 0; found == 0 && sp < ep; ) {
   1979  1.1  lukem         lp = sp;
   1980  1.1  lukem         c = *sp++;
   1981  1.1  lukem 
   1982  1.1  lukem         /*
   1983  1.1  lukem          * Check to see if this is a high surrogate that should be
   1984  1.1  lukem          * combined with a following low surrogate.
   1985  1.1  lukem          */
   1986  1.1  lukem         if (sp < ep && 0xd800 <= c && c <= 0xdbff &&
   1987  1.1  lukem             0xdc00 <= *sp && *sp <= 0xdfff)
   1988  1.1  lukem           c = 0x10000 + (((c & 0x03ff) << 10) | (*sp++ & 0x03ff));
   1989  1.1  lukem 
   1990  1.1  lukem         /*
   1991  1.1  lukem          * Determine if the character is non-spacing and should be skipped.
   1992  1.1  lukem          */
   1993  1.1  lukem         if (_ure_matches_properties(_URE_NONSPACING, c) &&
   1994  1.1  lukem             (flags & URE_IGNORE_NONSPACING)) {
   1995  1.1  lukem             sp++;
   1996  1.1  lukem             continue;
   1997  1.1  lukem         }
   1998  1.1  lukem 
   1999  1.1  lukem         if (dfa->flags & _URE_DFA_CASEFOLD)
   2000  1.1  lukem           c = _ure_tolower(c);
   2001  1.1  lukem 
   2002  1.1  lukem         /*
   2003  1.1  lukem          * See if one of the transitions matches.
   2004  1.1  lukem          */
   2005  1.1  lukem         for (i = 0, matched = 0; matched == 0 && i < stp->ntrans; i++) {
   2006  1.1  lukem             sym = dfa->syms + stp->trans[i].symbol;
   2007  1.1  lukem             switch (sym->type) {
   2008  1.1  lukem               case _URE_ANY_CHAR:
   2009  1.1  lukem                 if ((flags & URE_DOT_MATCHES_SEPARATORS) ||
   2010  1.1  lukem                     !_ure_issep(c))
   2011  1.1  lukem                   matched = 1;
   2012  1.1  lukem                 break;
   2013  1.1  lukem               case _URE_CHAR:
   2014  1.1  lukem                 if (c == sym->sym.chr)
   2015  1.1  lukem                   matched = 1;
   2016  1.1  lukem                 break;
   2017  1.1  lukem               case _URE_BOL_ANCHOR:
   2018  1.1  lukem                 if (lp == text) {
   2019  1.1  lukem                     sp = lp;
   2020  1.1  lukem                     matched = 1;
   2021  1.1  lukem                 } else if (_ure_issep(c)) {
   2022  1.1  lukem                     if (c == '\r' && sp < ep && *sp == '\n')
   2023  1.1  lukem                       sp++;
   2024  1.1  lukem                     lp = sp;
   2025  1.1  lukem                     matched = 1;
   2026  1.1  lukem                 }
   2027  1.1  lukem                 break;
   2028  1.1  lukem               case _URE_EOL_ANCHOR:
   2029  1.1  lukem                 if (_ure_issep(c)) {
   2030  1.1  lukem                     /*
   2031  1.1  lukem                      * Put the pointer back before the separator so the match
   2032  1.1  lukem                      * end position will be correct.  This case will also
   2033  1.1  lukem                      * cause the `sp' pointer to be advanced over the current
   2034  1.1  lukem                      * separator once the match end point has been recorded.
   2035  1.1  lukem                      */
   2036  1.1  lukem                     sp = lp;
   2037  1.1  lukem                     matched = 1;
   2038  1.1  lukem                 }
   2039  1.1  lukem                 break;
   2040  1.1  lukem               case _URE_CCLASS:
   2041  1.1  lukem               case _URE_NCCLASS:
   2042  1.1  lukem                 if (sym->props != 0)
   2043  1.1  lukem                   matched = _ure_matches_properties(sym->props, c);
   2044  1.1  lukem                 for (j = 0, rp = sym->sym.ccl.ranges;
   2045  1.1  lukem                      j < sym->sym.ccl.ranges_used; j++, rp++) {
   2046  1.1  lukem                     if (rp->min_code <= c && c <= rp->max_code)
   2047  1.1  lukem                       matched = 1;
   2048  1.1  lukem                 }
   2049  1.1  lukem                 if (sym->type == _URE_NCCLASS)
   2050  1.1  lukem                   matched = !matched;
   2051  1.1  lukem                 break;
   2052  1.1  lukem             }
   2053  1.1  lukem 
   2054  1.1  lukem             if (matched) {
   2055  1.1  lukem                 if (ms == ~0UL)
   2056  1.1  lukem                   ms = lp - text;
   2057  1.1  lukem                 else
   2058  1.1  lukem                   me = sp - text;
   2059  1.1  lukem                 stp = dfa->states + stp->trans[i].next_state;
   2060  1.1  lukem 
   2061  1.1  lukem                 /*
   2062  1.1  lukem                  * If the match was an EOL anchor, adjust the pointer past the
   2063  1.1  lukem                  * separator that caused the match.  The correct match
   2064  1.1  lukem                  * position has been recorded already.
   2065  1.1  lukem                  */
   2066  1.1  lukem                 if (sym->type == _URE_EOL_ANCHOR) {
   2067  1.1  lukem                     /*
   2068  1.1  lukem                      * Skip the character that caused the match.
   2069  1.1  lukem                      */
   2070  1.1  lukem                     sp++;
   2071  1.1  lukem 
   2072  1.1  lukem                     /*
   2073  1.1  lukem                      * Handle the infamous CRLF situation.
   2074  1.1  lukem                      */
   2075  1.1  lukem                     if (sp < ep && c == '\r' && *sp == '\n')
   2076  1.1  lukem                       sp++;
   2077  1.1  lukem                 }
   2078  1.1  lukem             }
   2079  1.1  lukem         }
   2080  1.1  lukem 
   2081  1.1  lukem         if (matched == 0) {
   2082  1.1  lukem             if (stp->accepting == 0) {
   2083  1.1  lukem                 /*
   2084  1.1  lukem                  * If the last state was not accepting, then reset
   2085  1.1  lukem                  * and start over.
   2086  1.1  lukem                  */
   2087  1.1  lukem                 stp = dfa->states;
   2088  1.1  lukem                 ms = me = ~0;
   2089  1.1  lukem             } else
   2090  1.1  lukem               /*
   2091  1.1  lukem                * The last state was accepting, so terminate the matching
   2092  1.1  lukem                * loop to avoid more work.
   2093  1.1  lukem                */
   2094  1.1  lukem               found = 1;
   2095  1.1  lukem         } else if (sp == ep) {
   2096  1.1  lukem             if (!stp->accepting) {
   2097  1.1  lukem                 /*
   2098  1.1  lukem                  * This ugly hack is to make sure the end-of-line anchors
   2099  1.1  lukem                  * match when the source text hits the end.  This is only done
   2100  1.1  lukem                  * if the last subexpression matches.
   2101  1.1  lukem                  */
   2102  1.1  lukem                 for (i = 0; found == 0 && i < stp->ntrans; i++) {
   2103  1.1  lukem                     sym = dfa->syms + stp->trans[i].symbol;
   2104  1.1  lukem                     if (sym->type ==_URE_EOL_ANCHOR) {
   2105  1.1  lukem                         stp = dfa->states + stp->trans[i].next_state;
   2106  1.1  lukem                         if (stp->accepting) {
   2107  1.1  lukem                             me = sp - text;
   2108  1.1  lukem                             found = 1;
   2109  1.1  lukem                         } else
   2110  1.1  lukem                           break;
   2111  1.1  lukem                     }
   2112  1.1  lukem                 }
   2113  1.1  lukem             } else {
   2114  1.1  lukem                 /*
   2115  1.1  lukem                  * Make sure any conditions that match all the way to the end
   2116  1.1  lukem                  * of the string match.
   2117  1.1  lukem                  */
   2118  1.1  lukem                 found = 1;
   2119  1.1  lukem                 me = sp - text;
   2120  1.1  lukem             }
   2121  1.1  lukem         }
   2122  1.1  lukem     }
   2123  1.1  lukem 
   2124  1.1  lukem     if (found == 0)
   2125  1.1  lukem       ms = me = ~0;
   2126  1.1  lukem 
   2127  1.1  lukem     *match_start = ms;
   2128  1.1  lukem     *match_end = me;
   2129  1.1  lukem 
   2130  1.1  lukem     return (ms != ~0UL) ? 1 : 0;
   2131  1.1  lukem }
   2132