Home | History | Annotate | Line # | Download | only in ure
ure.c revision 1.1.1.3.12.1
      1       1.1.1.2  lukem /*	$NetBSD: ure.c,v 1.1.1.3.12.1 2014/08/19 23:52:00 tls Exp $	*/
      2       1.1.1.2  lukem 
      3  1.1.1.3.12.1    tls /* $OpenLDAP$ */
      4           1.1  lukem /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
      5           1.1  lukem  *
      6  1.1.1.3.12.1    tls  * Copyright 1998-2014 The OpenLDAP Foundation.
      7           1.1  lukem  * All rights reserved.
      8           1.1  lukem  *
      9           1.1  lukem  * Redistribution and use in source and binary forms, with or without
     10           1.1  lukem  * modification, are permitted only as authorized by the OpenLDAP
     11           1.1  lukem  * Public License.
     12           1.1  lukem  *
     13           1.1  lukem  * A copy of this license is available in file LICENSE in the
     14           1.1  lukem  * top-level directory of the distribution or, alternatively, at
     15           1.1  lukem  * <http://www.OpenLDAP.org/license.html>.
     16           1.1  lukem  */
     17           1.1  lukem /* Copyright 1997, 1998, 1999 Computing Research Labs,
     18           1.1  lukem  * New Mexico State University
     19           1.1  lukem  *
     20           1.1  lukem  * Permission is hereby granted, free of charge, to any person obtaining a
     21           1.1  lukem  * copy of this software and associated documentation files (the "Software"),
     22           1.1  lukem  * to deal in the Software without restriction, including without limitation
     23           1.1  lukem  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
     24           1.1  lukem  * and/or sell copies of the Software, and to permit persons to whom the
     25           1.1  lukem  * Software is furnished to do so, subject to the following conditions:
     26           1.1  lukem  *
     27           1.1  lukem  * The above copyright notice and this permission notice shall be included in
     28           1.1  lukem  * all copies or substantial portions of the Software.
     29           1.1  lukem  *
     30           1.1  lukem  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     31           1.1  lukem  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     32           1.1  lukem  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     33           1.1  lukem  * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
     34           1.1  lukem  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
     35           1.1  lukem  * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
     36           1.1  lukem  * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
     37           1.1  lukem  */
     38  1.1.1.3.12.1    tls /* Id: ure.c,v 1.2 1999/09/21 15:47:43 mleisher Exp " */
     39           1.1  lukem 
     40           1.1  lukem #include "portable.h"
     41           1.1  lukem 
     42           1.1  lukem #include <ac/stdlib.h>
     43           1.1  lukem #include <ac/string.h>
     44           1.1  lukem #include <ac/unistd.h>
     45           1.1  lukem 
     46           1.1  lukem #include "ure.h"
     47           1.1  lukem 
     48           1.1  lukem /*
     49           1.1  lukem  * Flags used internally in the DFA.
     50           1.1  lukem  */
     51           1.1  lukem #define _URE_DFA_CASEFOLD  0x01
     52           1.1  lukem #define _URE_DFA_BLANKLINE 0x02
     53           1.1  lukem 
     54           1.1  lukem static unsigned long cclass_flags[] = {
     55           1.1  lukem     0,
     56           1.1  lukem     _URE_NONSPACING,
     57           1.1  lukem     _URE_COMBINING,
     58           1.1  lukem     _URE_NUMDIGIT,
     59           1.1  lukem     _URE_NUMOTHER,
     60           1.1  lukem     _URE_SPACESEP,
     61           1.1  lukem     _URE_LINESEP,
     62           1.1  lukem     _URE_PARASEP,
     63           1.1  lukem     _URE_CNTRL,
     64           1.1  lukem     _URE_PUA,
     65           1.1  lukem     _URE_UPPER,
     66           1.1  lukem     _URE_LOWER,
     67           1.1  lukem     _URE_TITLE,
     68           1.1  lukem     _URE_MODIFIER,
     69           1.1  lukem     _URE_OTHERLETTER,
     70           1.1  lukem     _URE_DASHPUNCT,
     71           1.1  lukem     _URE_OPENPUNCT,
     72           1.1  lukem     _URE_CLOSEPUNCT,
     73           1.1  lukem     _URE_OTHERPUNCT,
     74           1.1  lukem     _URE_MATHSYM,
     75           1.1  lukem     _URE_CURRENCYSYM,
     76           1.1  lukem     _URE_OTHERSYM,
     77           1.1  lukem     _URE_LTR,
     78           1.1  lukem     _URE_RTL,
     79           1.1  lukem     _URE_EURONUM,
     80           1.1  lukem     _URE_EURONUMSEP,
     81           1.1  lukem     _URE_EURONUMTERM,
     82           1.1  lukem     _URE_ARABNUM,
     83           1.1  lukem     _URE_COMMONSEP,
     84           1.1  lukem     _URE_BLOCKSEP,
     85           1.1  lukem     _URE_SEGMENTSEP,
     86           1.1  lukem     _URE_WHITESPACE,
     87           1.1  lukem     _URE_OTHERNEUT,
     88           1.1  lukem };
     89           1.1  lukem 
     90           1.1  lukem /*
     91           1.1  lukem  * Symbol types for the DFA.
     92           1.1  lukem  */
     93           1.1  lukem #define _URE_ANY_CHAR   1
     94           1.1  lukem #define _URE_CHAR       2
     95           1.1  lukem #define _URE_CCLASS     3
     96           1.1  lukem #define _URE_NCCLASS    4
     97           1.1  lukem #define _URE_BOL_ANCHOR 5
     98           1.1  lukem #define _URE_EOL_ANCHOR 6
     99           1.1  lukem 
    100           1.1  lukem /*
    101           1.1  lukem  * Op codes for converting the NFA to a DFA.
    102           1.1  lukem  */
    103           1.1  lukem #define _URE_SYMBOL     10
    104           1.1  lukem #define _URE_PAREN      11
    105           1.1  lukem #define _URE_QUEST      12
    106           1.1  lukem #define _URE_STAR       13
    107           1.1  lukem #define _URE_PLUS       14
    108           1.1  lukem #define _URE_ONE        15
    109           1.1  lukem #define _URE_AND        16
    110           1.1  lukem #define _URE_OR         17
    111           1.1  lukem 
    112           1.1  lukem #define _URE_NOOP       0xffff
    113           1.1  lukem 
    114           1.1  lukem #define _URE_REGSTART 0x8000
    115           1.1  lukem #define _URE_REGEND   0x4000
    116           1.1  lukem 
    117           1.1  lukem /*
    118           1.1  lukem  * Structure used to handle a compacted range of characters.
    119           1.1  lukem  */
    120           1.1  lukem typedef struct {
    121           1.1  lukem     ucs4_t min_code;
    122           1.1  lukem     ucs4_t max_code;
    123           1.1  lukem } _ure_range_t;
    124           1.1  lukem 
    125           1.1  lukem typedef struct {
    126           1.1  lukem     _ure_range_t *ranges;
    127           1.1  lukem     ucs2_t ranges_used;
    128           1.1  lukem     ucs2_t ranges_size;
    129           1.1  lukem } _ure_ccl_t;
    130           1.1  lukem 
    131           1.1  lukem typedef union {
    132           1.1  lukem     ucs4_t chr;
    133           1.1  lukem     _ure_ccl_t ccl;
    134           1.1  lukem } _ure_sym_t;
    135           1.1  lukem 
    136           1.1  lukem /*
    137           1.1  lukem  * This is a general element structure used for expressions and stack
    138           1.1  lukem  * elements.
    139           1.1  lukem  */
    140           1.1  lukem typedef struct {
    141           1.1  lukem     ucs2_t reg;
    142           1.1  lukem     ucs2_t onstack;
    143           1.1  lukem     ucs2_t type;
    144           1.1  lukem     ucs2_t lhs;
    145           1.1  lukem     ucs2_t rhs;
    146           1.1  lukem } _ure_elt_t;
    147           1.1  lukem 
    148           1.1  lukem /*
    149           1.1  lukem  * This is a structure used to track a list or a stack of states.
    150           1.1  lukem  */
    151           1.1  lukem typedef struct {
    152           1.1  lukem     ucs2_t *slist;
    153           1.1  lukem     ucs2_t slist_size;
    154           1.1  lukem     ucs2_t slist_used;
    155           1.1  lukem } _ure_stlist_t;
    156           1.1  lukem 
    157           1.1  lukem /*
    158           1.1  lukem  * Structure to track the list of unique states for a symbol
    159           1.1  lukem  * during reduction.
    160           1.1  lukem  */
    161           1.1  lukem typedef struct {
    162           1.1  lukem     ucs2_t id;
    163           1.1  lukem     ucs2_t type;
    164           1.1  lukem     unsigned long mods;
    165           1.1  lukem     unsigned long props;
    166           1.1  lukem     _ure_sym_t sym;
    167           1.1  lukem     _ure_stlist_t states;
    168           1.1  lukem } _ure_symtab_t;
    169           1.1  lukem 
    170           1.1  lukem /*
    171           1.1  lukem  * Structure to hold a single state.
    172           1.1  lukem  */
    173           1.1  lukem typedef struct {
    174           1.1  lukem     ucs2_t id;
    175           1.1  lukem     ucs2_t accepting;
    176           1.1  lukem     ucs2_t pad;
    177           1.1  lukem     _ure_stlist_t st;
    178           1.1  lukem     _ure_elt_t *trans;
    179           1.1  lukem     ucs2_t trans_size;
    180           1.1  lukem     ucs2_t trans_used;
    181           1.1  lukem } _ure_state_t;
    182           1.1  lukem 
    183           1.1  lukem /*
    184           1.1  lukem  * Structure used for keeping lists of states.
    185           1.1  lukem  */
    186           1.1  lukem typedef struct {
    187           1.1  lukem     _ure_state_t *states;
    188           1.1  lukem     ucs2_t states_size;
    189           1.1  lukem     ucs2_t states_used;
    190           1.1  lukem } _ure_statetable_t;
    191           1.1  lukem 
    192           1.1  lukem /*
    193           1.1  lukem  * Structure to track pairs of DFA states when equivalent states are
    194           1.1  lukem  * merged.
    195           1.1  lukem  */
    196           1.1  lukem typedef struct {
    197           1.1  lukem     ucs2_t l;
    198           1.1  lukem     ucs2_t r;
    199           1.1  lukem } _ure_equiv_t;
    200           1.1  lukem 
    201           1.1  lukem /*
    202           1.1  lukem  * Structure used for constructing the NFA and reducing to a minimal DFA.
    203           1.1  lukem  */
    204           1.1  lukem typedef struct _ure_buffer_t {
    205           1.1  lukem     int reducing;
    206           1.1  lukem     int error;
    207           1.1  lukem     unsigned long flags;
    208           1.1  lukem 
    209           1.1  lukem     _ure_stlist_t stack;
    210           1.1  lukem 
    211           1.1  lukem     /*
    212           1.1  lukem      * Table of unique symbols encountered.
    213           1.1  lukem      */
    214           1.1  lukem     _ure_symtab_t *symtab;
    215           1.1  lukem     ucs2_t symtab_size;
    216           1.1  lukem     ucs2_t symtab_used;
    217           1.1  lukem 
    218           1.1  lukem     /*
    219           1.1  lukem      * Tracks the unique expressions generated for the NFA and when the NFA is
    220           1.1  lukem      * reduced.
    221           1.1  lukem      */
    222           1.1  lukem     _ure_elt_t *expr;
    223           1.1  lukem     ucs2_t expr_used;
    224           1.1  lukem     ucs2_t expr_size;
    225           1.1  lukem 
    226           1.1  lukem     /*
    227           1.1  lukem      * The reduced table of unique groups of NFA states.
    228           1.1  lukem      */
    229           1.1  lukem     _ure_statetable_t states;
    230           1.1  lukem 
    231           1.1  lukem     /*
    232           1.1  lukem      * Tracks states when equivalent states are merged.
    233           1.1  lukem      */
    234           1.1  lukem     _ure_equiv_t *equiv;
    235           1.1  lukem     ucs2_t equiv_used;
    236           1.1  lukem     ucs2_t equiv_size;
    237           1.1  lukem } _ure_buffer_t;
    238           1.1  lukem 
    239           1.1  lukem typedef struct {
    240           1.1  lukem     ucs2_t symbol;
    241           1.1  lukem     ucs2_t next_state;
    242           1.1  lukem } _ure_trans_t;
    243           1.1  lukem 
    244           1.1  lukem typedef struct {
    245           1.1  lukem     ucs2_t accepting;
    246           1.1  lukem     ucs2_t ntrans;
    247           1.1  lukem     _ure_trans_t *trans;
    248           1.1  lukem } _ure_dstate_t;
    249           1.1  lukem 
    250           1.1  lukem typedef struct _ure_dfa_t {
    251           1.1  lukem     unsigned long flags;
    252           1.1  lukem 
    253           1.1  lukem     _ure_symtab_t *syms;
    254           1.1  lukem     ucs2_t nsyms;
    255           1.1  lukem 
    256           1.1  lukem     _ure_dstate_t *states;
    257           1.1  lukem     ucs2_t nstates;
    258           1.1  lukem 
    259           1.1  lukem     _ure_trans_t *trans;
    260           1.1  lukem     ucs2_t ntrans;
    261           1.1  lukem } _ure_dfa_t;
    262           1.1  lukem 
    263           1.1  lukem /*************************************************************************
    264           1.1  lukem  *
    265           1.1  lukem  * Functions.
    266           1.1  lukem  *
    267           1.1  lukem  *************************************************************************/
    268           1.1  lukem 
    269           1.1  lukem static void
    270           1.1  lukem _ure_memmove(char *dest, char *src, unsigned long bytes)
    271           1.1  lukem {
    272           1.1  lukem     long i, j;
    273           1.1  lukem 
    274           1.1  lukem     i = (long) bytes;
    275           1.1  lukem     j = i & 7;
    276           1.1  lukem     i = (i + 7) >> 3;
    277           1.1  lukem 
    278           1.1  lukem     /*
    279           1.1  lukem      * Do a memmove using Ye Olde Duff's Device for efficiency.
    280           1.1  lukem      */
    281           1.1  lukem     if (src < dest) {
    282           1.1  lukem         src += bytes;
    283           1.1  lukem         dest += bytes;
    284           1.1  lukem 
    285           1.1  lukem         switch (j) {
    286           1.1  lukem           case 0: do {
    287           1.1  lukem               *--dest = *--src;
    288           1.1  lukem             case 7: *--dest = *--src;
    289           1.1  lukem             case 6: *--dest = *--src;
    290           1.1  lukem             case 5: *--dest = *--src;
    291           1.1  lukem             case 4: *--dest = *--src;
    292           1.1  lukem             case 3: *--dest = *--src;
    293           1.1  lukem             case 2: *--dest = *--src;
    294           1.1  lukem             case 1: *--dest = *--src;
    295           1.1  lukem           } while (--i > 0);
    296           1.1  lukem         }
    297           1.1  lukem     } else if (src > dest) {
    298           1.1  lukem         switch (j) {
    299           1.1  lukem           case 0: do {
    300           1.1  lukem               *dest++ = *src++;
    301           1.1  lukem             case 7: *dest++ = *src++;
    302           1.1  lukem             case 6: *dest++ = *src++;
    303           1.1  lukem             case 5: *dest++ = *src++;
    304           1.1  lukem             case 4: *dest++ = *src++;
    305           1.1  lukem             case 3: *dest++ = *src++;
    306           1.1  lukem             case 2: *dest++ = *src++;
    307           1.1  lukem             case 1: *dest++ = *src++;
    308           1.1  lukem           } while (--i > 0);
    309           1.1  lukem         }
    310           1.1  lukem     }
    311           1.1  lukem }
    312           1.1  lukem 
    313           1.1  lukem static void
    314           1.1  lukem _ure_push(ucs2_t v, _ure_buffer_t *b)
    315           1.1  lukem {
    316           1.1  lukem     _ure_stlist_t *s;
    317           1.1  lukem 
    318           1.1  lukem     if (b == 0)
    319           1.1  lukem       return;
    320           1.1  lukem 
    321           1.1  lukem     /*
    322           1.1  lukem      * If the `reducing' parameter is non-zero, check to see if the value
    323           1.1  lukem      * passed is already on the stack.
    324           1.1  lukem      */
    325           1.1  lukem     if (b->reducing != 0 && b->expr[v].onstack != 0)
    326           1.1  lukem       return;
    327           1.1  lukem 
    328           1.1  lukem     s = &b->stack;
    329           1.1  lukem     if (s->slist_used == s->slist_size) {
    330           1.1  lukem         if (s->slist_size == 0)
    331           1.1  lukem           s->slist = (ucs2_t *) malloc(sizeof(ucs2_t) << 3);
    332           1.1  lukem         else
    333           1.1  lukem           s->slist = (ucs2_t *) realloc((char *) s->slist,
    334           1.1  lukem                                         sizeof(ucs2_t) * (s->slist_size + 8));
    335           1.1  lukem         s->slist_size += 8;
    336           1.1  lukem     }
    337           1.1  lukem     s->slist[s->slist_used++] = v;
    338           1.1  lukem 
    339           1.1  lukem     /*
    340           1.1  lukem      * If the `reducing' parameter is non-zero, flag the element as being on
    341           1.1  lukem      * the stack.
    342           1.1  lukem      */
    343           1.1  lukem     if (b->reducing != 0)
    344           1.1  lukem       b->expr[v].onstack = 1;
    345           1.1  lukem }
    346           1.1  lukem 
    347           1.1  lukem static ucs2_t
    348           1.1  lukem _ure_peek(_ure_buffer_t *b)
    349           1.1  lukem {
    350           1.1  lukem     if (b == 0 || b->stack.slist_used == 0)
    351           1.1  lukem       return _URE_NOOP;
    352           1.1  lukem 
    353           1.1  lukem     return b->stack.slist[b->stack.slist_used - 1];
    354           1.1  lukem }
    355           1.1  lukem 
    356           1.1  lukem static ucs2_t
    357           1.1  lukem _ure_pop(_ure_buffer_t *b)
    358           1.1  lukem {
    359           1.1  lukem     ucs2_t v;
    360           1.1  lukem 
    361           1.1  lukem     if (b == 0 || b->stack.slist_used == 0)
    362           1.1  lukem       return _URE_NOOP;
    363           1.1  lukem 
    364           1.1  lukem     v = b->stack.slist[--b->stack.slist_used];
    365           1.1  lukem     if (b->reducing)
    366           1.1  lukem       b->expr[v].onstack = 0;
    367           1.1  lukem 
    368           1.1  lukem     return v;
    369           1.1  lukem }
    370           1.1  lukem 
    371           1.1  lukem /*************************************************************************
    372           1.1  lukem  *
    373           1.1  lukem  * Start symbol parse functions.
    374           1.1  lukem  *
    375           1.1  lukem  *************************************************************************/
    376           1.1  lukem 
    377           1.1  lukem /*
    378           1.1  lukem  * Parse a comma-separated list of integers that represent character
    379           1.1  lukem  * properties.  Combine them into a mask that is returned in the `mask'
    380           1.1  lukem  * variable, and return the number of characters consumed.
    381           1.1  lukem  */
    382           1.1  lukem static unsigned long
    383           1.1  lukem _ure_prop_list(ucs2_t *pp, unsigned long limit, unsigned long *mask,
    384           1.1  lukem                _ure_buffer_t *b)
    385           1.1  lukem {
    386           1.1  lukem     unsigned long n, m;
    387           1.1  lukem     ucs2_t *sp, *ep;
    388           1.1  lukem 
    389           1.1  lukem     sp = pp;
    390           1.1  lukem     ep = sp + limit;
    391           1.1  lukem 
    392           1.1  lukem     for (m = n = 0; b->error == _URE_OK && sp < ep; sp++) {
    393           1.1  lukem         if (*sp == ',') {
    394           1.1  lukem             /*
    395           1.1  lukem              * Encountered a comma, so select the next character property flag
    396           1.1  lukem              * and reset the number.
    397           1.1  lukem              */
    398           1.1  lukem             m |= cclass_flags[n];
    399           1.1  lukem             n = 0;
    400           1.1  lukem         } else if (*sp >= '0' && *sp <= '9')
    401           1.1  lukem           /*
    402           1.1  lukem            * Encountered a digit, so start or continue building the cardinal
    403           1.1  lukem            * that represents the character property flag.
    404           1.1  lukem            */
    405           1.1  lukem           n = (n * 10) + (*sp - '0');
    406           1.1  lukem         else
    407           1.1  lukem           /*
    408           1.1  lukem            * Encountered something that is not part of the property list.
    409           1.1  lukem            * Indicate that we are done.
    410           1.1  lukem            */
    411           1.1  lukem           break;
    412           1.1  lukem 
    413           1.1  lukem         /*
    414           1.1  lukem          * If a property number greater than 32 occurs, then there is a
    415           1.1  lukem          * problem.  Most likely a missing comma separator.
    416           1.1  lukem          */
    417           1.1  lukem         if (n > 32)
    418           1.1  lukem           b->error = _URE_INVALID_PROPERTY;
    419           1.1  lukem     }
    420           1.1  lukem 
    421           1.1  lukem     if (n != 0)
    422           1.1  lukem       m |= cclass_flags[n];
    423           1.1  lukem 
    424           1.1  lukem     /*
    425           1.1  lukem      * Set the mask that represents the group of character properties.
    426           1.1  lukem      */
    427           1.1  lukem     *mask = m;
    428           1.1  lukem 
    429           1.1  lukem     /*
    430           1.1  lukem      * Return the number of characters consumed.
    431           1.1  lukem      */
    432           1.1  lukem     return sp - pp;
    433           1.1  lukem }
    434           1.1  lukem 
    435           1.1  lukem /*
    436           1.1  lukem  * Collect a hex number with 1 to 4 digits and return the number
    437           1.1  lukem  * of characters used.
    438           1.1  lukem  */
    439           1.1  lukem static unsigned long
    440           1.1  lukem _ure_hex(ucs2_t *np, unsigned long limit, ucs4_t *n)
    441           1.1  lukem {
    442           1.1  lukem     ucs2_t i;
    443           1.1  lukem     ucs2_t *sp, *ep;
    444           1.1  lukem     ucs4_t nn;
    445           1.1  lukem 
    446           1.1  lukem     sp = np;
    447           1.1  lukem     ep = sp + limit;
    448           1.1  lukem 
    449           1.1  lukem     for (nn = 0, i = 0; i < 4 && sp < ep; i++, sp++) {
    450           1.1  lukem         if (*sp >= '0' && *sp <= '9')
    451           1.1  lukem           nn = (nn << 4) + (*sp - '0');
    452           1.1  lukem         else if (*sp >= 'A' && *sp <= 'F')
    453           1.1  lukem           nn = (nn << 4) + ((*sp - 'A') + 10);
    454           1.1  lukem         else if (*sp >= 'a' && *sp <= 'f')
    455           1.1  lukem           nn = (nn << 4) + ((*sp - 'a') + 10);
    456           1.1  lukem         else
    457           1.1  lukem           /*
    458           1.1  lukem            * Encountered something that is not a hex digit.
    459           1.1  lukem            */
    460           1.1  lukem           break;
    461           1.1  lukem     }
    462           1.1  lukem 
    463           1.1  lukem     /*
    464           1.1  lukem      * Assign the character code collected and return the number of
    465           1.1  lukem      * characters used.
    466           1.1  lukem      */
    467           1.1  lukem     *n = nn;
    468           1.1  lukem 
    469           1.1  lukem     return sp - np;
    470           1.1  lukem }
    471           1.1  lukem 
    472           1.1  lukem /*
    473           1.1  lukem  * Insert a range into a character class, removing duplicates and ordering
    474           1.1  lukem  * them in increasing range-start order.
    475           1.1  lukem  */
    476           1.1  lukem static void
    477           1.1  lukem _ure_add_range(_ure_ccl_t *ccl, _ure_range_t *r, _ure_buffer_t *b)
    478           1.1  lukem {
    479           1.1  lukem     ucs2_t i;
    480           1.1  lukem     ucs4_t tmp;
    481           1.1  lukem     _ure_range_t *rp;
    482           1.1  lukem 
    483           1.1  lukem     /*
    484           1.1  lukem      * If the `casefold' flag is set, then make sure both endpoints of the
    485           1.1  lukem      * range are converted to lower case.
    486           1.1  lukem      */
    487           1.1  lukem     if (b->flags & _URE_DFA_CASEFOLD) {
    488           1.1  lukem         r->min_code = _ure_tolower(r->min_code);
    489           1.1  lukem         r->max_code = _ure_tolower(r->max_code);
    490           1.1  lukem     }
    491           1.1  lukem 
    492           1.1  lukem     /*
    493           1.1  lukem      * Swap the range endpoints if they are not in increasing order.
    494           1.1  lukem      */
    495           1.1  lukem     if (r->min_code > r->max_code) {
    496           1.1  lukem         tmp = r->min_code;
    497           1.1  lukem         r->min_code = r->max_code;
    498           1.1  lukem         r->max_code = tmp;
    499           1.1  lukem     }
    500           1.1  lukem 
    501           1.1  lukem     for (i = 0, rp = ccl->ranges;
    502           1.1  lukem          i < ccl->ranges_used && r->min_code < rp->min_code; i++, rp++) ;
    503           1.1  lukem 
    504           1.1  lukem     /*
    505           1.1  lukem      * Check for a duplicate.
    506           1.1  lukem      */
    507           1.1  lukem     if (i < ccl->ranges_used &&
    508           1.1  lukem         r->min_code == rp->min_code && r->max_code == rp->max_code)
    509           1.1  lukem       return;
    510           1.1  lukem 
    511           1.1  lukem     if (ccl->ranges_used == ccl->ranges_size) {
    512           1.1  lukem         if (ccl->ranges_size == 0)
    513           1.1  lukem           ccl->ranges = (_ure_range_t *) malloc(sizeof(_ure_range_t) << 3);
    514           1.1  lukem         else
    515           1.1  lukem           ccl->ranges = (_ure_range_t *)
    516           1.1  lukem               realloc((char *) ccl->ranges,
    517           1.1  lukem                       sizeof(_ure_range_t) * (ccl->ranges_size + 8));
    518           1.1  lukem         ccl->ranges_size += 8;
    519           1.1  lukem     }
    520           1.1  lukem 
    521           1.1  lukem     rp = ccl->ranges + ccl->ranges_used;
    522           1.1  lukem 
    523           1.1  lukem     if (i < ccl->ranges_used)
    524           1.1  lukem       _ure_memmove((char *) (rp + 1), (char *) rp,
    525           1.1  lukem                    sizeof(_ure_range_t) * (ccl->ranges_used - i));
    526           1.1  lukem 
    527           1.1  lukem     ccl->ranges_used++;
    528           1.1  lukem     rp->min_code = r->min_code;
    529           1.1  lukem     rp->max_code = r->max_code;
    530           1.1  lukem }
    531           1.1  lukem 
    532           1.1  lukem #define _URE_ALPHA_MASK  (_URE_UPPER|_URE_LOWER|_URE_OTHERLETTER|\
    533           1.1  lukem _URE_MODIFIER|_URE_TITLE|_URE_NONSPACING|_URE_COMBINING)
    534           1.1  lukem #define _URE_ALNUM_MASK  (_URE_ALPHA_MASK|_URE_NUMDIGIT)
    535           1.1  lukem #define _URE_PUNCT_MASK  (_URE_DASHPUNCT|_URE_OPENPUNCT|_URE_CLOSEPUNCT|\
    536           1.1  lukem _URE_OTHERPUNCT)
    537           1.1  lukem #define _URE_GRAPH_MASK (_URE_NUMDIGIT|_URE_NUMOTHER|_URE_ALPHA_MASK|\
    538           1.1  lukem _URE_MATHSYM|_URE_CURRENCYSYM|_URE_OTHERSYM)
    539           1.1  lukem #define _URE_PRINT_MASK (_URE_GRAPH_MASK|_URE_SPACESEP)
    540           1.1  lukem #define _URE_SPACE_MASK  (_URE_SPACESEP|_URE_LINESEP|_URE_PARASEP)
    541           1.1  lukem 
    542           1.1  lukem typedef void (*_ure_cclsetup_t)(
    543           1.1  lukem     _ure_symtab_t *sym,
    544           1.1  lukem     unsigned long mask,
    545           1.1  lukem     _ure_buffer_t *b
    546           1.1  lukem );
    547           1.1  lukem 
    548           1.1  lukem typedef struct {
    549           1.1  lukem     ucs2_t key;
    550           1.1  lukem     unsigned long len;
    551           1.1  lukem     unsigned long next;
    552           1.1  lukem     _ure_cclsetup_t func;
    553           1.1  lukem     unsigned long mask;
    554           1.1  lukem } _ure_trie_t;
    555           1.1  lukem 
    556           1.1  lukem static void
    557           1.1  lukem _ure_ccl_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b)
    558           1.1  lukem {
    559           1.1  lukem     sym->props |= mask;
    560           1.1  lukem }
    561           1.1  lukem 
    562           1.1  lukem static void
    563           1.1  lukem _ure_space_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b)
    564           1.1  lukem {
    565           1.1  lukem     _ure_range_t range;
    566           1.1  lukem 
    567           1.1  lukem     sym->props |= mask;
    568           1.1  lukem 
    569           1.1  lukem     /*
    570           1.1  lukem      * Add the additional characters needed for handling isspace().
    571           1.1  lukem      */
    572           1.1  lukem     range.min_code = range.max_code = '\t';
    573           1.1  lukem     _ure_add_range(&sym->sym.ccl, &range, b);
    574           1.1  lukem     range.min_code = range.max_code = '\r';
    575           1.1  lukem     _ure_add_range(&sym->sym.ccl, &range, b);
    576           1.1  lukem     range.min_code = range.max_code = '\n';
    577           1.1  lukem     _ure_add_range(&sym->sym.ccl, &range, b);
    578           1.1  lukem     range.min_code = range.max_code = '\f';
    579           1.1  lukem     _ure_add_range(&sym->sym.ccl, &range, b);
    580           1.1  lukem     range.min_code = range.max_code = 0xfeff;
    581           1.1  lukem     _ure_add_range(&sym->sym.ccl, &range, b);
    582           1.1  lukem }
    583           1.1  lukem 
    584           1.1  lukem static void
    585           1.1  lukem _ure_xdigit_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b)
    586           1.1  lukem {
    587           1.1  lukem     _ure_range_t range;
    588           1.1  lukem 
    589           1.1  lukem     /*
    590           1.1  lukem      * Add the additional characters needed for handling isxdigit().
    591           1.1  lukem      */
    592           1.1  lukem     range.min_code = '0';
    593           1.1  lukem     range.max_code = '9';
    594           1.1  lukem     _ure_add_range(&sym->sym.ccl, &range, b);
    595           1.1  lukem     range.min_code = 'A';
    596           1.1  lukem     range.max_code = 'F';
    597           1.1  lukem     _ure_add_range(&sym->sym.ccl, &range, b);
    598           1.1  lukem     range.min_code = 'a';
    599           1.1  lukem     range.max_code = 'f';
    600           1.1  lukem     _ure_add_range(&sym->sym.ccl, &range, b);
    601           1.1  lukem }
    602           1.1  lukem 
    603           1.1  lukem static _ure_trie_t cclass_trie[] = {
    604           1.1  lukem     {0x003a, 1, 1, 0, 0},
    605           1.1  lukem     {0x0061, 9, 10, 0, 0},
    606           1.1  lukem     {0x0063, 8, 19, 0, 0},
    607           1.1  lukem     {0x0064, 7, 24, 0, 0},
    608           1.1  lukem     {0x0067, 6, 29, 0, 0},
    609           1.1  lukem     {0x006c, 5, 34, 0, 0},
    610           1.1  lukem     {0x0070, 4, 39, 0, 0},
    611           1.1  lukem     {0x0073, 3, 49, 0, 0},
    612           1.1  lukem     {0x0075, 2, 54, 0, 0},
    613           1.1  lukem     {0x0078, 1, 59, 0, 0},
    614           1.1  lukem     {0x006c, 1, 11, 0, 0},
    615           1.1  lukem     {0x006e, 2, 13, 0, 0},
    616           1.1  lukem     {0x0070, 1, 16, 0, 0},
    617           1.1  lukem     {0x0075, 1, 14, 0, 0},
    618           1.1  lukem     {0x006d, 1, 15, 0, 0},
    619           1.1  lukem     {0x003a, 1, 16, _ure_ccl_setup, _URE_ALNUM_MASK},
    620           1.1  lukem     {0x0068, 1, 17, 0, 0},
    621           1.1  lukem     {0x0061, 1, 18, 0, 0},
    622           1.1  lukem     {0x003a, 1, 19, _ure_ccl_setup, _URE_ALPHA_MASK},
    623           1.1  lukem     {0x006e, 1, 20, 0, 0},
    624           1.1  lukem     {0x0074, 1, 21, 0, 0},
    625           1.1  lukem     {0x0072, 1, 22, 0, 0},
    626           1.1  lukem     {0x006c, 1, 23, 0, 0},
    627           1.1  lukem     {0x003a, 1, 24, _ure_ccl_setup, _URE_CNTRL},
    628           1.1  lukem     {0x0069, 1, 25, 0, 0},
    629           1.1  lukem     {0x0067, 1, 26, 0, 0},
    630           1.1  lukem     {0x0069, 1, 27, 0, 0},
    631           1.1  lukem     {0x0074, 1, 28, 0, 0},
    632           1.1  lukem     {0x003a, 1, 29, _ure_ccl_setup, _URE_NUMDIGIT},
    633           1.1  lukem     {0x0072, 1, 30, 0, 0},
    634           1.1  lukem     {0x0061, 1, 31, 0, 0},
    635           1.1  lukem     {0x0070, 1, 32, 0, 0},
    636           1.1  lukem     {0x0068, 1, 33, 0, 0},
    637           1.1  lukem     {0x003a, 1, 34, _ure_ccl_setup, _URE_GRAPH_MASK},
    638           1.1  lukem     {0x006f, 1, 35, 0, 0},
    639           1.1  lukem     {0x0077, 1, 36, 0, 0},
    640           1.1  lukem     {0x0065, 1, 37, 0, 0},
    641           1.1  lukem     {0x0072, 1, 38, 0, 0},
    642           1.1  lukem     {0x003a, 1, 39, _ure_ccl_setup, _URE_LOWER},
    643           1.1  lukem     {0x0072, 2, 41, 0, 0},
    644           1.1  lukem     {0x0075, 1, 45, 0, 0},
    645           1.1  lukem     {0x0069, 1, 42, 0, 0},
    646           1.1  lukem     {0x006e, 1, 43, 0, 0},
    647           1.1  lukem     {0x0074, 1, 44, 0, 0},
    648           1.1  lukem     {0x003a, 1, 45, _ure_ccl_setup, _URE_PRINT_MASK},
    649           1.1  lukem     {0x006e, 1, 46, 0, 0},
    650           1.1  lukem     {0x0063, 1, 47, 0, 0},
    651           1.1  lukem     {0x0074, 1, 48, 0, 0},
    652           1.1  lukem     {0x003a, 1, 49, _ure_ccl_setup, _URE_PUNCT_MASK},
    653           1.1  lukem     {0x0070, 1, 50, 0, 0},
    654           1.1  lukem     {0x0061, 1, 51, 0, 0},
    655           1.1  lukem     {0x0063, 1, 52, 0, 0},
    656           1.1  lukem     {0x0065, 1, 53, 0, 0},
    657           1.1  lukem     {0x003a, 1, 54, _ure_space_setup, _URE_SPACE_MASK},
    658           1.1  lukem     {0x0070, 1, 55, 0, 0},
    659           1.1  lukem     {0x0070, 1, 56, 0, 0},
    660           1.1  lukem     {0x0065, 1, 57, 0, 0},
    661           1.1  lukem     {0x0072, 1, 58, 0, 0},
    662           1.1  lukem     {0x003a, 1, 59, _ure_ccl_setup, _URE_UPPER},
    663           1.1  lukem     {0x0064, 1, 60, 0, 0},
    664           1.1  lukem     {0x0069, 1, 61, 0, 0},
    665           1.1  lukem     {0x0067, 1, 62, 0, 0},
    666           1.1  lukem     {0x0069, 1, 63, 0, 0},
    667           1.1  lukem     {0x0074, 1, 64, 0, 0},
    668           1.1  lukem     {0x003a, 1, 65, _ure_xdigit_setup, 0},
    669           1.1  lukem };
    670           1.1  lukem 
    671           1.1  lukem /*
    672           1.1  lukem  * Probe for one of the POSIX colon delimited character classes in the static
    673           1.1  lukem  * trie.
    674           1.1  lukem  */
    675           1.1  lukem static unsigned long
    676           1.1  lukem _ure_posix_ccl(ucs2_t *cp, unsigned long limit, _ure_symtab_t *sym,
    677           1.1  lukem                _ure_buffer_t *b)
    678           1.1  lukem {
    679           1.1  lukem     int i;
    680           1.1  lukem     unsigned long n;
    681           1.1  lukem     _ure_trie_t *tp;
    682           1.1  lukem     ucs2_t *sp, *ep;
    683           1.1  lukem 
    684           1.1  lukem     /*
    685           1.1  lukem      * If the number of characters left is less than 7, then this cannot be
    686           1.1  lukem      * interpreted as one of the colon delimited classes.
    687           1.1  lukem      */
    688           1.1  lukem     if (limit < 7)
    689           1.1  lukem       return 0;
    690           1.1  lukem 
    691           1.1  lukem     sp = cp;
    692           1.1  lukem     ep = sp + limit;
    693           1.1  lukem     tp = cclass_trie;
    694           1.1  lukem     for (i = 0; sp < ep && i < 8; i++, sp++) {
    695           1.1  lukem         n = tp->len;
    696           1.1  lukem 
    697           1.1  lukem         for (; n > 0 && tp->key != *sp; tp++, n--) ;
    698           1.1  lukem 
    699           1.1  lukem         if (n == 0)
    700           1.1  lukem           return 0;
    701           1.1  lukem 
    702           1.1  lukem         if (*sp == ':' && (i == 6 || i == 7)) {
    703           1.1  lukem             sp++;
    704           1.1  lukem             break;
    705           1.1  lukem         }
    706           1.1  lukem         if (sp + 1 < ep)
    707           1.1  lukem           tp = cclass_trie + tp->next;
    708           1.1  lukem     }
    709           1.1  lukem     if (tp->func == 0)
    710           1.1  lukem       return 0;
    711           1.1  lukem 
    712           1.1  lukem     (*tp->func)(sym, tp->mask, b);
    713           1.1  lukem 
    714           1.1  lukem     return sp - cp;
    715           1.1  lukem }
    716           1.1  lukem 
    717           1.1  lukem /*
    718           1.1  lukem  * Construct a list of ranges and return the number of characters consumed.
    719           1.1  lukem  */
    720           1.1  lukem static unsigned long
    721           1.1  lukem _ure_cclass(ucs2_t *cp, unsigned long limit, _ure_symtab_t *symp,
    722           1.1  lukem             _ure_buffer_t *b)
    723           1.1  lukem {
    724           1.1  lukem     int range_end;
    725           1.1  lukem     unsigned long n;
    726           1.1  lukem     ucs2_t *sp, *ep;
    727           1.1  lukem     ucs4_t c, last;
    728           1.1  lukem     _ure_ccl_t *cclp;
    729           1.1  lukem     _ure_range_t range;
    730           1.1  lukem 
    731           1.1  lukem     sp = cp;
    732           1.1  lukem     ep = sp + limit;
    733           1.1  lukem 
    734           1.1  lukem     if (*sp == '^') {
    735           1.1  lukem       symp->type = _URE_NCCLASS;
    736           1.1  lukem       sp++;
    737           1.1  lukem     } else
    738           1.1  lukem       symp->type = _URE_CCLASS;
    739           1.1  lukem 
    740           1.1  lukem     for (last = 0, range_end = 0;
    741           1.1  lukem          b->error == _URE_OK && sp < ep && *sp != ']'; ) {
    742           1.1  lukem         c = *sp++;
    743           1.1  lukem         if (c == '\\') {
    744           1.1  lukem             if (sp == ep) {
    745           1.1  lukem                 /*
    746           1.1  lukem                  * The EOS was encountered when expecting the reverse solidus
    747           1.1  lukem                  * to be followed by the character it is escaping.  Set an
    748           1.1  lukem                  * error code and return the number of characters consumed up
    749           1.1  lukem                  * to this point.
    750           1.1  lukem                  */
    751           1.1  lukem                 b->error = _URE_UNEXPECTED_EOS;
    752           1.1  lukem                 return sp - cp;
    753           1.1  lukem             }
    754           1.1  lukem 
    755           1.1  lukem             c = *sp++;
    756           1.1  lukem             switch (c) {
    757           1.1  lukem               case 'a':
    758           1.1  lukem                 c = 0x07;
    759           1.1  lukem                 break;
    760           1.1  lukem               case 'b':
    761           1.1  lukem                 c = 0x08;
    762           1.1  lukem                 break;
    763           1.1  lukem               case 'f':
    764           1.1  lukem                 c = 0x0c;
    765           1.1  lukem                 break;
    766           1.1  lukem               case 'n':
    767           1.1  lukem                 c = 0x0a;
    768           1.1  lukem                 break;
    769           1.1  lukem               case 'r':
    770           1.1  lukem                 c = 0x0d;
    771           1.1  lukem                 break;
    772           1.1  lukem               case 't':
    773           1.1  lukem                 c = 0x09;
    774           1.1  lukem                 break;
    775           1.1  lukem               case 'v':
    776           1.1  lukem                 c = 0x0b;
    777           1.1  lukem                 break;
    778           1.1  lukem               case 'p':
    779           1.1  lukem               case 'P':
    780           1.1  lukem                 sp += _ure_prop_list(sp, ep - sp, &symp->props, b);
    781           1.1  lukem                 /*
    782           1.1  lukem                  * Invert the bit mask of the properties if this is a negated
    783           1.1  lukem                  * character class or if 'P' is used to specify a list of
    784           1.1  lukem                  * character properties that should *not* match in a
    785           1.1  lukem                  * character class.
    786           1.1  lukem                  */
    787           1.1  lukem                 if (c == 'P')
    788           1.1  lukem                   symp->props = ~symp->props;
    789           1.1  lukem                 continue;
    790           1.1  lukem                 break;
    791           1.1  lukem               case 'x':
    792           1.1  lukem               case 'X':
    793           1.1  lukem               case 'u':
    794           1.1  lukem               case 'U':
    795           1.1  lukem                 if (sp < ep &&
    796           1.1  lukem                     ((*sp >= '0' && *sp <= '9') ||
    797           1.1  lukem                      (*sp >= 'A' && *sp <= 'F') ||
    798           1.1  lukem                      (*sp >= 'a' && *sp <= 'f')))
    799           1.1  lukem                   sp += _ure_hex(sp, ep - sp, &c);
    800           1.1  lukem             }
    801           1.1  lukem         } else if (c == ':') {
    802           1.1  lukem             /*
    803           1.1  lukem              * Probe for a POSIX colon delimited character class.
    804           1.1  lukem              */
    805           1.1  lukem             sp--;
    806           1.1  lukem             if ((n = _ure_posix_ccl(sp, ep - sp, symp, b)) == 0)
    807           1.1  lukem               sp++;
    808           1.1  lukem             else {
    809           1.1  lukem                 sp += n;
    810           1.1  lukem                 continue;
    811           1.1  lukem             }
    812           1.1  lukem         }
    813           1.1  lukem 
    814           1.1  lukem         cclp = &symp->sym.ccl;
    815           1.1  lukem 
    816           1.1  lukem         /*
    817           1.1  lukem          * Check to see if the current character is a low surrogate that needs
    818           1.1  lukem          * to be combined with a preceding high surrogate.
    819           1.1  lukem          */
    820           1.1  lukem         if (last != 0) {
    821           1.1  lukem             if (c >= 0xdc00 && c <= 0xdfff)
    822           1.1  lukem               /*
    823           1.1  lukem                * Construct the UTF16 character code.
    824           1.1  lukem                */
    825           1.1  lukem               c = 0x10000 + (((last & 0x03ff) << 10) | (c & 0x03ff));
    826           1.1  lukem             else {
    827           1.1  lukem                 /*
    828           1.1  lukem                  * Add the isolated high surrogate to the range.
    829           1.1  lukem                  */
    830           1.1  lukem                 if (range_end == 1)
    831           1.1  lukem                   range.max_code = last & 0xffff;
    832           1.1  lukem                 else
    833           1.1  lukem                   range.min_code = range.max_code = last & 0xffff;
    834           1.1  lukem 
    835           1.1  lukem                 _ure_add_range(cclp, &range, b);
    836           1.1  lukem                 range_end = 0;
    837           1.1  lukem             }
    838           1.1  lukem         }
    839           1.1  lukem 
    840           1.1  lukem         /*
    841           1.1  lukem          * Clear the last character code.
    842           1.1  lukem          */
    843           1.1  lukem         last = 0;
    844           1.1  lukem 
    845           1.1  lukem         /*
    846           1.1  lukem          * This slightly awkward code handles the different cases needed to
    847           1.1  lukem          * construct a range.
    848           1.1  lukem          */
    849           1.1  lukem         if (c >= 0xd800 && c <= 0xdbff) {
    850           1.1  lukem             /*
    851           1.1  lukem              * If the high surrogate is followed by a range indicator, simply
    852           1.1  lukem              * add it as the range start.  Otherwise, save it in case the next
    853           1.1  lukem              * character is a low surrogate.
    854           1.1  lukem              */
    855           1.1  lukem             if (*sp == '-') {
    856           1.1  lukem                 sp++;
    857           1.1  lukem                 range.min_code = c;
    858           1.1  lukem                 range_end = 1;
    859           1.1  lukem             } else
    860           1.1  lukem               last = c;
    861           1.1  lukem         } else if (range_end == 1) {
    862           1.1  lukem             range.max_code = c;
    863           1.1  lukem             _ure_add_range(cclp, &range, b);
    864           1.1  lukem             range_end = 0;
    865           1.1  lukem         } else {
    866           1.1  lukem             range.min_code = range.max_code = c;
    867           1.1  lukem             if (*sp == '-') {
    868           1.1  lukem                 sp++;
    869           1.1  lukem                 range_end = 1;
    870           1.1  lukem             } else
    871           1.1  lukem               _ure_add_range(cclp, &range, b);
    872           1.1  lukem         }
    873           1.1  lukem     }
    874           1.1  lukem 
    875           1.1  lukem     if (sp < ep && *sp == ']')
    876           1.1  lukem       sp++;
    877           1.1  lukem     else
    878           1.1  lukem       /*
    879           1.1  lukem        * The parse was not terminated by the character class close symbol
    880           1.1  lukem        * (']'), so set an error code.
    881           1.1  lukem        */
    882           1.1  lukem       b->error = _URE_CCLASS_OPEN;
    883           1.1  lukem 
    884           1.1  lukem     return sp - cp;
    885           1.1  lukem }
    886           1.1  lukem 
    887           1.1  lukem /*
    888           1.1  lukem  * Probe for a low surrogate hex code.
    889           1.1  lukem  */
    890           1.1  lukem static unsigned long
    891           1.1  lukem _ure_probe_ls(ucs2_t *ls, unsigned long limit, ucs4_t *c)
    892           1.1  lukem {
    893           1.1  lukem     ucs4_t i, code;
    894           1.1  lukem     ucs2_t *sp, *ep;
    895           1.1  lukem 
    896           1.1  lukem     for (i = code = 0, sp = ls, ep = sp + limit; i < 4 && sp < ep; sp++) {
    897           1.1  lukem         if (*sp >= '0' && *sp <= '9')
    898           1.1  lukem           code = (code << 4) + (*sp - '0');
    899           1.1  lukem         else if (*sp >= 'A' && *sp <= 'F')
    900           1.1  lukem           code = (code << 4) + ((*sp - 'A') + 10);
    901           1.1  lukem         else if (*sp >= 'a' && *sp <= 'f')
    902           1.1  lukem           code = (code << 4) + ((*sp - 'a') + 10);
    903           1.1  lukem         else
    904           1.1  lukem           break;
    905           1.1  lukem     }
    906           1.1  lukem 
    907           1.1  lukem     *c = code;
    908           1.1  lukem     return (0xdc00 <= code && code <= 0xdfff) ? sp - ls : 0;
    909           1.1  lukem }
    910           1.1  lukem 
    911           1.1  lukem static unsigned long
    912           1.1  lukem _ure_compile_symbol(ucs2_t *sym, unsigned long limit, _ure_symtab_t *symp,
    913           1.1  lukem                     _ure_buffer_t *b)
    914           1.1  lukem {
    915           1.1  lukem     ucs4_t c;
    916           1.1  lukem     ucs2_t *sp, *ep;
    917           1.1  lukem 
    918           1.1  lukem     sp = sym;
    919           1.1  lukem     ep = sym + limit;
    920           1.1  lukem 
    921           1.1  lukem     if ((c = *sp++) == '\\') {
    922           1.1  lukem 
    923           1.1  lukem         if (sp == ep) {
    924           1.1  lukem             /*
    925           1.1  lukem              * The EOS was encountered when expecting the reverse solidus to
    926           1.1  lukem              * be followed by the character it is escaping.  Set an error code
    927           1.1  lukem              * and return the number of characters consumed up to this point.
    928           1.1  lukem              */
    929           1.1  lukem             b->error = _URE_UNEXPECTED_EOS;
    930           1.1  lukem             return sp - sym;
    931           1.1  lukem         }
    932           1.1  lukem 
    933           1.1  lukem         c = *sp++;
    934           1.1  lukem         switch (c) {
    935           1.1  lukem           case 'p':
    936           1.1  lukem           case 'P':
    937           1.1  lukem             symp->type = (c == 'p') ? _URE_CCLASS : _URE_NCCLASS;
    938           1.1  lukem             sp += _ure_prop_list(sp, ep - sp, &symp->props, b);
    939           1.1  lukem             break;
    940           1.1  lukem           case 'a':
    941           1.1  lukem             symp->type = _URE_CHAR;
    942           1.1  lukem             symp->sym.chr = 0x07;
    943           1.1  lukem             break;
    944           1.1  lukem           case 'b':
    945           1.1  lukem             symp->type = _URE_CHAR;
    946           1.1  lukem             symp->sym.chr = 0x08;
    947           1.1  lukem             break;
    948           1.1  lukem           case 'f':
    949           1.1  lukem             symp->type = _URE_CHAR;
    950           1.1  lukem             symp->sym.chr = 0x0c;
    951           1.1  lukem             break;
    952           1.1  lukem           case 'n':
    953           1.1  lukem             symp->type = _URE_CHAR;
    954           1.1  lukem             symp->sym.chr = 0x0a;
    955           1.1  lukem             break;
    956           1.1  lukem           case 'r':
    957           1.1  lukem             symp->type = _URE_CHAR;
    958           1.1  lukem             symp->sym.chr = 0x0d;
    959           1.1  lukem             break;
    960           1.1  lukem           case 't':
    961           1.1  lukem             symp->type = _URE_CHAR;
    962           1.1  lukem             symp->sym.chr = 0x09;
    963           1.1  lukem             break;
    964           1.1  lukem           case 'v':
    965           1.1  lukem             symp->type = _URE_CHAR;
    966           1.1  lukem             symp->sym.chr = 0x0b;
    967           1.1  lukem             break;
    968           1.1  lukem           case 'x':
    969           1.1  lukem           case 'X':
    970           1.1  lukem           case 'u':
    971           1.1  lukem           case 'U':
    972           1.1  lukem             /*
    973           1.1  lukem              * Collect between 1 and 4 digits representing a UCS2 code.  Fall
    974           1.1  lukem              * through to the next case.
    975           1.1  lukem              */
    976           1.1  lukem             if (sp < ep &&
    977           1.1  lukem                 ((*sp >= '0' && *sp <= '9') ||
    978           1.1  lukem                  (*sp >= 'A' && *sp <= 'F') ||
    979           1.1  lukem                  (*sp >= 'a' && *sp <= 'f')))
    980           1.1  lukem               sp += _ure_hex(sp, ep - sp, &c);
    981           1.1  lukem             /* FALLTHROUGH */
    982           1.1  lukem           default:
    983           1.1  lukem             /*
    984           1.1  lukem              * Simply add an escaped character here.
    985           1.1  lukem              */
    986           1.1  lukem             symp->type = _URE_CHAR;
    987           1.1  lukem             symp->sym.chr = c;
    988           1.1  lukem         }
    989           1.1  lukem     } else if (c == '^' || c == '$')
    990           1.1  lukem       /*
    991           1.1  lukem        * Handle the BOL and EOL anchors.  This actually consists simply of
    992           1.1  lukem        * setting a flag that indicates that the user supplied anchor match
    993           1.1  lukem        * function should be called.  This needs to be done instead of simply
    994           1.1  lukem        * matching line/paragraph separators because beginning-of-text and
    995           1.1  lukem        * end-of-text tests are needed as well.
    996           1.1  lukem        */
    997           1.1  lukem       symp->type = (c == '^') ? _URE_BOL_ANCHOR : _URE_EOL_ANCHOR;
    998           1.1  lukem     else if (c == '[')
    999           1.1  lukem       /*
   1000           1.1  lukem        * Construct a character class.
   1001           1.1  lukem        */
   1002           1.1  lukem       sp += _ure_cclass(sp, ep - sp, symp, b);
   1003           1.1  lukem     else if (c == '.')
   1004           1.1  lukem       symp->type = _URE_ANY_CHAR;
   1005           1.1  lukem     else {
   1006           1.1  lukem         symp->type = _URE_CHAR;
   1007           1.1  lukem         symp->sym.chr = c;
   1008           1.1  lukem     }
   1009           1.1  lukem 
   1010           1.1  lukem     /*
   1011           1.1  lukem      * If the symbol type happens to be a character and is a high surrogate,
   1012           1.1  lukem      * then probe forward to see if it is followed by a low surrogate that
   1013           1.1  lukem      * needs to be added.
   1014           1.1  lukem      */
   1015           1.1  lukem     if (sp < ep && symp->type == _URE_CHAR &&
   1016           1.1  lukem         0xd800 <= symp->sym.chr && symp->sym.chr <= 0xdbff) {
   1017           1.1  lukem 
   1018           1.1  lukem         if (0xdc00 <= *sp && *sp <= 0xdfff) {
   1019           1.1  lukem             symp->sym.chr = 0x10000 + (((symp->sym.chr & 0x03ff) << 10) |
   1020           1.1  lukem                                        (*sp & 0x03ff));
   1021           1.1  lukem             sp++;
   1022           1.1  lukem         } else if (*sp == '\\' && (*(sp + 1) == 'x' || *(sp + 1) == 'X' ||
   1023           1.1  lukem                                  *(sp + 1) == 'u' || *(sp + 1) == 'U')) {
   1024           1.1  lukem             sp += _ure_probe_ls(sp + 2, ep - (sp + 2), &c);
   1025           1.1  lukem             if (0xdc00 <= c && c <= 0xdfff) {
   1026           1.1  lukem                 /*
   1027           1.1  lukem                  * Take into account the \[xu] in front of the hex code.
   1028           1.1  lukem                  */
   1029           1.1  lukem                 sp += 2;
   1030           1.1  lukem                 symp->sym.chr = 0x10000 + (((symp->sym.chr & 0x03ff) << 10) |
   1031           1.1  lukem                                            (c & 0x03ff));
   1032           1.1  lukem             }
   1033           1.1  lukem         }
   1034           1.1  lukem     }
   1035           1.1  lukem 
   1036           1.1  lukem     /*
   1037           1.1  lukem      * Last, make sure any _URE_CHAR type symbols are changed to lower case if
   1038           1.1  lukem      * the `casefold' flag is set.
   1039           1.1  lukem      */
   1040           1.1  lukem     if ((b->flags & _URE_DFA_CASEFOLD) && symp->type == _URE_CHAR)
   1041           1.1  lukem       symp->sym.chr = _ure_tolower(symp->sym.chr);
   1042           1.1  lukem 
   1043           1.1  lukem     /*
   1044           1.1  lukem      * If the symbol constructed is anything other than one of the anchors,
   1045           1.1  lukem      * make sure the _URE_DFA_BLANKLINE flag is removed.
   1046           1.1  lukem      */
   1047           1.1  lukem     if (symp->type != _URE_BOL_ANCHOR && symp->type != _URE_EOL_ANCHOR)
   1048           1.1  lukem       b->flags &= ~_URE_DFA_BLANKLINE;
   1049           1.1  lukem 
   1050           1.1  lukem     /*
   1051           1.1  lukem      * Return the number of characters consumed.
   1052           1.1  lukem      */
   1053           1.1  lukem     return sp - sym;
   1054           1.1  lukem }
   1055           1.1  lukem 
   1056           1.1  lukem static int
   1057           1.1  lukem _ure_sym_neq(_ure_symtab_t *a, _ure_symtab_t *b)
   1058           1.1  lukem {
   1059           1.1  lukem     if (a->type != b->type || a->mods != b->mods || a->props != b->props)
   1060           1.1  lukem       return 1;
   1061           1.1  lukem 
   1062           1.1  lukem     if (a->type == _URE_CCLASS || a->type == _URE_NCCLASS) {
   1063           1.1  lukem         if (a->sym.ccl.ranges_used != b->sym.ccl.ranges_used)
   1064           1.1  lukem           return 1;
   1065           1.1  lukem         if (a->sym.ccl.ranges_used > 0 &&
   1066           1.1  lukem             memcmp((char *) a->sym.ccl.ranges, (char *) b->sym.ccl.ranges,
   1067           1.1  lukem                    sizeof(_ure_range_t) * a->sym.ccl.ranges_used) != 0)
   1068           1.1  lukem           return 1;
   1069           1.1  lukem     } else if (a->type == _URE_CHAR && a->sym.chr != b->sym.chr)
   1070           1.1  lukem       return 1;
   1071           1.1  lukem     return 0;
   1072           1.1  lukem }
   1073           1.1  lukem 
   1074           1.1  lukem /*
   1075           1.1  lukem  * Construct a symbol, but only keep unique symbols.
   1076           1.1  lukem  */
   1077           1.1  lukem static ucs2_t
   1078           1.1  lukem _ure_make_symbol(ucs2_t *sym, unsigned long limit, unsigned long *consumed,
   1079           1.1  lukem                  _ure_buffer_t *b)
   1080           1.1  lukem {
   1081           1.1  lukem     ucs2_t i;
   1082           1.1  lukem     _ure_symtab_t *sp, symbol;
   1083           1.1  lukem 
   1084           1.1  lukem     /*
   1085           1.1  lukem      * Build the next symbol so we can test to see if it is already in the
   1086           1.1  lukem      * symbol table.
   1087           1.1  lukem      */
   1088           1.1  lukem     (void) memset((char *) &symbol, '\0', sizeof(_ure_symtab_t));
   1089           1.1  lukem     *consumed = _ure_compile_symbol(sym, limit, &symbol, b);
   1090           1.1  lukem 
   1091           1.1  lukem     /*
   1092           1.1  lukem      * Check to see if the symbol exists.
   1093           1.1  lukem      */
   1094           1.1  lukem     for (i = 0, sp = b->symtab;
   1095           1.1  lukem          i < b->symtab_used && _ure_sym_neq(&symbol, sp); i++, sp++) ;
   1096           1.1  lukem 
   1097           1.1  lukem     if (i < b->symtab_used) {
   1098           1.1  lukem         /*
   1099           1.1  lukem          * Free up any ranges used for the symbol.
   1100           1.1  lukem          */
   1101           1.1  lukem         if ((symbol.type == _URE_CCLASS || symbol.type == _URE_NCCLASS) &&
   1102           1.1  lukem             symbol.sym.ccl.ranges_size > 0)
   1103           1.1  lukem           free((char *) symbol.sym.ccl.ranges);
   1104           1.1  lukem 
   1105           1.1  lukem         return b->symtab[i].id;
   1106           1.1  lukem     }
   1107           1.1  lukem 
   1108           1.1  lukem     /*
   1109           1.1  lukem      * Need to add the new symbol.
   1110           1.1  lukem      */
   1111           1.1  lukem     if (b->symtab_used == b->symtab_size) {
   1112           1.1  lukem         if (b->symtab_size == 0)
   1113           1.1  lukem           b->symtab = (_ure_symtab_t *) malloc(sizeof(_ure_symtab_t) << 3);
   1114           1.1  lukem         else
   1115           1.1  lukem           b->symtab = (_ure_symtab_t *)
   1116           1.1  lukem               realloc((char *) b->symtab,
   1117           1.1  lukem                       sizeof(_ure_symtab_t) * (b->symtab_size + 8));
   1118           1.1  lukem         sp = b->symtab + b->symtab_size;
   1119           1.1  lukem         (void) memset((char *) sp, '\0', sizeof(_ure_symtab_t) << 3);
   1120           1.1  lukem         b->symtab_size += 8;
   1121           1.1  lukem     }
   1122           1.1  lukem 
   1123           1.1  lukem     symbol.id = b->symtab_used++;
   1124           1.1  lukem     (void) AC_MEMCPY((char *) &b->symtab[symbol.id], (char *) &symbol,
   1125           1.1  lukem                   sizeof(_ure_symtab_t));
   1126           1.1  lukem 
   1127           1.1  lukem     return symbol.id;
   1128           1.1  lukem }
   1129           1.1  lukem 
   1130           1.1  lukem /*************************************************************************
   1131           1.1  lukem  *
   1132           1.1  lukem  * End symbol parse functions.
   1133           1.1  lukem  *
   1134           1.1  lukem  *************************************************************************/
   1135           1.1  lukem 
   1136           1.1  lukem static ucs2_t
   1137           1.1  lukem _ure_make_expr(ucs2_t type, ucs2_t lhs, ucs2_t rhs, _ure_buffer_t *b)
   1138           1.1  lukem {
   1139           1.1  lukem     ucs2_t i;
   1140           1.1  lukem 
   1141           1.1  lukem     if (b == 0)
   1142           1.1  lukem       return _URE_NOOP;
   1143           1.1  lukem 
   1144           1.1  lukem     /*
   1145           1.1  lukem      * Determine if the expression already exists or not.
   1146           1.1  lukem      */
   1147           1.1  lukem     for (i = 0; i < b->expr_used; i++) {
   1148           1.1  lukem         if (b->expr[i].type == type && b->expr[i].lhs == lhs &&
   1149           1.1  lukem             b->expr[i].rhs == rhs)
   1150           1.1  lukem           break;
   1151           1.1  lukem     }
   1152           1.1  lukem     if (i < b->expr_used)
   1153           1.1  lukem       return i;
   1154           1.1  lukem 
   1155           1.1  lukem     /*
   1156           1.1  lukem      * Need to add a new expression.
   1157           1.1  lukem      */
   1158           1.1  lukem     if (b->expr_used == b->expr_size) {
   1159           1.1  lukem         if (b->expr_size == 0)
   1160           1.1  lukem           b->expr = (_ure_elt_t *) malloc(sizeof(_ure_elt_t) << 3);
   1161           1.1  lukem         else
   1162           1.1  lukem           b->expr = (_ure_elt_t *)
   1163           1.1  lukem               realloc((char *) b->expr,
   1164           1.1  lukem                       sizeof(_ure_elt_t) * (b->expr_size + 8));
   1165           1.1  lukem         b->expr_size += 8;
   1166           1.1  lukem     }
   1167           1.1  lukem 
   1168           1.1  lukem     b->expr[b->expr_used].onstack = 0;
   1169           1.1  lukem     b->expr[b->expr_used].type = type;
   1170           1.1  lukem     b->expr[b->expr_used].lhs = lhs;
   1171           1.1  lukem     b->expr[b->expr_used].rhs = rhs;
   1172           1.1  lukem 
   1173           1.1  lukem     return b->expr_used++;
   1174           1.1  lukem }
   1175           1.1  lukem 
   1176           1.1  lukem static unsigned char spmap[] = {
   1177           1.1  lukem     0x00, 0x00, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00,
   1178           1.1  lukem     0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   1179           1.1  lukem     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   1180           1.1  lukem };
   1181           1.1  lukem 
   1182           1.1  lukem #define _ure_isspecial(cc) ((cc) > 0x20 && (cc) < 0x7f && \
   1183           1.1  lukem                             (spmap[(cc) >> 3] & (1 << ((cc) & 7))))
   1184           1.1  lukem 
   1185           1.1  lukem /*
   1186           1.1  lukem  * Convert the regular expression into an NFA in a form that will be easy to
   1187           1.1  lukem  * reduce to a DFA.  The starting state for the reduction will be returned.
   1188           1.1  lukem  */
   1189           1.1  lukem static ucs2_t
   1190           1.1  lukem _ure_re2nfa(ucs2_t *re, unsigned long relen, _ure_buffer_t *b)
   1191           1.1  lukem {
   1192           1.1  lukem     ucs2_t c, state, top, sym, *sp, *ep;
   1193           1.1  lukem     unsigned long used;
   1194           1.1  lukem 
   1195           1.1  lukem     state = _URE_NOOP;
   1196           1.1  lukem 
   1197           1.1  lukem     sp = re;
   1198           1.1  lukem     ep = sp + relen;
   1199           1.1  lukem     while (b->error == _URE_OK && sp < ep) {
   1200           1.1  lukem         c = *sp++;
   1201           1.1  lukem         switch (c) {
   1202           1.1  lukem           case '(':
   1203           1.1  lukem             _ure_push(_URE_PAREN, b);
   1204           1.1  lukem             break;
   1205           1.1  lukem           case ')':
   1206           1.1  lukem             /*
   1207           1.1  lukem              * Check for the case of too many close parentheses.
   1208           1.1  lukem              */
   1209           1.1  lukem             if (_ure_peek(b) == _URE_NOOP) {
   1210           1.1  lukem                 b->error = _URE_UNBALANCED_GROUP;
   1211           1.1  lukem                 break;
   1212           1.1  lukem             }
   1213           1.1  lukem 
   1214           1.1  lukem             while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR)
   1215           1.1  lukem               /*
   1216           1.1  lukem                * Make an expression with the AND or OR operator and its right
   1217           1.1  lukem                * hand side.
   1218           1.1  lukem                */
   1219           1.1  lukem               state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b);
   1220           1.1  lukem 
   1221           1.1  lukem             /*
   1222           1.1  lukem              * Remove the _URE_PAREN off the stack.
   1223           1.1  lukem              */
   1224           1.1  lukem             (void) _ure_pop(b);
   1225           1.1  lukem             break;
   1226           1.1  lukem           case '*':
   1227           1.1  lukem             state = _ure_make_expr(_URE_STAR, state, _URE_NOOP, b);
   1228           1.1  lukem             break;
   1229           1.1  lukem           case '+':
   1230           1.1  lukem             state = _ure_make_expr(_URE_PLUS, state, _URE_NOOP, b);
   1231           1.1  lukem             break;
   1232           1.1  lukem           case '?':
   1233           1.1  lukem             state = _ure_make_expr(_URE_QUEST, state, _URE_NOOP, b);
   1234           1.1  lukem             break;
   1235           1.1  lukem           case '|':
   1236           1.1  lukem             while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR)
   1237           1.1  lukem               /*
   1238           1.1  lukem                * Make an expression with the AND or OR operator and its right
   1239           1.1  lukem                * hand side.
   1240           1.1  lukem                */
   1241           1.1  lukem               state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b);
   1242           1.1  lukem 
   1243           1.1  lukem             _ure_push(state, b);
   1244           1.1  lukem             _ure_push(_URE_OR, b);
   1245           1.1  lukem             break;
   1246           1.1  lukem           default:
   1247           1.1  lukem             sp--;
   1248           1.1  lukem             sym = _ure_make_symbol(sp, ep - sp, &used, b);
   1249           1.1  lukem             sp += used;
   1250           1.1  lukem             state = _ure_make_expr(_URE_SYMBOL, sym, _URE_NOOP, b);
   1251           1.1  lukem             break;
   1252           1.1  lukem         }
   1253           1.1  lukem 
   1254           1.1  lukem         if (c != '(' && c != '|' && sp < ep &&
   1255           1.1  lukem             (!_ure_isspecial(*sp) || *sp == '(')) {
   1256           1.1  lukem             _ure_push(state, b);
   1257           1.1  lukem             _ure_push(_URE_AND, b);
   1258           1.1  lukem         }
   1259           1.1  lukem     }
   1260           1.1  lukem     while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR)
   1261           1.1  lukem       /*
   1262           1.1  lukem        * Make an expression with the AND or OR operator and its right
   1263           1.1  lukem        * hand side.
   1264           1.1  lukem        */
   1265           1.1  lukem       state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b);
   1266           1.1  lukem 
   1267           1.1  lukem     if (b->stack.slist_used > 0)
   1268           1.1  lukem       b->error = _URE_UNBALANCED_GROUP;
   1269           1.1  lukem 
   1270           1.1  lukem     return (b->error == _URE_OK) ? state : _URE_NOOP;
   1271           1.1  lukem }
   1272           1.1  lukem 
   1273           1.1  lukem static void
   1274           1.1  lukem _ure_add_symstate(ucs2_t sym, ucs2_t state, _ure_buffer_t *b)
   1275           1.1  lukem {
   1276           1.1  lukem     ucs2_t i, *stp;
   1277           1.1  lukem     _ure_symtab_t *sp;
   1278           1.1  lukem 
   1279           1.1  lukem     /*
   1280           1.1  lukem      * Locate the symbol in the symbol table so the state can be added.
   1281           1.1  lukem      * If the symbol doesn't exist, then a real problem exists.
   1282           1.1  lukem      */
   1283           1.1  lukem     for (i = 0, sp = b->symtab; i < b->symtab_used && sym != sp->id;
   1284           1.1  lukem          i++, sp++) ;
   1285           1.1  lukem 
   1286           1.1  lukem     /*
   1287           1.1  lukem      * Now find out if the state exists in the symbol's state list.
   1288           1.1  lukem      */
   1289           1.1  lukem     for (i = 0, stp = sp->states.slist;
   1290           1.1  lukem          i < sp->states.slist_used && state > *stp; i++, stp++) ;
   1291           1.1  lukem 
   1292           1.1  lukem     if (i == sp->states.slist_used || state < *stp) {
   1293           1.1  lukem         /*
   1294           1.1  lukem          * Need to add the state in order.
   1295           1.1  lukem          */
   1296           1.1  lukem         if (sp->states.slist_used == sp->states.slist_size) {
   1297           1.1  lukem             if (sp->states.slist_size == 0)
   1298           1.1  lukem               sp->states.slist = (ucs2_t *) malloc(sizeof(ucs2_t) << 3);
   1299           1.1  lukem             else
   1300           1.1  lukem               sp->states.slist = (ucs2_t *)
   1301           1.1  lukem                   realloc((char *) sp->states.slist,
   1302           1.1  lukem                           sizeof(ucs2_t) * (sp->states.slist_size + 8));
   1303           1.1  lukem             sp->states.slist_size += 8;
   1304           1.1  lukem         }
   1305           1.1  lukem         if (i < sp->states.slist_used)
   1306           1.1  lukem           (void) _ure_memmove((char *) (sp->states.slist + i + 1),
   1307           1.1  lukem                               (char *) (sp->states.slist + i),
   1308           1.1  lukem                               sizeof(ucs2_t) * (sp->states.slist_used - i));
   1309           1.1  lukem         sp->states.slist[i] = state;
   1310           1.1  lukem         sp->states.slist_used++;
   1311           1.1  lukem     }
   1312           1.1  lukem }
   1313           1.1  lukem 
   1314           1.1  lukem static ucs2_t
   1315           1.1  lukem _ure_add_state(ucs2_t nstates, ucs2_t *states, _ure_buffer_t *b)
   1316           1.1  lukem {
   1317           1.1  lukem     ucs2_t i;
   1318           1.1  lukem     _ure_state_t *sp;
   1319           1.1  lukem 
   1320           1.1  lukem     for (i = 0, sp = b->states.states; i < b->states.states_used; i++, sp++) {
   1321           1.1  lukem         if (sp->st.slist_used == nstates &&
   1322           1.1  lukem             memcmp((char *) states, (char *) sp->st.slist,
   1323           1.1  lukem                    sizeof(ucs2_t) * nstates) == 0)
   1324           1.1  lukem           break;
   1325           1.1  lukem     }
   1326           1.1  lukem 
   1327           1.1  lukem     if (i == b->states.states_used) {
   1328           1.1  lukem         /*
   1329           1.1  lukem          * Need to add a new DFA state (set of NFA states).
   1330           1.1  lukem          */
   1331           1.1  lukem         if (b->states.states_used == b->states.states_size) {
   1332           1.1  lukem             if (b->states.states_size == 0)
   1333           1.1  lukem               b->states.states = (_ure_state_t *)
   1334           1.1  lukem                   malloc(sizeof(_ure_state_t) << 3);
   1335           1.1  lukem             else
   1336           1.1  lukem               b->states.states = (_ure_state_t *)
   1337           1.1  lukem                   realloc((char *) b->states.states,
   1338           1.1  lukem                           sizeof(_ure_state_t) * (b->states.states_size + 8));
   1339           1.1  lukem             sp = b->states.states + b->states.states_size;
   1340           1.1  lukem             (void) memset((char *) sp, '\0', sizeof(_ure_state_t) << 3);
   1341           1.1  lukem             b->states.states_size += 8;
   1342           1.1  lukem         }
   1343           1.1  lukem 
   1344           1.1  lukem         sp = b->states.states + b->states.states_used++;
   1345           1.1  lukem         sp->id = i;
   1346           1.1  lukem 
   1347           1.1  lukem         if (sp->st.slist_used + nstates > sp->st.slist_size) {
   1348           1.1  lukem             if (sp->st.slist_size == 0)
   1349           1.1  lukem               sp->st.slist = (ucs2_t *)
   1350           1.1  lukem                   malloc(sizeof(ucs2_t) * (sp->st.slist_used + nstates));
   1351           1.1  lukem             else
   1352           1.1  lukem               sp->st.slist = (ucs2_t *)
   1353           1.1  lukem                   realloc((char *) sp->st.slist,
   1354           1.1  lukem                           sizeof(ucs2_t) * (sp->st.slist_used + nstates));
   1355           1.1  lukem             sp->st.slist_size = sp->st.slist_used + nstates;
   1356           1.1  lukem         }
   1357           1.1  lukem         sp->st.slist_used = nstates;
   1358           1.1  lukem         (void) AC_MEMCPY((char *) sp->st.slist, (char *) states,
   1359           1.1  lukem                       sizeof(ucs2_t) * nstates);
   1360           1.1  lukem     }
   1361           1.1  lukem 
   1362           1.1  lukem     /*
   1363           1.1  lukem      * Return the ID of the DFA state representing a group of NFA states.
   1364           1.1  lukem      */
   1365           1.1  lukem     return i;
   1366           1.1  lukem }
   1367           1.1  lukem 
   1368           1.1  lukem static void
   1369           1.1  lukem _ure_reduce(ucs2_t start, _ure_buffer_t *b)
   1370           1.1  lukem {
   1371           1.1  lukem     ucs2_t i, j, state, eval, syms, rhs;
   1372           1.1  lukem     ucs2_t s1, s2, ns1, ns2;
   1373           1.1  lukem     _ure_state_t *sp;
   1374           1.1  lukem     _ure_symtab_t *smp;
   1375           1.1  lukem 
   1376           1.1  lukem     b->reducing = 1;
   1377           1.1  lukem 
   1378           1.1  lukem     /*
   1379           1.1  lukem      * Add the starting state for the reduction.
   1380           1.1  lukem      */
   1381           1.1  lukem     _ure_add_state(1, &start, b);
   1382           1.1  lukem 
   1383           1.1  lukem     /*
   1384           1.1  lukem      * Process each set of NFA states that get created.
   1385           1.1  lukem      */
   1386           1.1  lukem     for (i = 0; i < b->states.states_used; i++) {
   1387           1.1  lukem         sp = b->states.states + i;
   1388           1.1  lukem 
   1389           1.1  lukem         /*
   1390           1.1  lukem          * Push the current states on the stack.
   1391           1.1  lukem          */
   1392           1.1  lukem         for (j = 0; j < sp->st.slist_used; j++)
   1393           1.1  lukem           _ure_push(sp->st.slist[j], b);
   1394           1.1  lukem 
   1395           1.1  lukem         /*
   1396           1.1  lukem          * Reduce the NFA states.
   1397           1.1  lukem          */
   1398           1.1  lukem         for (j = sp->accepting = syms = 0; j < b->stack.slist_used; j++) {
   1399           1.1  lukem             state = b->stack.slist[j];
   1400           1.1  lukem             eval = 1;
   1401           1.1  lukem 
   1402           1.1  lukem             /*
   1403           1.1  lukem              * This inner loop is the iterative equivalent of recursively
   1404           1.1  lukem              * reducing subexpressions generated as a result of a reduction.
   1405           1.1  lukem              */
   1406           1.1  lukem             while (eval) {
   1407           1.1  lukem                 switch (b->expr[state].type) {
   1408           1.1  lukem                   case _URE_SYMBOL:
   1409           1.1  lukem                     ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b);
   1410           1.1  lukem                     _ure_add_symstate(b->expr[state].lhs, ns1, b);
   1411           1.1  lukem                     syms++;
   1412           1.1  lukem                     eval = 0;
   1413           1.1  lukem                     break;
   1414           1.1  lukem                   case _URE_ONE:
   1415           1.1  lukem                     sp->accepting = 1;
   1416           1.1  lukem                     eval = 0;
   1417           1.1  lukem                     break;
   1418           1.1  lukem                   case _URE_QUEST:
   1419           1.1  lukem                     s1 = b->expr[state].lhs;
   1420           1.1  lukem                     ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b);
   1421           1.1  lukem                     state = _ure_make_expr(_URE_OR, ns1, s1, b);
   1422           1.1  lukem                     break;
   1423           1.1  lukem                   case _URE_PLUS:
   1424           1.1  lukem                     s1 = b->expr[state].lhs;
   1425           1.1  lukem                     ns1 = _ure_make_expr(_URE_STAR, s1, _URE_NOOP, b);
   1426           1.1  lukem                     state = _ure_make_expr(_URE_AND, s1, ns1, b);
   1427           1.1  lukem                     break;
   1428           1.1  lukem                   case _URE_STAR:
   1429           1.1  lukem                     s1 = b->expr[state].lhs;
   1430           1.1  lukem                     ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b);
   1431           1.1  lukem                     ns2 = _ure_make_expr(_URE_PLUS, s1, _URE_NOOP, b);
   1432           1.1  lukem                     state = _ure_make_expr(_URE_OR, ns1, ns2, b);
   1433           1.1  lukem                     break;
   1434           1.1  lukem                   case _URE_OR:
   1435           1.1  lukem                     s1 = b->expr[state].lhs;
   1436           1.1  lukem                     s2 = b->expr[state].rhs;
   1437           1.1  lukem                     _ure_push(s1, b);
   1438           1.1  lukem                     _ure_push(s2, b);
   1439           1.1  lukem                     eval = 0;
   1440           1.1  lukem                     break;
   1441           1.1  lukem                   case _URE_AND:
   1442           1.1  lukem                     s1 = b->expr[state].lhs;
   1443           1.1  lukem                     s2 = b->expr[state].rhs;
   1444           1.1  lukem                     switch (b->expr[s1].type) {
   1445           1.1  lukem                       case _URE_SYMBOL:
   1446           1.1  lukem                         _ure_add_symstate(b->expr[s1].lhs, s2, b);
   1447           1.1  lukem                         syms++;
   1448           1.1  lukem                         eval = 0;
   1449           1.1  lukem                         break;
   1450           1.1  lukem                       case _URE_ONE:
   1451           1.1  lukem                         state = s2;
   1452           1.1  lukem                         break;
   1453           1.1  lukem                       case _URE_QUEST:
   1454           1.1  lukem                         ns1 = b->expr[s1].lhs;
   1455           1.1  lukem                         ns2 = _ure_make_expr(_URE_AND, ns1, s2, b);
   1456           1.1  lukem                         state = _ure_make_expr(_URE_OR, s2, ns2, b);
   1457           1.1  lukem                         break;
   1458           1.1  lukem                       case _URE_PLUS:
   1459           1.1  lukem                         ns1 = b->expr[s1].lhs;
   1460           1.1  lukem                         ns2 = _ure_make_expr(_URE_OR, s2, state, b);
   1461           1.1  lukem                         state = _ure_make_expr(_URE_AND, ns1, ns2, b);
   1462           1.1  lukem                         break;
   1463           1.1  lukem                       case _URE_STAR:
   1464           1.1  lukem                         ns1 = b->expr[s1].lhs;
   1465           1.1  lukem                         ns2 = _ure_make_expr(_URE_AND, ns1, state, b);
   1466           1.1  lukem                         state = _ure_make_expr(_URE_OR, s2, ns2, b);
   1467           1.1  lukem                         break;
   1468           1.1  lukem                       case _URE_OR:
   1469           1.1  lukem                         ns1 = b->expr[s1].lhs;
   1470           1.1  lukem                         ns2 = b->expr[s1].rhs;
   1471           1.1  lukem                         ns1 = _ure_make_expr(_URE_AND, ns1, s2, b);
   1472           1.1  lukem                         ns2 = _ure_make_expr(_URE_AND, ns2, s2, b);
   1473           1.1  lukem                         state = _ure_make_expr(_URE_OR, ns1, ns2, b);
   1474           1.1  lukem                         break;
   1475           1.1  lukem                       case _URE_AND:
   1476           1.1  lukem                         ns1 = b->expr[s1].lhs;
   1477           1.1  lukem                         ns2 = b->expr[s1].rhs;
   1478           1.1  lukem                         ns2 = _ure_make_expr(_URE_AND, ns2, s2, b);
   1479           1.1  lukem                         state = _ure_make_expr(_URE_AND, ns1, ns2, b);
   1480           1.1  lukem                         break;
   1481           1.1  lukem                     }
   1482           1.1  lukem                 }
   1483           1.1  lukem             }
   1484           1.1  lukem         }
   1485           1.1  lukem 
   1486           1.1  lukem         /*
   1487           1.1  lukem          * Clear the state stack.
   1488           1.1  lukem          */
   1489           1.1  lukem         while (_ure_pop(b) != _URE_NOOP) ;
   1490           1.1  lukem 
   1491           1.1  lukem         /*
   1492           1.1  lukem          * Reset the state pointer because the reduction may have moved it
   1493           1.1  lukem          * during a reallocation.
   1494           1.1  lukem          */
   1495           1.1  lukem         sp = b->states.states + i;
   1496           1.1  lukem 
   1497           1.1  lukem         /*
   1498           1.1  lukem          * Generate the DFA states for the symbols collected during the
   1499           1.1  lukem          * current reduction.
   1500           1.1  lukem          */
   1501           1.1  lukem         if (sp->trans_used + syms > sp->trans_size) {
   1502           1.1  lukem             if (sp->trans_size == 0)
   1503           1.1  lukem               sp->trans = (_ure_elt_t *)
   1504           1.1  lukem                   malloc(sizeof(_ure_elt_t) * (sp->trans_used + syms));
   1505           1.1  lukem             else
   1506           1.1  lukem               sp->trans = (_ure_elt_t *)
   1507           1.1  lukem                   realloc((char *) sp->trans,
   1508           1.1  lukem                           sizeof(_ure_elt_t) * (sp->trans_used + syms));
   1509           1.1  lukem             sp->trans_size = sp->trans_used + syms;
   1510           1.1  lukem         }
   1511           1.1  lukem 
   1512           1.1  lukem         /*
   1513           1.1  lukem          * Go through the symbol table and generate the DFA state transitions
   1514           1.1  lukem          * for each symbol that has collected NFA states.
   1515           1.1  lukem          */
   1516           1.1  lukem         for (j = syms = 0, smp = b->symtab; j < b->symtab_used; j++, smp++) {
   1517           1.1  lukem             sp = b->states.states + i;
   1518           1.1  lukem 
   1519           1.1  lukem             if (smp->states.slist_used > 0) {
   1520           1.1  lukem                 sp->trans[syms].lhs = smp->id;
   1521           1.1  lukem                 rhs = _ure_add_state(smp->states.slist_used,
   1522           1.1  lukem                                      smp->states.slist, b);
   1523           1.1  lukem                 /*
   1524           1.1  lukem                  * Reset the state pointer in case the reallocation moves it
   1525           1.1  lukem                  * in memory.
   1526           1.1  lukem                  */
   1527           1.1  lukem                 sp = b->states.states + i;
   1528           1.1  lukem                 sp->trans[syms].rhs = rhs;
   1529           1.1  lukem 
   1530           1.1  lukem                 smp->states.slist_used = 0;
   1531           1.1  lukem                 syms++;
   1532           1.1  lukem             }
   1533           1.1  lukem         }
   1534           1.1  lukem 
   1535           1.1  lukem         /*
   1536           1.1  lukem          * Set the number of transitions actually used.
   1537           1.1  lukem          */
   1538           1.1  lukem         sp->trans_used = syms;
   1539           1.1  lukem     }
   1540           1.1  lukem     b->reducing = 0;
   1541           1.1  lukem }
   1542           1.1  lukem 
   1543           1.1  lukem static void
   1544           1.1  lukem _ure_add_equiv(ucs2_t l, ucs2_t r, _ure_buffer_t *b)
   1545           1.1  lukem {
   1546           1.1  lukem     ucs2_t tmp;
   1547           1.1  lukem 
   1548           1.1  lukem     l = b->states.states[l].id;
   1549           1.1  lukem     r = b->states.states[r].id;
   1550           1.1  lukem 
   1551           1.1  lukem     if (l == r)
   1552           1.1  lukem       return;
   1553           1.1  lukem 
   1554           1.1  lukem     if (l > r) {
   1555           1.1  lukem         tmp = l;
   1556           1.1  lukem         l = r;
   1557           1.1  lukem         r = tmp;
   1558           1.1  lukem     }
   1559           1.1  lukem 
   1560           1.1  lukem     /*
   1561           1.1  lukem      * Check to see if the equivalence pair already exists.
   1562           1.1  lukem      */
   1563           1.1  lukem     for (tmp = 0; tmp < b->equiv_used &&
   1564           1.1  lukem              (b->equiv[tmp].l != l || b->equiv[tmp].r != r);
   1565           1.1  lukem          tmp++) ;
   1566           1.1  lukem 
   1567           1.1  lukem     if (tmp < b->equiv_used)
   1568           1.1  lukem       return;
   1569           1.1  lukem 
   1570           1.1  lukem     if (b->equiv_used == b->equiv_size) {
   1571           1.1  lukem         if (b->equiv_size == 0)
   1572           1.1  lukem           b->equiv = (_ure_equiv_t *) malloc(sizeof(_ure_equiv_t) << 3);
   1573           1.1  lukem         else
   1574           1.1  lukem           b->equiv = (_ure_equiv_t *) realloc((char *) b->equiv,
   1575           1.1  lukem                                               sizeof(_ure_equiv_t) *
   1576           1.1  lukem                                               (b->equiv_size + 8));
   1577           1.1  lukem         b->equiv_size += 8;
   1578           1.1  lukem     }
   1579           1.1  lukem     b->equiv[b->equiv_used].l = l;
   1580           1.1  lukem     b->equiv[b->equiv_used].r = r;
   1581           1.1  lukem     b->equiv_used++;
   1582           1.1  lukem }
   1583           1.1  lukem 
   1584           1.1  lukem /*
   1585           1.1  lukem  * Merge the DFA states that are equivalent.
   1586           1.1  lukem  */
   1587           1.1  lukem static void
   1588           1.1  lukem _ure_merge_equiv(_ure_buffer_t *b)
   1589           1.1  lukem {
   1590           1.1  lukem     ucs2_t i, j, k, eq, done;
   1591           1.1  lukem     _ure_state_t *sp1, *sp2, *ls, *rs;
   1592           1.1  lukem 
   1593           1.1  lukem     for (i = 0; i < b->states.states_used; i++) {
   1594           1.1  lukem         sp1 = b->states.states + i;
   1595           1.1  lukem         if (sp1->id != i)
   1596           1.1  lukem           continue;
   1597           1.1  lukem         for (j = 0; j < i; j++) {
   1598           1.1  lukem             sp2 = b->states.states + j;
   1599           1.1  lukem             if (sp2->id != j)
   1600           1.1  lukem               continue;
   1601           1.1  lukem             b->equiv_used = 0;
   1602           1.1  lukem             _ure_add_equiv(i, j, b);
   1603           1.1  lukem             for (eq = 0, done = 0; eq < b->equiv_used; eq++) {
   1604           1.1  lukem                 ls = b->states.states + b->equiv[eq].l;
   1605           1.1  lukem                 rs = b->states.states + b->equiv[eq].r;
   1606           1.1  lukem                 if (ls->accepting != rs->accepting ||
   1607           1.1  lukem                     ls->trans_used != rs->trans_used) {
   1608           1.1  lukem                     done = 1;
   1609           1.1  lukem                     break;
   1610           1.1  lukem                 }
   1611           1.1  lukem                 for (k = 0; k < ls->trans_used &&
   1612           1.1  lukem                          ls->trans[k].lhs == rs->trans[k].lhs; k++) ;
   1613           1.1  lukem                 if (k < ls->trans_used) {
   1614           1.1  lukem                     done = 1;
   1615           1.1  lukem                     break;
   1616           1.1  lukem                 }
   1617           1.1  lukem 
   1618           1.1  lukem                 for (k = 0; k < ls->trans_used; k++)
   1619           1.1  lukem                   _ure_add_equiv(ls->trans[k].rhs, rs->trans[k].rhs, b);
   1620           1.1  lukem             }
   1621           1.1  lukem             if (done == 0)
   1622           1.1  lukem               break;
   1623           1.1  lukem         }
   1624           1.1  lukem         for (eq = 0; j < i && eq < b->equiv_used; eq++)
   1625           1.1  lukem           b->states.states[b->equiv[eq].r].id =
   1626           1.1  lukem               b->states.states[b->equiv[eq].l].id;
   1627           1.1  lukem     }
   1628           1.1  lukem 
   1629           1.1  lukem     /*
   1630           1.1  lukem      * Renumber the states appropriately.
   1631           1.1  lukem      */
   1632           1.1  lukem     for (i = eq = 0, sp1 = b->states.states; i < b->states.states_used;
   1633           1.1  lukem          sp1++, i++)
   1634           1.1  lukem       sp1->id = (sp1->id == i) ? eq++ : b->states.states[sp1->id].id;
   1635           1.1  lukem }
   1636           1.1  lukem 
   1637           1.1  lukem /*************************************************************************
   1638           1.1  lukem  *
   1639           1.1  lukem  * API.
   1640           1.1  lukem  *
   1641           1.1  lukem  *************************************************************************/
   1642           1.1  lukem 
   1643           1.1  lukem ure_buffer_t
   1644           1.1  lukem ure_buffer_create(void)
   1645           1.1  lukem {
   1646           1.1  lukem     ure_buffer_t b;
   1647           1.1  lukem 
   1648           1.1  lukem     b = (ure_buffer_t) calloc(1, sizeof(_ure_buffer_t));
   1649           1.1  lukem 
   1650           1.1  lukem     return b;
   1651           1.1  lukem }
   1652           1.1  lukem 
   1653           1.1  lukem void
   1654           1.1  lukem ure_buffer_free(ure_buffer_t buf)
   1655           1.1  lukem {
   1656           1.1  lukem     unsigned long i;
   1657           1.1  lukem 
   1658           1.1  lukem     if (buf == 0)
   1659           1.1  lukem       return;
   1660           1.1  lukem 
   1661           1.1  lukem     if (buf->stack.slist_size > 0)
   1662           1.1  lukem       free((char *) buf->stack.slist);
   1663           1.1  lukem 
   1664           1.1  lukem     if (buf->expr_size > 0)
   1665           1.1  lukem       free((char *) buf->expr);
   1666           1.1  lukem 
   1667           1.1  lukem     for (i = 0; i < buf->symtab_size; i++) {
   1668           1.1  lukem         if (buf->symtab[i].states.slist_size > 0)
   1669           1.1  lukem           free((char *) buf->symtab[i].states.slist);
   1670           1.1  lukem     }
   1671           1.1  lukem 
   1672           1.1  lukem     if (buf->symtab_size > 0)
   1673           1.1  lukem       free((char *) buf->symtab);
   1674           1.1  lukem 
   1675           1.1  lukem     for (i = 0; i < buf->states.states_size; i++) {
   1676           1.1  lukem         if (buf->states.states[i].trans_size > 0)
   1677           1.1  lukem           free((char *) buf->states.states[i].trans);
   1678           1.1  lukem         if (buf->states.states[i].st.slist_size > 0)
   1679           1.1  lukem           free((char *) buf->states.states[i].st.slist);
   1680           1.1  lukem     }
   1681           1.1  lukem 
   1682           1.1  lukem     if (buf->states.states_size > 0)
   1683           1.1  lukem       free((char *) buf->states.states);
   1684           1.1  lukem 
   1685           1.1  lukem     if (buf->equiv_size > 0)
   1686           1.1  lukem       free((char *) buf->equiv);
   1687           1.1  lukem 
   1688           1.1  lukem     free((char *) buf);
   1689           1.1  lukem }
   1690           1.1  lukem 
   1691           1.1  lukem ure_dfa_t
   1692           1.1  lukem ure_compile(ucs2_t *re, unsigned long relen, int casefold, ure_buffer_t buf)
   1693           1.1  lukem {
   1694           1.1  lukem     ucs2_t i, j, state;
   1695           1.1  lukem     _ure_state_t *sp;
   1696           1.1  lukem     _ure_dstate_t *dsp;
   1697           1.1  lukem     _ure_trans_t *tp;
   1698           1.1  lukem     ure_dfa_t dfa;
   1699           1.1  lukem 
   1700           1.1  lukem     if (re == 0 || *re == 0 || relen == 0 || buf == 0)
   1701           1.1  lukem       return 0;
   1702           1.1  lukem 
   1703           1.1  lukem     /*
   1704           1.1  lukem      * Reset the various fields of the compilation buffer.  Default the flags
   1705           1.1  lukem      * to indicate the presense of the "^$" pattern.  If any other pattern
   1706           1.1  lukem      * occurs, then this flag will be removed.  This is done to catch this
   1707           1.1  lukem      * special pattern and handle it specially when matching.
   1708           1.1  lukem      */
   1709           1.1  lukem     buf->flags = _URE_DFA_BLANKLINE | ((casefold) ? _URE_DFA_CASEFOLD : 0);
   1710           1.1  lukem     buf->reducing = 0;
   1711           1.1  lukem     buf->stack.slist_used = 0;
   1712           1.1  lukem     buf->expr_used = 0;
   1713           1.1  lukem 
   1714           1.1  lukem     for (i = 0; i < buf->symtab_used; i++)
   1715           1.1  lukem       buf->symtab[i].states.slist_used = 0;
   1716           1.1  lukem     buf->symtab_used = 0;
   1717           1.1  lukem 
   1718           1.1  lukem     for (i = 0; i < buf->states.states_used; i++) {
   1719           1.1  lukem         buf->states.states[i].st.slist_used = 0;
   1720           1.1  lukem         buf->states.states[i].trans_used = 0;
   1721           1.1  lukem     }
   1722           1.1  lukem     buf->states.states_used = 0;
   1723           1.1  lukem 
   1724           1.1  lukem     /*
   1725           1.1  lukem      * Construct the NFA.  If this stage returns a 0, then an error occured or
   1726           1.1  lukem      * an empty expression was passed.
   1727           1.1  lukem      */
   1728           1.1  lukem     if ((state = _ure_re2nfa(re, relen, buf)) == _URE_NOOP)
   1729           1.1  lukem       return 0;
   1730           1.1  lukem 
   1731           1.1  lukem     /*
   1732           1.1  lukem      * Do the expression reduction to get the initial DFA.
   1733           1.1  lukem      */
   1734           1.1  lukem     _ure_reduce(state, buf);
   1735           1.1  lukem 
   1736           1.1  lukem     /*
   1737           1.1  lukem      * Merge all the equivalent DFA states.
   1738           1.1  lukem      */
   1739           1.1  lukem     _ure_merge_equiv(buf);
   1740           1.1  lukem 
   1741           1.1  lukem     /*
   1742           1.1  lukem      * Construct the minimal DFA.
   1743           1.1  lukem      */
   1744           1.1  lukem     dfa = (ure_dfa_t) malloc(sizeof(_ure_dfa_t));
   1745           1.1  lukem     (void) memset((char *) dfa, '\0', sizeof(_ure_dfa_t));
   1746           1.1  lukem 
   1747           1.1  lukem     dfa->flags = buf->flags & (_URE_DFA_CASEFOLD|_URE_DFA_BLANKLINE);
   1748           1.1  lukem 
   1749           1.1  lukem     /*
   1750           1.1  lukem      * Free up the NFA state groups and transfer the symbols from the buffer
   1751           1.1  lukem      * to the DFA.
   1752           1.1  lukem      */
   1753           1.1  lukem     for (i = 0; i < buf->symtab_size; i++) {
   1754           1.1  lukem         if (buf->symtab[i].states.slist_size > 0)
   1755           1.1  lukem           free((char *) buf->symtab[i].states.slist);
   1756           1.1  lukem     }
   1757           1.1  lukem     dfa->syms = buf->symtab;
   1758           1.1  lukem     dfa->nsyms = buf->symtab_used;
   1759           1.1  lukem 
   1760           1.1  lukem     buf->symtab_used = buf->symtab_size = 0;
   1761           1.1  lukem 
   1762           1.1  lukem     /*
   1763           1.1  lukem      * Collect the total number of states and transitions needed for the DFA.
   1764           1.1  lukem      */
   1765           1.1  lukem     for (i = state = 0, sp = buf->states.states; i < buf->states.states_used;
   1766           1.1  lukem          i++, sp++) {
   1767           1.1  lukem         if (sp->id == state) {
   1768           1.1  lukem             dfa->nstates++;
   1769           1.1  lukem             dfa->ntrans += sp->trans_used;
   1770           1.1  lukem             state++;
   1771           1.1  lukem         }
   1772           1.1  lukem     }
   1773           1.1  lukem 
   1774           1.1  lukem     /*
   1775           1.1  lukem      * Allocate enough space for the states and transitions.
   1776           1.1  lukem      */
   1777           1.1  lukem     dfa->states = (_ure_dstate_t *) malloc(sizeof(_ure_dstate_t) *
   1778           1.1  lukem                                            dfa->nstates);
   1779           1.1  lukem     dfa->trans = (_ure_trans_t *) malloc(sizeof(_ure_trans_t) * dfa->ntrans);
   1780           1.1  lukem 
   1781           1.1  lukem     /*
   1782           1.1  lukem      * Actually transfer the DFA states from the buffer.
   1783           1.1  lukem      */
   1784           1.1  lukem     dsp = dfa->states;
   1785           1.1  lukem     tp = dfa->trans;
   1786           1.1  lukem     for (i = state = 0, sp = buf->states.states; i < buf->states.states_used;
   1787           1.1  lukem          i++, sp++) {
   1788           1.1  lukem         if (sp->id == state) {
   1789           1.1  lukem             dsp->trans = tp;
   1790           1.1  lukem             dsp->ntrans = sp->trans_used;
   1791           1.1  lukem             dsp->accepting = sp->accepting;
   1792           1.1  lukem 
   1793           1.1  lukem             /*
   1794           1.1  lukem              * Add the transitions for the state.
   1795           1.1  lukem              */
   1796           1.1  lukem             for (j = 0; j < dsp->ntrans; j++, tp++) {
   1797           1.1  lukem                 tp->symbol = sp->trans[j].lhs;
   1798           1.1  lukem                 tp->next_state = buf->states.states[sp->trans[j].rhs].id;
   1799           1.1  lukem             }
   1800           1.1  lukem 
   1801           1.1  lukem             dsp++;
   1802           1.1  lukem             state++;
   1803           1.1  lukem         }
   1804           1.1  lukem     }
   1805           1.1  lukem 
   1806           1.1  lukem     return dfa;
   1807           1.1  lukem }
   1808           1.1  lukem 
   1809           1.1  lukem void
   1810           1.1  lukem ure_dfa_free(ure_dfa_t dfa)
   1811           1.1  lukem {
   1812           1.1  lukem     ucs2_t i;
   1813           1.1  lukem 
   1814           1.1  lukem     if (dfa == 0)
   1815           1.1  lukem       return;
   1816           1.1  lukem 
   1817           1.1  lukem     for (i = 0; i < dfa->nsyms; i++) {
   1818           1.1  lukem         if ((dfa->syms[i].type == _URE_CCLASS ||
   1819           1.1  lukem              dfa->syms[i].type == _URE_NCCLASS) &&
   1820           1.1  lukem             dfa->syms[i].sym.ccl.ranges_size > 0)
   1821           1.1  lukem           free((char *) dfa->syms[i].sym.ccl.ranges);
   1822           1.1  lukem     }
   1823           1.1  lukem     if (dfa->nsyms > 0)
   1824           1.1  lukem       free((char *) dfa->syms);
   1825           1.1  lukem 
   1826           1.1  lukem     if (dfa->nstates > 0)
   1827           1.1  lukem       free((char *) dfa->states);
   1828           1.1  lukem     if (dfa->ntrans > 0)
   1829           1.1  lukem       free((char *) dfa->trans);
   1830           1.1  lukem     free((char *) dfa);
   1831           1.1  lukem }
   1832           1.1  lukem 
   1833           1.1  lukem void
   1834           1.1  lukem ure_write_dfa(ure_dfa_t dfa, FILE *out)
   1835           1.1  lukem {
   1836           1.1  lukem     ucs2_t i, j, k, h, l;
   1837           1.1  lukem     _ure_dstate_t *sp;
   1838           1.1  lukem     _ure_symtab_t *sym;
   1839           1.1  lukem     _ure_range_t *rp;
   1840           1.1  lukem 
   1841           1.1  lukem     if (dfa == 0 || out == 0)
   1842           1.1  lukem       return;
   1843           1.1  lukem 
   1844           1.1  lukem     /*
   1845           1.1  lukem      * Write all the different character classes.
   1846           1.1  lukem      */
   1847           1.1  lukem     for (i = 0, sym = dfa->syms; i < dfa->nsyms; i++, sym++) {
   1848           1.1  lukem         if (sym->type == _URE_CCLASS || sym->type == _URE_NCCLASS) {
   1849           1.1  lukem             fprintf(out, "C%hd = ", sym->id);
   1850           1.1  lukem             if (sym->sym.ccl.ranges_used > 0) {
   1851           1.1  lukem                 putc('[', out);
   1852           1.1  lukem                 if (sym->type == _URE_NCCLASS)
   1853           1.1  lukem                   putc('^', out);
   1854           1.1  lukem             }
   1855           1.1  lukem             if (sym->props != 0) {
   1856           1.1  lukem                 if (sym->type == _URE_NCCLASS)
   1857           1.1  lukem                   fprintf(out, "\\P");
   1858           1.1  lukem                 else
   1859           1.1  lukem                   fprintf(out, "\\p");
   1860           1.1  lukem                 for (k = h = 0; k < 32; k++) {
   1861           1.1  lukem                     if (sym->props & (1 << k)) {
   1862           1.1  lukem                         if (h != 0)
   1863           1.1  lukem                           putc(',', out);
   1864           1.1  lukem                         fprintf(out, "%hd", k + 1);
   1865           1.1  lukem                         h = 1;
   1866           1.1  lukem                     }
   1867           1.1  lukem                 }
   1868           1.1  lukem             }
   1869           1.1  lukem             /*
   1870           1.1  lukem              * Dump the ranges.
   1871           1.1  lukem              */
   1872           1.1  lukem             for (k = 0, rp = sym->sym.ccl.ranges;
   1873           1.1  lukem                  k < sym->sym.ccl.ranges_used; k++, rp++) {
   1874           1.1  lukem                 /*
   1875           1.1  lukem                  * Check for UTF16 characters.
   1876           1.1  lukem                  */
   1877           1.1  lukem                 if (0x10000 <= rp->min_code &&
   1878           1.1  lukem                     rp->min_code <= 0x10ffff) {
   1879           1.1  lukem                     h = (ucs2_t) (((rp->min_code - 0x10000) >> 10) + 0xd800);
   1880           1.1  lukem                     l = (ucs2_t) (((rp->min_code - 0x10000) & 1023) + 0xdc00);
   1881           1.1  lukem                     fprintf(out, "\\x%04hX\\x%04hX", h, l);
   1882           1.1  lukem                 } else
   1883           1.1  lukem                   fprintf(out, "\\x%04lX", rp->min_code & 0xffff);
   1884           1.1  lukem                 if (rp->max_code != rp->min_code) {
   1885           1.1  lukem                     putc('-', out);
   1886           1.1  lukem                     if (rp->max_code >= 0x10000 &&
   1887           1.1  lukem                         rp->max_code <= 0x10ffff) {
   1888           1.1  lukem                         h = (ucs2_t) (((rp->max_code - 0x10000) >> 10) + 0xd800);
   1889           1.1  lukem                         l = (ucs2_t) (((rp->max_code - 0x10000) & 1023) + 0xdc00);
   1890           1.1  lukem                         fprintf(out, "\\x%04hX\\x%04hX", h, l);
   1891           1.1  lukem                     } else
   1892           1.1  lukem                       fprintf(out, "\\x%04lX", rp->max_code & 0xffff);
   1893           1.1  lukem                 }
   1894           1.1  lukem             }
   1895           1.1  lukem             if (sym->sym.ccl.ranges_used > 0)
   1896           1.1  lukem               putc(']', out);
   1897           1.1  lukem             putc('\n', out);
   1898           1.1  lukem         }
   1899           1.1  lukem     }
   1900           1.1  lukem 
   1901           1.1  lukem     for (i = 0, sp = dfa->states; i < dfa->nstates; i++, sp++) {
   1902           1.1  lukem         fprintf(out, "S%hd = ", i);
   1903           1.1  lukem         if (sp->accepting) {
   1904           1.1  lukem             fprintf(out, "1 ");
   1905           1.1  lukem             if (sp->ntrans)
   1906           1.1  lukem               fprintf(out, "| ");
   1907           1.1  lukem         }
   1908           1.1  lukem         for (j = 0; j < sp->ntrans; j++) {
   1909           1.1  lukem             if (j > 0)
   1910           1.1  lukem               fprintf(out, "| ");
   1911           1.1  lukem 
   1912           1.1  lukem             sym = dfa->syms + sp->trans[j].symbol;
   1913           1.1  lukem             switch (sym->type) {
   1914           1.1  lukem               case _URE_CHAR:
   1915           1.1  lukem                 if (0x10000 <= sym->sym.chr && sym->sym.chr <= 0x10ffff) {
   1916           1.1  lukem                     /*
   1917           1.1  lukem                      * Take care of UTF16 characters.
   1918           1.1  lukem                      */
   1919           1.1  lukem                     h = (ucs2_t) (((sym->sym.chr - 0x10000) >> 10) + 0xd800);
   1920           1.1  lukem                     l = (ucs2_t) (((sym->sym.chr - 0x10000) & 1023) + 0xdc00);
   1921           1.1  lukem                     fprintf(out, "\\x%04hX\\x%04hX ", h, l);
   1922           1.1  lukem                 } else
   1923           1.1  lukem                   fprintf(out, "\\x%04lX ", sym->sym.chr & 0xffff);
   1924           1.1  lukem                 break;
   1925           1.1  lukem               case _URE_ANY_CHAR:
   1926           1.1  lukem                 fprintf(out, "<any> ");
   1927           1.1  lukem                 break;
   1928           1.1  lukem               case _URE_BOL_ANCHOR:
   1929           1.1  lukem                 fprintf(out, "<bol-anchor> ");
   1930           1.1  lukem                 break;
   1931           1.1  lukem               case _URE_EOL_ANCHOR:
   1932           1.1  lukem                 fprintf(out, "<eol-anchor> ");
   1933           1.1  lukem                 break;
   1934           1.1  lukem               case _URE_CCLASS:
   1935           1.1  lukem               case _URE_NCCLASS:
   1936           1.1  lukem                 fprintf(out, "[C%hd] ", sym->id);
   1937           1.1  lukem                 break;
   1938           1.1  lukem             }
   1939           1.1  lukem             fprintf(out, "S%hd", sp->trans[j].next_state);
   1940           1.1  lukem             if (j + 1 < sp->ntrans)
   1941           1.1  lukem               putc(' ', out);
   1942           1.1  lukem         }
   1943           1.1  lukem         putc('\n', out);
   1944           1.1  lukem     }
   1945           1.1  lukem }
   1946           1.1  lukem 
   1947           1.1  lukem #define _ure_issep(cc) ((cc) == '\n' || (cc) == '\r' || (cc) == 0x2028 ||\
   1948           1.1  lukem                         (cc) == 0x2029)
   1949           1.1  lukem 
   1950           1.1  lukem int
   1951           1.1  lukem ure_exec(ure_dfa_t dfa, int flags, ucs2_t *text, unsigned long textlen,
   1952           1.1  lukem          unsigned long *match_start, unsigned long *match_end)
   1953           1.1  lukem {
   1954           1.1  lukem     int i, j, matched, found, skip;
   1955           1.1  lukem     unsigned long ms, me;
   1956           1.1  lukem     ucs4_t c;
   1957           1.1  lukem     ucs2_t *sp, *ep, *lp;
   1958           1.1  lukem     _ure_dstate_t *stp;
   1959           1.1  lukem     _ure_symtab_t *sym;
   1960           1.1  lukem     _ure_range_t *rp;
   1961           1.1  lukem 
   1962           1.1  lukem     if (dfa == 0 || text == 0)
   1963           1.1  lukem       return 0;
   1964           1.1  lukem 
   1965           1.1  lukem     /*
   1966           1.1  lukem      * Handle the special case of an empty string matching the "^$" pattern.
   1967           1.1  lukem      */
   1968           1.1  lukem     if (textlen == 0 && (dfa->flags & _URE_DFA_BLANKLINE)) {
   1969           1.1  lukem         *match_start = *match_end = 0;
   1970           1.1  lukem         return 1;
   1971           1.1  lukem     }
   1972           1.1  lukem 
   1973           1.1  lukem     sp = text;
   1974           1.1  lukem     ep = sp + textlen;
   1975           1.1  lukem 
   1976           1.1  lukem     ms = me = ~0;
   1977           1.1  lukem 
   1978           1.1  lukem     stp = dfa->states;
   1979           1.1  lukem 
   1980           1.1  lukem     for (found = skip = 0; found == 0 && sp < ep; ) {
   1981           1.1  lukem         lp = sp;
   1982           1.1  lukem         c = *sp++;
   1983           1.1  lukem 
   1984           1.1  lukem         /*
   1985           1.1  lukem          * Check to see if this is a high surrogate that should be
   1986           1.1  lukem          * combined with a following low surrogate.
   1987           1.1  lukem          */
   1988           1.1  lukem         if (sp < ep && 0xd800 <= c && c <= 0xdbff &&
   1989           1.1  lukem             0xdc00 <= *sp && *sp <= 0xdfff)
   1990           1.1  lukem           c = 0x10000 + (((c & 0x03ff) << 10) | (*sp++ & 0x03ff));
   1991           1.1  lukem 
   1992           1.1  lukem         /*
   1993           1.1  lukem          * Determine if the character is non-spacing and should be skipped.
   1994           1.1  lukem          */
   1995           1.1  lukem         if (_ure_matches_properties(_URE_NONSPACING, c) &&
   1996           1.1  lukem             (flags & URE_IGNORE_NONSPACING)) {
   1997           1.1  lukem             sp++;
   1998           1.1  lukem             continue;
   1999           1.1  lukem         }
   2000           1.1  lukem 
   2001           1.1  lukem         if (dfa->flags & _URE_DFA_CASEFOLD)
   2002           1.1  lukem           c = _ure_tolower(c);
   2003           1.1  lukem 
   2004           1.1  lukem         /*
   2005           1.1  lukem          * See if one of the transitions matches.
   2006           1.1  lukem          */
   2007           1.1  lukem         for (i = 0, matched = 0; matched == 0 && i < stp->ntrans; i++) {
   2008           1.1  lukem             sym = dfa->syms + stp->trans[i].symbol;
   2009           1.1  lukem             switch (sym->type) {
   2010           1.1  lukem               case _URE_ANY_CHAR:
   2011           1.1  lukem                 if ((flags & URE_DOT_MATCHES_SEPARATORS) ||
   2012           1.1  lukem                     !_ure_issep(c))
   2013           1.1  lukem                   matched = 1;
   2014           1.1  lukem                 break;
   2015           1.1  lukem               case _URE_CHAR:
   2016           1.1  lukem                 if (c == sym->sym.chr)
   2017           1.1  lukem                   matched = 1;
   2018           1.1  lukem                 break;
   2019           1.1  lukem               case _URE_BOL_ANCHOR:
   2020           1.1  lukem                 if (lp == text) {
   2021           1.1  lukem                     sp = lp;
   2022           1.1  lukem                     matched = 1;
   2023           1.1  lukem                 } else if (_ure_issep(c)) {
   2024           1.1  lukem                     if (c == '\r' && sp < ep && *sp == '\n')
   2025           1.1  lukem                       sp++;
   2026           1.1  lukem                     lp = sp;
   2027           1.1  lukem                     matched = 1;
   2028           1.1  lukem                 }
   2029           1.1  lukem                 break;
   2030           1.1  lukem               case _URE_EOL_ANCHOR:
   2031           1.1  lukem                 if (_ure_issep(c)) {
   2032           1.1  lukem                     /*
   2033           1.1  lukem                      * Put the pointer back before the separator so the match
   2034           1.1  lukem                      * end position will be correct.  This case will also
   2035           1.1  lukem                      * cause the `sp' pointer to be advanced over the current
   2036           1.1  lukem                      * separator once the match end point has been recorded.
   2037           1.1  lukem                      */
   2038           1.1  lukem                     sp = lp;
   2039           1.1  lukem                     matched = 1;
   2040           1.1  lukem                 }
   2041           1.1  lukem                 break;
   2042           1.1  lukem               case _URE_CCLASS:
   2043           1.1  lukem               case _URE_NCCLASS:
   2044           1.1  lukem                 if (sym->props != 0)
   2045           1.1  lukem                   matched = _ure_matches_properties(sym->props, c);
   2046           1.1  lukem                 for (j = 0, rp = sym->sym.ccl.ranges;
   2047           1.1  lukem                      j < sym->sym.ccl.ranges_used; j++, rp++) {
   2048           1.1  lukem                     if (rp->min_code <= c && c <= rp->max_code)
   2049           1.1  lukem                       matched = 1;
   2050           1.1  lukem                 }
   2051           1.1  lukem                 if (sym->type == _URE_NCCLASS)
   2052           1.1  lukem                   matched = !matched;
   2053           1.1  lukem                 break;
   2054           1.1  lukem             }
   2055           1.1  lukem 
   2056           1.1  lukem             if (matched) {
   2057           1.1  lukem                 if (ms == ~0UL)
   2058           1.1  lukem                   ms = lp - text;
   2059           1.1  lukem                 else
   2060           1.1  lukem                   me = sp - text;
   2061           1.1  lukem                 stp = dfa->states + stp->trans[i].next_state;
   2062           1.1  lukem 
   2063           1.1  lukem                 /*
   2064           1.1  lukem                  * If the match was an EOL anchor, adjust the pointer past the
   2065           1.1  lukem                  * separator that caused the match.  The correct match
   2066           1.1  lukem                  * position has been recorded already.
   2067           1.1  lukem                  */
   2068           1.1  lukem                 if (sym->type == _URE_EOL_ANCHOR) {
   2069           1.1  lukem                     /*
   2070           1.1  lukem                      * Skip the character that caused the match.
   2071           1.1  lukem                      */
   2072           1.1  lukem                     sp++;
   2073           1.1  lukem 
   2074           1.1  lukem                     /*
   2075           1.1  lukem                      * Handle the infamous CRLF situation.
   2076           1.1  lukem                      */
   2077           1.1  lukem                     if (sp < ep && c == '\r' && *sp == '\n')
   2078           1.1  lukem                       sp++;
   2079           1.1  lukem                 }
   2080           1.1  lukem             }
   2081           1.1  lukem         }
   2082           1.1  lukem 
   2083           1.1  lukem         if (matched == 0) {
   2084           1.1  lukem             if (stp->accepting == 0) {
   2085           1.1  lukem                 /*
   2086           1.1  lukem                  * If the last state was not accepting, then reset
   2087           1.1  lukem                  * and start over.
   2088           1.1  lukem                  */
   2089           1.1  lukem                 stp = dfa->states;
   2090           1.1  lukem                 ms = me = ~0;
   2091           1.1  lukem             } else
   2092           1.1  lukem               /*
   2093           1.1  lukem                * The last state was accepting, so terminate the matching
   2094           1.1  lukem                * loop to avoid more work.
   2095           1.1  lukem                */
   2096           1.1  lukem               found = 1;
   2097           1.1  lukem         } else if (sp == ep) {
   2098           1.1  lukem             if (!stp->accepting) {
   2099           1.1  lukem                 /*
   2100           1.1  lukem                  * This ugly hack is to make sure the end-of-line anchors
   2101           1.1  lukem                  * match when the source text hits the end.  This is only done
   2102           1.1  lukem                  * if the last subexpression matches.
   2103           1.1  lukem                  */
   2104           1.1  lukem                 for (i = 0; found == 0 && i < stp->ntrans; i++) {
   2105           1.1  lukem                     sym = dfa->syms + stp->trans[i].symbol;
   2106           1.1  lukem                     if (sym->type ==_URE_EOL_ANCHOR) {
   2107           1.1  lukem                         stp = dfa->states + stp->trans[i].next_state;
   2108           1.1  lukem                         if (stp->accepting) {
   2109           1.1  lukem                             me = sp - text;
   2110           1.1  lukem                             found = 1;
   2111           1.1  lukem                         } else
   2112           1.1  lukem                           break;
   2113           1.1  lukem                     }
   2114           1.1  lukem                 }
   2115           1.1  lukem             } else {
   2116           1.1  lukem                 /*
   2117           1.1  lukem                  * Make sure any conditions that match all the way to the end
   2118           1.1  lukem                  * of the string match.
   2119           1.1  lukem                  */
   2120           1.1  lukem                 found = 1;
   2121           1.1  lukem                 me = sp - text;
   2122           1.1  lukem             }
   2123           1.1  lukem         }
   2124           1.1  lukem     }
   2125           1.1  lukem 
   2126           1.1  lukem     if (found == 0)
   2127           1.1  lukem       ms = me = ~0;
   2128           1.1  lukem 
   2129           1.1  lukem     *match_start = ms;
   2130           1.1  lukem     *match_end = me;
   2131           1.1  lukem 
   2132           1.1  lukem     return (ms != ~0UL) ? 1 : 0;
   2133           1.1  lukem }
   2134