Home | History | Annotate | Line # | Download | only in utbm
utbm.c revision 1.1.1.1.6.2
      1  1.1.1.1.6.2  wrstuden /* $OpenLDAP: pkg/ldap/libraries/liblunicode/utbm/utbm.c,v 1.7.2.3 2008/02/11 23:26:42 kurt Exp $ */
      2  1.1.1.1.6.2  wrstuden /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
      3  1.1.1.1.6.2  wrstuden  *
      4  1.1.1.1.6.2  wrstuden  * Copyright 1998-2008 The OpenLDAP Foundation.
      5  1.1.1.1.6.2  wrstuden  * All rights reserved.
      6  1.1.1.1.6.2  wrstuden  *
      7  1.1.1.1.6.2  wrstuden  * Redistribution and use in source and binary forms, with or without
      8  1.1.1.1.6.2  wrstuden  * modification, are permitted only as authorized by the OpenLDAP
      9  1.1.1.1.6.2  wrstuden  * Public License.
     10  1.1.1.1.6.2  wrstuden  *
     11  1.1.1.1.6.2  wrstuden  * A copy of this license is available in file LICENSE in the
     12  1.1.1.1.6.2  wrstuden  * top-level directory of the distribution or, alternatively, at
     13  1.1.1.1.6.2  wrstuden  * <http://www.OpenLDAP.org/license.html>.
     14  1.1.1.1.6.2  wrstuden  */
     15  1.1.1.1.6.2  wrstuden /* Copyright 1997, 1998, 1999 Computing Research Labs,
     16  1.1.1.1.6.2  wrstuden  * New Mexico State University
     17  1.1.1.1.6.2  wrstuden  *
     18  1.1.1.1.6.2  wrstuden  * Permission is hereby granted, free of charge, to any person obtaining a
     19  1.1.1.1.6.2  wrstuden  * copy of this software and associated documentation files (the "Software"),
     20  1.1.1.1.6.2  wrstuden  * to deal in the Software without restriction, including without limitation
     21  1.1.1.1.6.2  wrstuden  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
     22  1.1.1.1.6.2  wrstuden  * and/or sell copies of the Software, and to permit persons to whom the
     23  1.1.1.1.6.2  wrstuden  * Software is furnished to do so, subject to the following conditions:
     24  1.1.1.1.6.2  wrstuden  *
     25  1.1.1.1.6.2  wrstuden  * The above copyright notice and this permission notice shall be included in
     26  1.1.1.1.6.2  wrstuden  * all copies or substantial portions of the Software.
     27  1.1.1.1.6.2  wrstuden  *
     28  1.1.1.1.6.2  wrstuden  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     29  1.1.1.1.6.2  wrstuden  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     30  1.1.1.1.6.2  wrstuden  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     31  1.1.1.1.6.2  wrstuden  * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
     32  1.1.1.1.6.2  wrstuden  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
     33  1.1.1.1.6.2  wrstuden  * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
     34  1.1.1.1.6.2  wrstuden  * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
     35  1.1.1.1.6.2  wrstuden  */
     36  1.1.1.1.6.2  wrstuden /* $Id: utbm.c,v 1.1.1.1.6.2 2008/09/18 05:15:03 wrstuden Exp $ */
     37  1.1.1.1.6.2  wrstuden 
     38  1.1.1.1.6.2  wrstuden /*
     39  1.1.1.1.6.2  wrstuden  * Assumptions:
     40  1.1.1.1.6.2  wrstuden  * 1. Case conversions of UTF-16 characters must also be UTF-16 characters.
     41  1.1.1.1.6.2  wrstuden  * 2. Case conversions are all one-to-one.
     42  1.1.1.1.6.2  wrstuden  * 3. Text and pattern have already been normalized in some fashion.
     43  1.1.1.1.6.2  wrstuden  */
     44  1.1.1.1.6.2  wrstuden 
     45  1.1.1.1.6.2  wrstuden #include <stdlib.h>
     46  1.1.1.1.6.2  wrstuden #include <unistd.h>
     47  1.1.1.1.6.2  wrstuden #include <string.h>
     48  1.1.1.1.6.2  wrstuden #include "utbm.h"
     49  1.1.1.1.6.2  wrstuden 
     50  1.1.1.1.6.2  wrstuden /*
     51  1.1.1.1.6.2  wrstuden  * Single pattern character.
     52  1.1.1.1.6.2  wrstuden  */
     53  1.1.1.1.6.2  wrstuden typedef struct {
     54  1.1.1.1.6.2  wrstuden     ucs4_t lc;
     55  1.1.1.1.6.2  wrstuden     ucs4_t uc;
     56  1.1.1.1.6.2  wrstuden     ucs4_t tc;
     57  1.1.1.1.6.2  wrstuden } _utbm_char_t;
     58  1.1.1.1.6.2  wrstuden 
     59  1.1.1.1.6.2  wrstuden typedef struct {
     60  1.1.1.1.6.2  wrstuden     _utbm_char_t *ch;
     61  1.1.1.1.6.2  wrstuden     unsigned long skip;
     62  1.1.1.1.6.2  wrstuden } _utbm_skip_t;
     63  1.1.1.1.6.2  wrstuden 
     64  1.1.1.1.6.2  wrstuden typedef struct _utbm_pattern_t {
     65  1.1.1.1.6.2  wrstuden     unsigned long flags;
     66  1.1.1.1.6.2  wrstuden 
     67  1.1.1.1.6.2  wrstuden     _utbm_char_t *pat;
     68  1.1.1.1.6.2  wrstuden     unsigned long pat_used;
     69  1.1.1.1.6.2  wrstuden     unsigned long pat_size;
     70  1.1.1.1.6.2  wrstuden     unsigned long patlen;
     71  1.1.1.1.6.2  wrstuden 
     72  1.1.1.1.6.2  wrstuden     _utbm_skip_t *skip;
     73  1.1.1.1.6.2  wrstuden     unsigned long skip_used;
     74  1.1.1.1.6.2  wrstuden     unsigned long skip_size;
     75  1.1.1.1.6.2  wrstuden 
     76  1.1.1.1.6.2  wrstuden     unsigned long md4;
     77  1.1.1.1.6.2  wrstuden } _utbm_pattern_t;
     78  1.1.1.1.6.2  wrstuden 
     79  1.1.1.1.6.2  wrstuden /*************************************************************************
     80  1.1.1.1.6.2  wrstuden  *
     81  1.1.1.1.6.2  wrstuden  * Support functions.
     82  1.1.1.1.6.2  wrstuden  *
     83  1.1.1.1.6.2  wrstuden  *************************************************************************/
     84  1.1.1.1.6.2  wrstuden 
     85  1.1.1.1.6.2  wrstuden /*
     86  1.1.1.1.6.2  wrstuden  * Routine to look up the skip value for a character.
     87  1.1.1.1.6.2  wrstuden  */
     88  1.1.1.1.6.2  wrstuden static unsigned long
     89  1.1.1.1.6.2  wrstuden _utbm_skip(utbm_pattern_t p, ucs2_t *start, ucs2_t *end)
     90  1.1.1.1.6.2  wrstuden {
     91  1.1.1.1.6.2  wrstuden     unsigned long i;
     92  1.1.1.1.6.2  wrstuden     ucs4_t c1, c2;
     93  1.1.1.1.6.2  wrstuden     _utbm_skip_t *sp;
     94  1.1.1.1.6.2  wrstuden 
     95  1.1.1.1.6.2  wrstuden     if (start >= end)
     96  1.1.1.1.6.2  wrstuden       return 0;
     97  1.1.1.1.6.2  wrstuden 
     98  1.1.1.1.6.2  wrstuden     c1 = *start;
     99  1.1.1.1.6.2  wrstuden     c2 = (start + 1 < end) ? *(start + 1) : ~0;
    100  1.1.1.1.6.2  wrstuden     if (0xd800 <= c1 && c1 <= 0xdbff && 0xdc00 <= c2 && c2 <= 0xdfff)
    101  1.1.1.1.6.2  wrstuden       c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
    102  1.1.1.1.6.2  wrstuden 
    103  1.1.1.1.6.2  wrstuden     for (i = 0, sp = p->skip; i < p->skip_used; i++, sp++) {
    104  1.1.1.1.6.2  wrstuden         if (!((c1 ^ sp->ch->uc) & (c1 ^ sp->ch->lc) & (c1 ^ sp->ch->tc))) {
    105  1.1.1.1.6.2  wrstuden             return ((unsigned long) (end - start) < sp->skip) ?
    106  1.1.1.1.6.2  wrstuden                 end - start : sp->skip;
    107  1.1.1.1.6.2  wrstuden         }
    108  1.1.1.1.6.2  wrstuden     }
    109  1.1.1.1.6.2  wrstuden     return p->patlen;
    110  1.1.1.1.6.2  wrstuden }
    111  1.1.1.1.6.2  wrstuden 
    112  1.1.1.1.6.2  wrstuden static int
    113  1.1.1.1.6.2  wrstuden _utbm_match(utbm_pattern_t pat, ucs2_t *text, ucs2_t *start, ucs2_t *end,
    114  1.1.1.1.6.2  wrstuden             unsigned long *match_start, unsigned long *match_end)
    115  1.1.1.1.6.2  wrstuden {
    116  1.1.1.1.6.2  wrstuden     int check_space;
    117  1.1.1.1.6.2  wrstuden     ucs4_t c1, c2;
    118  1.1.1.1.6.2  wrstuden     unsigned long count;
    119  1.1.1.1.6.2  wrstuden     _utbm_char_t *cp;
    120  1.1.1.1.6.2  wrstuden 
    121  1.1.1.1.6.2  wrstuden     /*
    122  1.1.1.1.6.2  wrstuden      * Set the potential match endpoint first.
    123  1.1.1.1.6.2  wrstuden      */
    124  1.1.1.1.6.2  wrstuden     *match_end = (start - text) + 1;
    125  1.1.1.1.6.2  wrstuden 
    126  1.1.1.1.6.2  wrstuden     c1 = *start;
    127  1.1.1.1.6.2  wrstuden     c2 = (start + 1 < end) ? *(start + 1) : ~0;
    128  1.1.1.1.6.2  wrstuden     if (0xd800 <= c1 && c1 <= 0xdbff && 0xdc00 <= c2 && c2 <= 0xdfff) {
    129  1.1.1.1.6.2  wrstuden         c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
    130  1.1.1.1.6.2  wrstuden         /*
    131  1.1.1.1.6.2  wrstuden          * Adjust the match end point to occur after the UTF-16 character.
    132  1.1.1.1.6.2  wrstuden          */
    133  1.1.1.1.6.2  wrstuden         *match_end = *match_end + 1;
    134  1.1.1.1.6.2  wrstuden     }
    135  1.1.1.1.6.2  wrstuden 
    136  1.1.1.1.6.2  wrstuden     if (pat->pat_used == 1) {
    137  1.1.1.1.6.2  wrstuden         *match_start = start - text;
    138  1.1.1.1.6.2  wrstuden         return 1;
    139  1.1.1.1.6.2  wrstuden     }
    140  1.1.1.1.6.2  wrstuden 
    141  1.1.1.1.6.2  wrstuden     /*
    142  1.1.1.1.6.2  wrstuden      * Compare backward.
    143  1.1.1.1.6.2  wrstuden      */
    144  1.1.1.1.6.2  wrstuden     cp = pat->pat + (pat->pat_used - 1);
    145  1.1.1.1.6.2  wrstuden 
    146  1.1.1.1.6.2  wrstuden     for (count = pat->patlen; start > text && count > 0;) {
    147  1.1.1.1.6.2  wrstuden         /*
    148  1.1.1.1.6.2  wrstuden          * Ignore non-spacing characters if indicated.
    149  1.1.1.1.6.2  wrstuden          */
    150  1.1.1.1.6.2  wrstuden         if (pat->flags & UTBM_IGNORE_NONSPACING) {
    151  1.1.1.1.6.2  wrstuden             while (start > text && _utbm_nonspacing(c1)) {
    152  1.1.1.1.6.2  wrstuden                 c2 = *--start;
    153  1.1.1.1.6.2  wrstuden                 c1 = (start - 1 > text) ? *(start - 1) : ~0;
    154  1.1.1.1.6.2  wrstuden                 if (0xdc00 <= c2 && c2 <= 0xdfff &&
    155  1.1.1.1.6.2  wrstuden                     0xd800 <= c1 && c1 <= 0xdbff) {
    156  1.1.1.1.6.2  wrstuden                     c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
    157  1.1.1.1.6.2  wrstuden                     start--;
    158  1.1.1.1.6.2  wrstuden                 } else
    159  1.1.1.1.6.2  wrstuden                   c1 = c2;
    160  1.1.1.1.6.2  wrstuden             }
    161  1.1.1.1.6.2  wrstuden         }
    162  1.1.1.1.6.2  wrstuden 
    163  1.1.1.1.6.2  wrstuden         /*
    164  1.1.1.1.6.2  wrstuden          * Handle space compression if indicated.
    165  1.1.1.1.6.2  wrstuden          */
    166  1.1.1.1.6.2  wrstuden         if (pat->flags & UTBM_SPACE_COMPRESS) {
    167  1.1.1.1.6.2  wrstuden             check_space = 0;
    168  1.1.1.1.6.2  wrstuden             while (start > text &&
    169  1.1.1.1.6.2  wrstuden                    (_utbm_isspace(c1, 1) || _utbm_iscntrl(c1))) {
    170  1.1.1.1.6.2  wrstuden                 check_space = _utbm_isspace(c1, 1);
    171  1.1.1.1.6.2  wrstuden                 c2 = *--start;
    172  1.1.1.1.6.2  wrstuden                 c1 = (start - 1 > text) ? *(start - 1) : ~0;
    173  1.1.1.1.6.2  wrstuden                 if (0xdc00 <= c2 && c2 <= 0xdfff &&
    174  1.1.1.1.6.2  wrstuden                     0xd800 <= c1 && c1 <= 0xdbff) {
    175  1.1.1.1.6.2  wrstuden                     c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
    176  1.1.1.1.6.2  wrstuden                     start--;
    177  1.1.1.1.6.2  wrstuden                 } else
    178  1.1.1.1.6.2  wrstuden                   c1 = c2;
    179  1.1.1.1.6.2  wrstuden             }
    180  1.1.1.1.6.2  wrstuden             /*
    181  1.1.1.1.6.2  wrstuden              * Handle things if space compression was indicated and one or
    182  1.1.1.1.6.2  wrstuden              * more member characters were found.
    183  1.1.1.1.6.2  wrstuden              */
    184  1.1.1.1.6.2  wrstuden             if (check_space) {
    185  1.1.1.1.6.2  wrstuden                 if (cp->uc != ' ')
    186  1.1.1.1.6.2  wrstuden                   return 0;
    187  1.1.1.1.6.2  wrstuden                 cp--;
    188  1.1.1.1.6.2  wrstuden                 count--;
    189  1.1.1.1.6.2  wrstuden             }
    190  1.1.1.1.6.2  wrstuden         }
    191  1.1.1.1.6.2  wrstuden 
    192  1.1.1.1.6.2  wrstuden         /*
    193  1.1.1.1.6.2  wrstuden          * Handle the normal comparison cases.
    194  1.1.1.1.6.2  wrstuden          */
    195  1.1.1.1.6.2  wrstuden         if (count > 0 && ((c1 ^ cp->uc) & (c1 ^ cp->lc) & (c1 ^ cp->tc)))
    196  1.1.1.1.6.2  wrstuden           return 0;
    197  1.1.1.1.6.2  wrstuden 
    198  1.1.1.1.6.2  wrstuden         count -= (c1 >= 0x10000) ? 2 : 1;
    199  1.1.1.1.6.2  wrstuden         if (count > 0) {
    200  1.1.1.1.6.2  wrstuden             cp--;
    201  1.1.1.1.6.2  wrstuden 
    202  1.1.1.1.6.2  wrstuden             /*
    203  1.1.1.1.6.2  wrstuden              * Get the next preceding character.
    204  1.1.1.1.6.2  wrstuden              */
    205  1.1.1.1.6.2  wrstuden             if (start > text) {
    206  1.1.1.1.6.2  wrstuden                 c2 = *--start;
    207  1.1.1.1.6.2  wrstuden                 c1 = (start - 1 > text) ? *(start - 1) : ~0;
    208  1.1.1.1.6.2  wrstuden                 if (0xdc00 <= c2 && c2 <= 0xdfff &&
    209  1.1.1.1.6.2  wrstuden                     0xd800 <= c1 && c1 <= 0xdbff) {
    210  1.1.1.1.6.2  wrstuden                     c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
    211  1.1.1.1.6.2  wrstuden                     start--;
    212  1.1.1.1.6.2  wrstuden                 } else
    213  1.1.1.1.6.2  wrstuden                   c1 = c2;
    214  1.1.1.1.6.2  wrstuden             }
    215  1.1.1.1.6.2  wrstuden         }
    216  1.1.1.1.6.2  wrstuden     }
    217  1.1.1.1.6.2  wrstuden 
    218  1.1.1.1.6.2  wrstuden     /*
    219  1.1.1.1.6.2  wrstuden      * Set the match start position.
    220  1.1.1.1.6.2  wrstuden      */
    221  1.1.1.1.6.2  wrstuden     *match_start = start - text;
    222  1.1.1.1.6.2  wrstuden     return 1;
    223  1.1.1.1.6.2  wrstuden }
    224  1.1.1.1.6.2  wrstuden 
    225  1.1.1.1.6.2  wrstuden /*************************************************************************
    226  1.1.1.1.6.2  wrstuden  *
    227  1.1.1.1.6.2  wrstuden  * API.
    228  1.1.1.1.6.2  wrstuden  *
    229  1.1.1.1.6.2  wrstuden  *************************************************************************/
    230  1.1.1.1.6.2  wrstuden 
    231  1.1.1.1.6.2  wrstuden utbm_pattern_t
    232  1.1.1.1.6.2  wrstuden utbm_create_pattern(void)
    233  1.1.1.1.6.2  wrstuden {
    234  1.1.1.1.6.2  wrstuden     utbm_pattern_t p;
    235  1.1.1.1.6.2  wrstuden 
    236  1.1.1.1.6.2  wrstuden     p = (utbm_pattern_t) malloc(sizeof(_utbm_pattern_t));
    237  1.1.1.1.6.2  wrstuden     (void) memset((char *) p, '\0', sizeof(_utbm_pattern_t));
    238  1.1.1.1.6.2  wrstuden     return p;
    239  1.1.1.1.6.2  wrstuden }
    240  1.1.1.1.6.2  wrstuden 
    241  1.1.1.1.6.2  wrstuden void
    242  1.1.1.1.6.2  wrstuden utbm_free_pattern(utbm_pattern_t pattern)
    243  1.1.1.1.6.2  wrstuden {
    244  1.1.1.1.6.2  wrstuden     if (pattern == 0)
    245  1.1.1.1.6.2  wrstuden       return;
    246  1.1.1.1.6.2  wrstuden 
    247  1.1.1.1.6.2  wrstuden     if (pattern->pat_size > 0)
    248  1.1.1.1.6.2  wrstuden       free((char *) pattern->pat);
    249  1.1.1.1.6.2  wrstuden 
    250  1.1.1.1.6.2  wrstuden     if (pattern->skip_size > 0)
    251  1.1.1.1.6.2  wrstuden       free((char *) pattern->skip);
    252  1.1.1.1.6.2  wrstuden 
    253  1.1.1.1.6.2  wrstuden     free((char *) pattern);
    254  1.1.1.1.6.2  wrstuden }
    255  1.1.1.1.6.2  wrstuden 
    256  1.1.1.1.6.2  wrstuden void
    257  1.1.1.1.6.2  wrstuden utbm_compile(ucs2_t *pat, unsigned long patlen, unsigned long flags,
    258  1.1.1.1.6.2  wrstuden              utbm_pattern_t p)
    259  1.1.1.1.6.2  wrstuden {
    260  1.1.1.1.6.2  wrstuden     int have_space;
    261  1.1.1.1.6.2  wrstuden     unsigned long i, j, k, slen;
    262  1.1.1.1.6.2  wrstuden     _utbm_char_t *cp;
    263  1.1.1.1.6.2  wrstuden     _utbm_skip_t *sp;
    264  1.1.1.1.6.2  wrstuden     ucs4_t c1, c2, sentinel;
    265  1.1.1.1.6.2  wrstuden 
    266  1.1.1.1.6.2  wrstuden     if (p == 0 || pat == 0 || *pat == 0 || patlen == 0)
    267  1.1.1.1.6.2  wrstuden       return;
    268  1.1.1.1.6.2  wrstuden 
    269  1.1.1.1.6.2  wrstuden     /*
    270  1.1.1.1.6.2  wrstuden      * Reset the pattern buffer.
    271  1.1.1.1.6.2  wrstuden      */
    272  1.1.1.1.6.2  wrstuden     p->patlen = p->pat_used = p->skip_used = 0;
    273  1.1.1.1.6.2  wrstuden 
    274  1.1.1.1.6.2  wrstuden     /*
    275  1.1.1.1.6.2  wrstuden      * Set the flags.
    276  1.1.1.1.6.2  wrstuden      */
    277  1.1.1.1.6.2  wrstuden     p->flags = flags;
    278  1.1.1.1.6.2  wrstuden 
    279  1.1.1.1.6.2  wrstuden     /*
    280  1.1.1.1.6.2  wrstuden      * Initialize the extra skip flag.
    281  1.1.1.1.6.2  wrstuden      */
    282  1.1.1.1.6.2  wrstuden     p->md4 = 1;
    283  1.1.1.1.6.2  wrstuden 
    284  1.1.1.1.6.2  wrstuden     /*
    285  1.1.1.1.6.2  wrstuden      * Allocate more storage if necessary.
    286  1.1.1.1.6.2  wrstuden      */
    287  1.1.1.1.6.2  wrstuden     if (patlen > p->pat_size) {
    288  1.1.1.1.6.2  wrstuden         if (p->pat_size == 0) {
    289  1.1.1.1.6.2  wrstuden             p->pat = (_utbm_char_t *) malloc(sizeof(_utbm_char_t) * patlen);
    290  1.1.1.1.6.2  wrstuden             p->skip = (_utbm_skip_t *) malloc(sizeof(_utbm_skip_t) * patlen);
    291  1.1.1.1.6.2  wrstuden         } else {
    292  1.1.1.1.6.2  wrstuden             p->pat = (_utbm_char_t *)
    293  1.1.1.1.6.2  wrstuden                 realloc((char *) p->pat, sizeof(_utbm_char_t) * patlen);
    294  1.1.1.1.6.2  wrstuden             p->skip = (_utbm_skip_t *)
    295  1.1.1.1.6.2  wrstuden                 realloc((char *) p->skip, sizeof(_utbm_skip_t) * patlen);
    296  1.1.1.1.6.2  wrstuden         }
    297  1.1.1.1.6.2  wrstuden         p->pat_size = p->skip_size = patlen;
    298  1.1.1.1.6.2  wrstuden     }
    299  1.1.1.1.6.2  wrstuden 
    300  1.1.1.1.6.2  wrstuden     /*
    301  1.1.1.1.6.2  wrstuden      * Preprocess the pattern to remove controls (if specified) and determine
    302  1.1.1.1.6.2  wrstuden      * case.
    303  1.1.1.1.6.2  wrstuden      */
    304  1.1.1.1.6.2  wrstuden     for (have_space = 0, cp = p->pat, i = 0; i < patlen; i++) {
    305  1.1.1.1.6.2  wrstuden         c1 = pat[i];
    306  1.1.1.1.6.2  wrstuden         c2 = (i + 1 < patlen) ? pat[i + 1] : ~0;
    307  1.1.1.1.6.2  wrstuden         if (0xd800 <= c1 && c1 <= 0xdbff && 0xdc00 <= c2 && c2 <= 0xdfff)
    308  1.1.1.1.6.2  wrstuden           c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
    309  1.1.1.1.6.2  wrstuden 
    310  1.1.1.1.6.2  wrstuden         /*
    311  1.1.1.1.6.2  wrstuden          * Make sure the `have_space' flag is turned off if the character
    312  1.1.1.1.6.2  wrstuden          * is not an appropriate one.
    313  1.1.1.1.6.2  wrstuden          */
    314  1.1.1.1.6.2  wrstuden         if (!_utbm_isspace(c1, flags & UTBM_SPACE_COMPRESS))
    315  1.1.1.1.6.2  wrstuden           have_space = 0;
    316  1.1.1.1.6.2  wrstuden 
    317  1.1.1.1.6.2  wrstuden         /*
    318  1.1.1.1.6.2  wrstuden          * If non-spacing characters should be ignored, do it here.
    319  1.1.1.1.6.2  wrstuden          */
    320  1.1.1.1.6.2  wrstuden         if ((flags & UTBM_IGNORE_NONSPACING) && _utbm_nonspacing(c1))
    321  1.1.1.1.6.2  wrstuden           continue;
    322  1.1.1.1.6.2  wrstuden 
    323  1.1.1.1.6.2  wrstuden         /*
    324  1.1.1.1.6.2  wrstuden          * Check if spaces and controls need to be compressed.
    325  1.1.1.1.6.2  wrstuden          */
    326  1.1.1.1.6.2  wrstuden         if (flags & UTBM_SPACE_COMPRESS) {
    327  1.1.1.1.6.2  wrstuden             if (_utbm_isspace(c1, 1)) {
    328  1.1.1.1.6.2  wrstuden                 if (!have_space) {
    329  1.1.1.1.6.2  wrstuden                     /*
    330  1.1.1.1.6.2  wrstuden                      * Add a space and set the flag.
    331  1.1.1.1.6.2  wrstuden                      */
    332  1.1.1.1.6.2  wrstuden                     cp->uc = cp->lc = cp->tc = ' ';
    333  1.1.1.1.6.2  wrstuden                     cp++;
    334  1.1.1.1.6.2  wrstuden 
    335  1.1.1.1.6.2  wrstuden                     /*
    336  1.1.1.1.6.2  wrstuden                      * Increase the real pattern length.
    337  1.1.1.1.6.2  wrstuden                      */
    338  1.1.1.1.6.2  wrstuden                     p->patlen++;
    339  1.1.1.1.6.2  wrstuden                     sentinel = ' ';
    340  1.1.1.1.6.2  wrstuden                     have_space = 1;
    341  1.1.1.1.6.2  wrstuden                 }
    342  1.1.1.1.6.2  wrstuden                 continue;
    343  1.1.1.1.6.2  wrstuden             }
    344  1.1.1.1.6.2  wrstuden 
    345  1.1.1.1.6.2  wrstuden             /*
    346  1.1.1.1.6.2  wrstuden              * Ignore all control characters.
    347  1.1.1.1.6.2  wrstuden              */
    348  1.1.1.1.6.2  wrstuden             if (_utbm_iscntrl(c1))
    349  1.1.1.1.6.2  wrstuden               continue;
    350  1.1.1.1.6.2  wrstuden         }
    351  1.1.1.1.6.2  wrstuden 
    352  1.1.1.1.6.2  wrstuden         /*
    353  1.1.1.1.6.2  wrstuden          * Add the character.
    354  1.1.1.1.6.2  wrstuden          */
    355  1.1.1.1.6.2  wrstuden         if (flags & UTBM_CASEFOLD) {
    356  1.1.1.1.6.2  wrstuden             cp->uc = _utbm_toupper(c1);
    357  1.1.1.1.6.2  wrstuden             cp->lc = _utbm_tolower(c1);
    358  1.1.1.1.6.2  wrstuden             cp->tc = _utbm_totitle(c1);
    359  1.1.1.1.6.2  wrstuden         } else
    360  1.1.1.1.6.2  wrstuden           cp->uc = cp->lc = cp->tc = c1;
    361  1.1.1.1.6.2  wrstuden 
    362  1.1.1.1.6.2  wrstuden         /*
    363  1.1.1.1.6.2  wrstuden          * Set the sentinel character.
    364  1.1.1.1.6.2  wrstuden          */
    365  1.1.1.1.6.2  wrstuden         sentinel = cp->uc;
    366  1.1.1.1.6.2  wrstuden 
    367  1.1.1.1.6.2  wrstuden         /*
    368  1.1.1.1.6.2  wrstuden          * Move to the next character.
    369  1.1.1.1.6.2  wrstuden          */
    370  1.1.1.1.6.2  wrstuden         cp++;
    371  1.1.1.1.6.2  wrstuden 
    372  1.1.1.1.6.2  wrstuden         /*
    373  1.1.1.1.6.2  wrstuden          * Increase the real pattern length appropriately.
    374  1.1.1.1.6.2  wrstuden          */
    375  1.1.1.1.6.2  wrstuden         p->patlen += (c1 >= 0x10000) ? 2 : 1;
    376  1.1.1.1.6.2  wrstuden 
    377  1.1.1.1.6.2  wrstuden         /*
    378  1.1.1.1.6.2  wrstuden          * Increment the loop index for UTF-16 characters.
    379  1.1.1.1.6.2  wrstuden          */
    380  1.1.1.1.6.2  wrstuden         i += (c1 >= 0x10000) ? 1 : 0;
    381  1.1.1.1.6.2  wrstuden 
    382  1.1.1.1.6.2  wrstuden     }
    383  1.1.1.1.6.2  wrstuden 
    384  1.1.1.1.6.2  wrstuden     /*
    385  1.1.1.1.6.2  wrstuden      * Set the number of characters actually used.
    386  1.1.1.1.6.2  wrstuden      */
    387  1.1.1.1.6.2  wrstuden     p->pat_used = cp - p->pat;
    388  1.1.1.1.6.2  wrstuden 
    389  1.1.1.1.6.2  wrstuden     /*
    390  1.1.1.1.6.2  wrstuden      * Go through and construct the skip array and determine the actual length
    391  1.1.1.1.6.2  wrstuden      * of the pattern in UCS2 terms.
    392  1.1.1.1.6.2  wrstuden      */
    393  1.1.1.1.6.2  wrstuden     slen = p->patlen - 1;
    394  1.1.1.1.6.2  wrstuden     cp = p->pat;
    395  1.1.1.1.6.2  wrstuden     for (i = k = 0; i < p->pat_used; i++, cp++) {
    396  1.1.1.1.6.2  wrstuden         /*
    397  1.1.1.1.6.2  wrstuden          * Locate the character in the skip array.
    398  1.1.1.1.6.2  wrstuden          */
    399  1.1.1.1.6.2  wrstuden         for (sp = p->skip, j = 0;
    400  1.1.1.1.6.2  wrstuden              j < p->skip_used && sp->ch->uc != cp->uc; j++, sp++) ;
    401  1.1.1.1.6.2  wrstuden 
    402  1.1.1.1.6.2  wrstuden         /*
    403  1.1.1.1.6.2  wrstuden          * If the character is not found, set the new skip element and
    404  1.1.1.1.6.2  wrstuden          * increase the number of skip elements.
    405  1.1.1.1.6.2  wrstuden          */
    406  1.1.1.1.6.2  wrstuden         if (j == p->skip_used) {
    407  1.1.1.1.6.2  wrstuden             sp->ch = cp;
    408  1.1.1.1.6.2  wrstuden             p->skip_used++;
    409  1.1.1.1.6.2  wrstuden         }
    410  1.1.1.1.6.2  wrstuden 
    411  1.1.1.1.6.2  wrstuden         /*
    412  1.1.1.1.6.2  wrstuden          * Set the updated skip value.  If the character is UTF-16 and is
    413  1.1.1.1.6.2  wrstuden          * not the last one in the pattern, add one to its skip value.
    414  1.1.1.1.6.2  wrstuden          */
    415  1.1.1.1.6.2  wrstuden         sp->skip = slen - k;
    416  1.1.1.1.6.2  wrstuden         if (cp->uc >= 0x10000 && k + 2 < slen)
    417  1.1.1.1.6.2  wrstuden           sp->skip++;
    418  1.1.1.1.6.2  wrstuden 
    419  1.1.1.1.6.2  wrstuden         /*
    420  1.1.1.1.6.2  wrstuden          * Set the new extra skip for the sentinel character.
    421  1.1.1.1.6.2  wrstuden          */
    422  1.1.1.1.6.2  wrstuden         if (((cp->uc >= 0x10000 && k + 2 <= slen) || k + 1 <= slen) &&
    423  1.1.1.1.6.2  wrstuden             cp->uc == sentinel)
    424  1.1.1.1.6.2  wrstuden           p->md4 = slen - k;
    425  1.1.1.1.6.2  wrstuden 
    426  1.1.1.1.6.2  wrstuden         /*
    427  1.1.1.1.6.2  wrstuden          * Increase the actual index.
    428  1.1.1.1.6.2  wrstuden          */
    429  1.1.1.1.6.2  wrstuden         k += (cp->uc >= 0x10000) ? 2 : 1;
    430  1.1.1.1.6.2  wrstuden     }
    431  1.1.1.1.6.2  wrstuden }
    432  1.1.1.1.6.2  wrstuden 
    433  1.1.1.1.6.2  wrstuden int
    434  1.1.1.1.6.2  wrstuden utbm_exec(utbm_pattern_t pat, ucs2_t *text, unsigned long textlen,
    435  1.1.1.1.6.2  wrstuden           unsigned long *match_start, unsigned long *match_end)
    436  1.1.1.1.6.2  wrstuden {
    437  1.1.1.1.6.2  wrstuden     unsigned long k;
    438  1.1.1.1.6.2  wrstuden     ucs2_t *start, *end;
    439  1.1.1.1.6.2  wrstuden 
    440  1.1.1.1.6.2  wrstuden     if (pat == 0 || pat->pat_used == 0 || text == 0 || textlen == 0 ||
    441  1.1.1.1.6.2  wrstuden         textlen < pat->patlen)
    442  1.1.1.1.6.2  wrstuden       return 0;
    443  1.1.1.1.6.2  wrstuden 
    444  1.1.1.1.6.2  wrstuden     start = text + pat->patlen;
    445  1.1.1.1.6.2  wrstuden     end = text + textlen;
    446  1.1.1.1.6.2  wrstuden 
    447  1.1.1.1.6.2  wrstuden     /*
    448  1.1.1.1.6.2  wrstuden      * Adjust the start point if it points to a low surrogate.
    449  1.1.1.1.6.2  wrstuden      */
    450  1.1.1.1.6.2  wrstuden     if (0xdc00 <= *start && *start <= 0xdfff &&
    451  1.1.1.1.6.2  wrstuden         0xd800 <= *(start - 1) && *(start - 1) <= 0xdbff)
    452  1.1.1.1.6.2  wrstuden       start--;
    453  1.1.1.1.6.2  wrstuden 
    454  1.1.1.1.6.2  wrstuden     while (start < end) {
    455  1.1.1.1.6.2  wrstuden         while ((k = _utbm_skip(pat, start, end))) {
    456  1.1.1.1.6.2  wrstuden             start += k;
    457  1.1.1.1.6.2  wrstuden             if (start < end && 0xdc00 <= *start && *start <= 0xdfff &&
    458  1.1.1.1.6.2  wrstuden                 0xd800 <= *(start - 1) && *(start - 1) <= 0xdbff)
    459  1.1.1.1.6.2  wrstuden               start--;
    460  1.1.1.1.6.2  wrstuden         }
    461  1.1.1.1.6.2  wrstuden 
    462  1.1.1.1.6.2  wrstuden         if (start < end &&
    463  1.1.1.1.6.2  wrstuden             _utbm_match(pat, text, start, end, match_start, match_end))
    464  1.1.1.1.6.2  wrstuden           return 1;
    465  1.1.1.1.6.2  wrstuden 
    466  1.1.1.1.6.2  wrstuden         start += pat->md4;
    467  1.1.1.1.6.2  wrstuden         if (start < end && 0xdc00 <= *start && *start <= 0xdfff &&
    468  1.1.1.1.6.2  wrstuden             0xd800 <= *(start - 1) && *(start - 1) <= 0xdbff)
    469  1.1.1.1.6.2  wrstuden           start--;
    470  1.1.1.1.6.2  wrstuden     }
    471  1.1.1.1.6.2  wrstuden     return 0;
    472  1.1.1.1.6.2  wrstuden }
    473