Home | History | Annotate | Line # | Download | only in utbm
utbm.c revision 1.1.1.4.6.1
      1  1.1.1.4.6.1  pgoyette /*	$NetBSD: utbm.c,v 1.1.1.4.6.1 2017/03/20 06:56:15 pgoyette Exp $	*/
      2      1.1.1.2     lukem 
      3      1.1.1.4      tron /* $OpenLDAP$ */
      4          1.1     lukem /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
      5          1.1     lukem  *
      6  1.1.1.4.6.1  pgoyette  * Copyright 1998-2016 The OpenLDAP Foundation.
      7          1.1     lukem  * All rights reserved.
      8          1.1     lukem  *
      9          1.1     lukem  * Redistribution and use in source and binary forms, with or without
     10          1.1     lukem  * modification, are permitted only as authorized by the OpenLDAP
     11          1.1     lukem  * Public License.
     12          1.1     lukem  *
     13          1.1     lukem  * A copy of this license is available in file LICENSE in the
     14          1.1     lukem  * top-level directory of the distribution or, alternatively, at
     15          1.1     lukem  * <http://www.OpenLDAP.org/license.html>.
     16          1.1     lukem  */
     17          1.1     lukem /* Copyright 1997, 1998, 1999 Computing Research Labs,
     18          1.1     lukem  * New Mexico State University
     19          1.1     lukem  *
     20          1.1     lukem  * Permission is hereby granted, free of charge, to any person obtaining a
     21          1.1     lukem  * copy of this software and associated documentation files (the "Software"),
     22          1.1     lukem  * to deal in the Software without restriction, including without limitation
     23          1.1     lukem  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
     24          1.1     lukem  * and/or sell copies of the Software, and to permit persons to whom the
     25          1.1     lukem  * Software is furnished to do so, subject to the following conditions:
     26          1.1     lukem  *
     27          1.1     lukem  * The above copyright notice and this permission notice shall be included in
     28          1.1     lukem  * all copies or substantial portions of the Software.
     29          1.1     lukem  *
     30          1.1     lukem  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     31          1.1     lukem  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     32          1.1     lukem  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     33          1.1     lukem  * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
     34          1.1     lukem  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
     35          1.1     lukem  * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
     36          1.1     lukem  * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
     37          1.1     lukem  */
     38      1.1.1.4      tron /* Id: utbm.c,v 1.1 1999/09/21 15:45:17 mleisher Exp  */
     39          1.1     lukem 
     40          1.1     lukem /*
     41          1.1     lukem  * Assumptions:
     42          1.1     lukem  * 1. Case conversions of UTF-16 characters must also be UTF-16 characters.
     43          1.1     lukem  * 2. Case conversions are all one-to-one.
     44          1.1     lukem  * 3. Text and pattern have already been normalized in some fashion.
     45          1.1     lukem  */
     46          1.1     lukem 
     47          1.1     lukem #include <stdlib.h>
     48          1.1     lukem #include <unistd.h>
     49          1.1     lukem #include <string.h>
     50          1.1     lukem #include "utbm.h"
     51          1.1     lukem 
     52          1.1     lukem /*
     53          1.1     lukem  * Single pattern character.
     54          1.1     lukem  */
     55          1.1     lukem typedef struct {
     56          1.1     lukem     ucs4_t lc;
     57          1.1     lukem     ucs4_t uc;
     58          1.1     lukem     ucs4_t tc;
     59          1.1     lukem } _utbm_char_t;
     60          1.1     lukem 
     61          1.1     lukem typedef struct {
     62          1.1     lukem     _utbm_char_t *ch;
     63          1.1     lukem     unsigned long skip;
     64          1.1     lukem } _utbm_skip_t;
     65          1.1     lukem 
     66          1.1     lukem typedef struct _utbm_pattern_t {
     67          1.1     lukem     unsigned long flags;
     68          1.1     lukem 
     69          1.1     lukem     _utbm_char_t *pat;
     70          1.1     lukem     unsigned long pat_used;
     71          1.1     lukem     unsigned long pat_size;
     72          1.1     lukem     unsigned long patlen;
     73          1.1     lukem 
     74          1.1     lukem     _utbm_skip_t *skip;
     75          1.1     lukem     unsigned long skip_used;
     76          1.1     lukem     unsigned long skip_size;
     77          1.1     lukem 
     78          1.1     lukem     unsigned long md4;
     79          1.1     lukem } _utbm_pattern_t;
     80          1.1     lukem 
     81          1.1     lukem /*************************************************************************
     82          1.1     lukem  *
     83          1.1     lukem  * Support functions.
     84          1.1     lukem  *
     85          1.1     lukem  *************************************************************************/
     86          1.1     lukem 
     87          1.1     lukem /*
     88          1.1     lukem  * Routine to look up the skip value for a character.
     89          1.1     lukem  */
     90          1.1     lukem static unsigned long
     91          1.1     lukem _utbm_skip(utbm_pattern_t p, ucs2_t *start, ucs2_t *end)
     92          1.1     lukem {
     93          1.1     lukem     unsigned long i;
     94          1.1     lukem     ucs4_t c1, c2;
     95          1.1     lukem     _utbm_skip_t *sp;
     96          1.1     lukem 
     97          1.1     lukem     if (start >= end)
     98          1.1     lukem       return 0;
     99          1.1     lukem 
    100          1.1     lukem     c1 = *start;
    101          1.1     lukem     c2 = (start + 1 < end) ? *(start + 1) : ~0;
    102          1.1     lukem     if (0xd800 <= c1 && c1 <= 0xdbff && 0xdc00 <= c2 && c2 <= 0xdfff)
    103          1.1     lukem       c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
    104          1.1     lukem 
    105          1.1     lukem     for (i = 0, sp = p->skip; i < p->skip_used; i++, sp++) {
    106          1.1     lukem         if (!((c1 ^ sp->ch->uc) & (c1 ^ sp->ch->lc) & (c1 ^ sp->ch->tc))) {
    107          1.1     lukem             return ((unsigned long) (end - start) < sp->skip) ?
    108          1.1     lukem                 end - start : sp->skip;
    109          1.1     lukem         }
    110          1.1     lukem     }
    111          1.1     lukem     return p->patlen;
    112          1.1     lukem }
    113          1.1     lukem 
    114          1.1     lukem static int
    115          1.1     lukem _utbm_match(utbm_pattern_t pat, ucs2_t *text, ucs2_t *start, ucs2_t *end,
    116          1.1     lukem             unsigned long *match_start, unsigned long *match_end)
    117          1.1     lukem {
    118          1.1     lukem     int check_space;
    119          1.1     lukem     ucs4_t c1, c2;
    120          1.1     lukem     unsigned long count;
    121          1.1     lukem     _utbm_char_t *cp;
    122          1.1     lukem 
    123          1.1     lukem     /*
    124          1.1     lukem      * Set the potential match endpoint first.
    125          1.1     lukem      */
    126          1.1     lukem     *match_end = (start - text) + 1;
    127          1.1     lukem 
    128          1.1     lukem     c1 = *start;
    129          1.1     lukem     c2 = (start + 1 < end) ? *(start + 1) : ~0;
    130          1.1     lukem     if (0xd800 <= c1 && c1 <= 0xdbff && 0xdc00 <= c2 && c2 <= 0xdfff) {
    131          1.1     lukem         c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
    132          1.1     lukem         /*
    133          1.1     lukem          * Adjust the match end point to occur after the UTF-16 character.
    134          1.1     lukem          */
    135          1.1     lukem         *match_end = *match_end + 1;
    136          1.1     lukem     }
    137          1.1     lukem 
    138          1.1     lukem     if (pat->pat_used == 1) {
    139          1.1     lukem         *match_start = start - text;
    140          1.1     lukem         return 1;
    141          1.1     lukem     }
    142          1.1     lukem 
    143          1.1     lukem     /*
    144          1.1     lukem      * Compare backward.
    145          1.1     lukem      */
    146          1.1     lukem     cp = pat->pat + (pat->pat_used - 1);
    147          1.1     lukem 
    148          1.1     lukem     for (count = pat->patlen; start > text && count > 0;) {
    149          1.1     lukem         /*
    150          1.1     lukem          * Ignore non-spacing characters if indicated.
    151          1.1     lukem          */
    152          1.1     lukem         if (pat->flags & UTBM_IGNORE_NONSPACING) {
    153          1.1     lukem             while (start > text && _utbm_nonspacing(c1)) {
    154          1.1     lukem                 c2 = *--start;
    155          1.1     lukem                 c1 = (start - 1 > text) ? *(start - 1) : ~0;
    156          1.1     lukem                 if (0xdc00 <= c2 && c2 <= 0xdfff &&
    157          1.1     lukem                     0xd800 <= c1 && c1 <= 0xdbff) {
    158          1.1     lukem                     c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
    159          1.1     lukem                     start--;
    160          1.1     lukem                 } else
    161          1.1     lukem                   c1 = c2;
    162          1.1     lukem             }
    163          1.1     lukem         }
    164          1.1     lukem 
    165          1.1     lukem         /*
    166          1.1     lukem          * Handle space compression if indicated.
    167          1.1     lukem          */
    168          1.1     lukem         if (pat->flags & UTBM_SPACE_COMPRESS) {
    169          1.1     lukem             check_space = 0;
    170          1.1     lukem             while (start > text &&
    171          1.1     lukem                    (_utbm_isspace(c1, 1) || _utbm_iscntrl(c1))) {
    172          1.1     lukem                 check_space = _utbm_isspace(c1, 1);
    173          1.1     lukem                 c2 = *--start;
    174          1.1     lukem                 c1 = (start - 1 > text) ? *(start - 1) : ~0;
    175          1.1     lukem                 if (0xdc00 <= c2 && c2 <= 0xdfff &&
    176          1.1     lukem                     0xd800 <= c1 && c1 <= 0xdbff) {
    177          1.1     lukem                     c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
    178          1.1     lukem                     start--;
    179          1.1     lukem                 } else
    180          1.1     lukem                   c1 = c2;
    181          1.1     lukem             }
    182          1.1     lukem             /*
    183          1.1     lukem              * Handle things if space compression was indicated and one or
    184          1.1     lukem              * more member characters were found.
    185          1.1     lukem              */
    186          1.1     lukem             if (check_space) {
    187          1.1     lukem                 if (cp->uc != ' ')
    188          1.1     lukem                   return 0;
    189          1.1     lukem                 cp--;
    190          1.1     lukem                 count--;
    191          1.1     lukem             }
    192          1.1     lukem         }
    193          1.1     lukem 
    194          1.1     lukem         /*
    195          1.1     lukem          * Handle the normal comparison cases.
    196          1.1     lukem          */
    197          1.1     lukem         if (count > 0 && ((c1 ^ cp->uc) & (c1 ^ cp->lc) & (c1 ^ cp->tc)))
    198          1.1     lukem           return 0;
    199          1.1     lukem 
    200          1.1     lukem         count -= (c1 >= 0x10000) ? 2 : 1;
    201          1.1     lukem         if (count > 0) {
    202          1.1     lukem             cp--;
    203          1.1     lukem 
    204          1.1     lukem             /*
    205          1.1     lukem              * Get the next preceding character.
    206          1.1     lukem              */
    207          1.1     lukem             if (start > text) {
    208          1.1     lukem                 c2 = *--start;
    209          1.1     lukem                 c1 = (start - 1 > text) ? *(start - 1) : ~0;
    210          1.1     lukem                 if (0xdc00 <= c2 && c2 <= 0xdfff &&
    211          1.1     lukem                     0xd800 <= c1 && c1 <= 0xdbff) {
    212          1.1     lukem                     c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
    213          1.1     lukem                     start--;
    214          1.1     lukem                 } else
    215          1.1     lukem                   c1 = c2;
    216          1.1     lukem             }
    217          1.1     lukem         }
    218          1.1     lukem     }
    219          1.1     lukem 
    220          1.1     lukem     /*
    221          1.1     lukem      * Set the match start position.
    222          1.1     lukem      */
    223          1.1     lukem     *match_start = start - text;
    224          1.1     lukem     return 1;
    225          1.1     lukem }
    226          1.1     lukem 
    227          1.1     lukem /*************************************************************************
    228          1.1     lukem  *
    229          1.1     lukem  * API.
    230          1.1     lukem  *
    231          1.1     lukem  *************************************************************************/
    232          1.1     lukem 
    233          1.1     lukem utbm_pattern_t
    234          1.1     lukem utbm_create_pattern(void)
    235          1.1     lukem {
    236          1.1     lukem     utbm_pattern_t p;
    237          1.1     lukem 
    238          1.1     lukem     p = (utbm_pattern_t) malloc(sizeof(_utbm_pattern_t));
    239          1.1     lukem     (void) memset((char *) p, '\0', sizeof(_utbm_pattern_t));
    240          1.1     lukem     return p;
    241          1.1     lukem }
    242          1.1     lukem 
    243          1.1     lukem void
    244          1.1     lukem utbm_free_pattern(utbm_pattern_t pattern)
    245          1.1     lukem {
    246          1.1     lukem     if (pattern == 0)
    247          1.1     lukem       return;
    248          1.1     lukem 
    249          1.1     lukem     if (pattern->pat_size > 0)
    250          1.1     lukem       free((char *) pattern->pat);
    251          1.1     lukem 
    252          1.1     lukem     if (pattern->skip_size > 0)
    253          1.1     lukem       free((char *) pattern->skip);
    254          1.1     lukem 
    255          1.1     lukem     free((char *) pattern);
    256          1.1     lukem }
    257          1.1     lukem 
    258          1.1     lukem void
    259          1.1     lukem utbm_compile(ucs2_t *pat, unsigned long patlen, unsigned long flags,
    260          1.1     lukem              utbm_pattern_t p)
    261          1.1     lukem {
    262          1.1     lukem     int have_space;
    263          1.1     lukem     unsigned long i, j, k, slen;
    264          1.1     lukem     _utbm_char_t *cp;
    265          1.1     lukem     _utbm_skip_t *sp;
    266          1.1     lukem     ucs4_t c1, c2, sentinel;
    267          1.1     lukem 
    268          1.1     lukem     if (p == 0 || pat == 0 || *pat == 0 || patlen == 0)
    269          1.1     lukem       return;
    270          1.1     lukem 
    271          1.1     lukem     /*
    272          1.1     lukem      * Reset the pattern buffer.
    273          1.1     lukem      */
    274          1.1     lukem     p->patlen = p->pat_used = p->skip_used = 0;
    275          1.1     lukem 
    276          1.1     lukem     /*
    277          1.1     lukem      * Set the flags.
    278          1.1     lukem      */
    279          1.1     lukem     p->flags = flags;
    280          1.1     lukem 
    281          1.1     lukem     /*
    282          1.1     lukem      * Initialize the extra skip flag.
    283          1.1     lukem      */
    284          1.1     lukem     p->md4 = 1;
    285          1.1     lukem 
    286          1.1     lukem     /*
    287          1.1     lukem      * Allocate more storage if necessary.
    288          1.1     lukem      */
    289          1.1     lukem     if (patlen > p->pat_size) {
    290          1.1     lukem         if (p->pat_size == 0) {
    291          1.1     lukem             p->pat = (_utbm_char_t *) malloc(sizeof(_utbm_char_t) * patlen);
    292          1.1     lukem             p->skip = (_utbm_skip_t *) malloc(sizeof(_utbm_skip_t) * patlen);
    293          1.1     lukem         } else {
    294          1.1     lukem             p->pat = (_utbm_char_t *)
    295          1.1     lukem                 realloc((char *) p->pat, sizeof(_utbm_char_t) * patlen);
    296          1.1     lukem             p->skip = (_utbm_skip_t *)
    297          1.1     lukem                 realloc((char *) p->skip, sizeof(_utbm_skip_t) * patlen);
    298          1.1     lukem         }
    299          1.1     lukem         p->pat_size = p->skip_size = patlen;
    300          1.1     lukem     }
    301          1.1     lukem 
    302          1.1     lukem     /*
    303          1.1     lukem      * Preprocess the pattern to remove controls (if specified) and determine
    304          1.1     lukem      * case.
    305          1.1     lukem      */
    306          1.1     lukem     for (have_space = 0, cp = p->pat, i = 0; i < patlen; i++) {
    307          1.1     lukem         c1 = pat[i];
    308          1.1     lukem         c2 = (i + 1 < patlen) ? pat[i + 1] : ~0;
    309          1.1     lukem         if (0xd800 <= c1 && c1 <= 0xdbff && 0xdc00 <= c2 && c2 <= 0xdfff)
    310          1.1     lukem           c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
    311          1.1     lukem 
    312          1.1     lukem         /*
    313          1.1     lukem          * Make sure the `have_space' flag is turned off if the character
    314          1.1     lukem          * is not an appropriate one.
    315          1.1     lukem          */
    316          1.1     lukem         if (!_utbm_isspace(c1, flags & UTBM_SPACE_COMPRESS))
    317          1.1     lukem           have_space = 0;
    318          1.1     lukem 
    319          1.1     lukem         /*
    320          1.1     lukem          * If non-spacing characters should be ignored, do it here.
    321          1.1     lukem          */
    322          1.1     lukem         if ((flags & UTBM_IGNORE_NONSPACING) && _utbm_nonspacing(c1))
    323          1.1     lukem           continue;
    324          1.1     lukem 
    325          1.1     lukem         /*
    326          1.1     lukem          * Check if spaces and controls need to be compressed.
    327          1.1     lukem          */
    328          1.1     lukem         if (flags & UTBM_SPACE_COMPRESS) {
    329          1.1     lukem             if (_utbm_isspace(c1, 1)) {
    330          1.1     lukem                 if (!have_space) {
    331          1.1     lukem                     /*
    332          1.1     lukem                      * Add a space and set the flag.
    333          1.1     lukem                      */
    334          1.1     lukem                     cp->uc = cp->lc = cp->tc = ' ';
    335          1.1     lukem                     cp++;
    336          1.1     lukem 
    337          1.1     lukem                     /*
    338          1.1     lukem                      * Increase the real pattern length.
    339          1.1     lukem                      */
    340          1.1     lukem                     p->patlen++;
    341          1.1     lukem                     sentinel = ' ';
    342          1.1     lukem                     have_space = 1;
    343          1.1     lukem                 }
    344          1.1     lukem                 continue;
    345          1.1     lukem             }
    346          1.1     lukem 
    347          1.1     lukem             /*
    348          1.1     lukem              * Ignore all control characters.
    349          1.1     lukem              */
    350          1.1     lukem             if (_utbm_iscntrl(c1))
    351          1.1     lukem               continue;
    352          1.1     lukem         }
    353          1.1     lukem 
    354          1.1     lukem         /*
    355          1.1     lukem          * Add the character.
    356          1.1     lukem          */
    357          1.1     lukem         if (flags & UTBM_CASEFOLD) {
    358          1.1     lukem             cp->uc = _utbm_toupper(c1);
    359          1.1     lukem             cp->lc = _utbm_tolower(c1);
    360          1.1     lukem             cp->tc = _utbm_totitle(c1);
    361          1.1     lukem         } else
    362          1.1     lukem           cp->uc = cp->lc = cp->tc = c1;
    363          1.1     lukem 
    364          1.1     lukem         /*
    365          1.1     lukem          * Set the sentinel character.
    366          1.1     lukem          */
    367          1.1     lukem         sentinel = cp->uc;
    368          1.1     lukem 
    369          1.1     lukem         /*
    370          1.1     lukem          * Move to the next character.
    371          1.1     lukem          */
    372          1.1     lukem         cp++;
    373          1.1     lukem 
    374          1.1     lukem         /*
    375          1.1     lukem          * Increase the real pattern length appropriately.
    376          1.1     lukem          */
    377          1.1     lukem         p->patlen += (c1 >= 0x10000) ? 2 : 1;
    378          1.1     lukem 
    379          1.1     lukem         /*
    380          1.1     lukem          * Increment the loop index for UTF-16 characters.
    381          1.1     lukem          */
    382          1.1     lukem         i += (c1 >= 0x10000) ? 1 : 0;
    383          1.1     lukem 
    384          1.1     lukem     }
    385          1.1     lukem 
    386          1.1     lukem     /*
    387          1.1     lukem      * Set the number of characters actually used.
    388          1.1     lukem      */
    389          1.1     lukem     p->pat_used = cp - p->pat;
    390          1.1     lukem 
    391          1.1     lukem     /*
    392          1.1     lukem      * Go through and construct the skip array and determine the actual length
    393          1.1     lukem      * of the pattern in UCS2 terms.
    394          1.1     lukem      */
    395          1.1     lukem     slen = p->patlen - 1;
    396          1.1     lukem     cp = p->pat;
    397          1.1     lukem     for (i = k = 0; i < p->pat_used; i++, cp++) {
    398          1.1     lukem         /*
    399          1.1     lukem          * Locate the character in the skip array.
    400          1.1     lukem          */
    401          1.1     lukem         for (sp = p->skip, j = 0;
    402          1.1     lukem              j < p->skip_used && sp->ch->uc != cp->uc; j++, sp++) ;
    403          1.1     lukem 
    404          1.1     lukem         /*
    405          1.1     lukem          * If the character is not found, set the new skip element and
    406          1.1     lukem          * increase the number of skip elements.
    407          1.1     lukem          */
    408          1.1     lukem         if (j == p->skip_used) {
    409          1.1     lukem             sp->ch = cp;
    410          1.1     lukem             p->skip_used++;
    411          1.1     lukem         }
    412          1.1     lukem 
    413          1.1     lukem         /*
    414          1.1     lukem          * Set the updated skip value.  If the character is UTF-16 and is
    415          1.1     lukem          * not the last one in the pattern, add one to its skip value.
    416          1.1     lukem          */
    417          1.1     lukem         sp->skip = slen - k;
    418          1.1     lukem         if (cp->uc >= 0x10000 && k + 2 < slen)
    419          1.1     lukem           sp->skip++;
    420          1.1     lukem 
    421          1.1     lukem         /*
    422          1.1     lukem          * Set the new extra skip for the sentinel character.
    423          1.1     lukem          */
    424          1.1     lukem         if (((cp->uc >= 0x10000 && k + 2 <= slen) || k + 1 <= slen) &&
    425          1.1     lukem             cp->uc == sentinel)
    426          1.1     lukem           p->md4 = slen - k;
    427          1.1     lukem 
    428          1.1     lukem         /*
    429          1.1     lukem          * Increase the actual index.
    430          1.1     lukem          */
    431          1.1     lukem         k += (cp->uc >= 0x10000) ? 2 : 1;
    432          1.1     lukem     }
    433          1.1     lukem }
    434          1.1     lukem 
    435          1.1     lukem int
    436          1.1     lukem utbm_exec(utbm_pattern_t pat, ucs2_t *text, unsigned long textlen,
    437          1.1     lukem           unsigned long *match_start, unsigned long *match_end)
    438          1.1     lukem {
    439          1.1     lukem     unsigned long k;
    440          1.1     lukem     ucs2_t *start, *end;
    441          1.1     lukem 
    442          1.1     lukem     if (pat == 0 || pat->pat_used == 0 || text == 0 || textlen == 0 ||
    443          1.1     lukem         textlen < pat->patlen)
    444          1.1     lukem       return 0;
    445          1.1     lukem 
    446          1.1     lukem     start = text + pat->patlen;
    447          1.1     lukem     end = text + textlen;
    448          1.1     lukem 
    449          1.1     lukem     /*
    450          1.1     lukem      * Adjust the start point if it points to a low surrogate.
    451          1.1     lukem      */
    452          1.1     lukem     if (0xdc00 <= *start && *start <= 0xdfff &&
    453          1.1     lukem         0xd800 <= *(start - 1) && *(start - 1) <= 0xdbff)
    454          1.1     lukem       start--;
    455          1.1     lukem 
    456          1.1     lukem     while (start < end) {
    457          1.1     lukem         while ((k = _utbm_skip(pat, start, end))) {
    458          1.1     lukem             start += k;
    459          1.1     lukem             if (start < end && 0xdc00 <= *start && *start <= 0xdfff &&
    460          1.1     lukem                 0xd800 <= *(start - 1) && *(start - 1) <= 0xdbff)
    461          1.1     lukem               start--;
    462          1.1     lukem         }
    463          1.1     lukem 
    464          1.1     lukem         if (start < end &&
    465          1.1     lukem             _utbm_match(pat, text, start, end, match_start, match_end))
    466          1.1     lukem           return 1;
    467          1.1     lukem 
    468          1.1     lukem         start += pat->md4;
    469          1.1     lukem         if (start < end && 0xdc00 <= *start && *start <= 0xdfff &&
    470          1.1     lukem             0xd800 <= *(start - 1) && *(start - 1) <= 0xdbff)
    471          1.1     lukem           start--;
    472          1.1     lukem     }
    473          1.1     lukem     return 0;
    474          1.1     lukem }
    475