Home | History | Annotate | Line # | Download | only in utbm
utbm.c revision 1.1.1.4
      1  1.1.1.2  lukem /*	$NetBSD: utbm.c,v 1.1.1.4 2014/05/28 09:58:45 tron Exp $	*/
      2  1.1.1.2  lukem 
      3  1.1.1.4   tron /* $OpenLDAP$ */
      4      1.1  lukem /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
      5      1.1  lukem  *
      6  1.1.1.4   tron  * Copyright 1998-2014 The OpenLDAP Foundation.
      7      1.1  lukem  * All rights reserved.
      8      1.1  lukem  *
      9      1.1  lukem  * Redistribution and use in source and binary forms, with or without
     10      1.1  lukem  * modification, are permitted only as authorized by the OpenLDAP
     11      1.1  lukem  * Public License.
     12      1.1  lukem  *
     13      1.1  lukem  * A copy of this license is available in file LICENSE in the
     14      1.1  lukem  * top-level directory of the distribution or, alternatively, at
     15      1.1  lukem  * <http://www.OpenLDAP.org/license.html>.
     16      1.1  lukem  */
     17      1.1  lukem /* Copyright 1997, 1998, 1999 Computing Research Labs,
     18      1.1  lukem  * New Mexico State University
     19      1.1  lukem  *
     20      1.1  lukem  * Permission is hereby granted, free of charge, to any person obtaining a
     21      1.1  lukem  * copy of this software and associated documentation files (the "Software"),
     22      1.1  lukem  * to deal in the Software without restriction, including without limitation
     23      1.1  lukem  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
     24      1.1  lukem  * and/or sell copies of the Software, and to permit persons to whom the
     25      1.1  lukem  * Software is furnished to do so, subject to the following conditions:
     26      1.1  lukem  *
     27      1.1  lukem  * The above copyright notice and this permission notice shall be included in
     28      1.1  lukem  * all copies or substantial portions of the Software.
     29      1.1  lukem  *
     30      1.1  lukem  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     31      1.1  lukem  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     32      1.1  lukem  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     33      1.1  lukem  * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
     34      1.1  lukem  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
     35      1.1  lukem  * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
     36      1.1  lukem  * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
     37      1.1  lukem  */
     38  1.1.1.4   tron /* Id: utbm.c,v 1.1 1999/09/21 15:45:17 mleisher Exp  */
     39      1.1  lukem 
     40      1.1  lukem /*
     41      1.1  lukem  * Assumptions:
     42      1.1  lukem  * 1. Case conversions of UTF-16 characters must also be UTF-16 characters.
     43      1.1  lukem  * 2. Case conversions are all one-to-one.
     44      1.1  lukem  * 3. Text and pattern have already been normalized in some fashion.
     45      1.1  lukem  */
     46      1.1  lukem 
     47      1.1  lukem #include <stdlib.h>
     48      1.1  lukem #include <unistd.h>
     49      1.1  lukem #include <string.h>
     50      1.1  lukem #include "utbm.h"
     51      1.1  lukem 
     52      1.1  lukem /*
     53      1.1  lukem  * Single pattern character.
     54      1.1  lukem  */
     55      1.1  lukem typedef struct {
     56      1.1  lukem     ucs4_t lc;
     57      1.1  lukem     ucs4_t uc;
     58      1.1  lukem     ucs4_t tc;
     59      1.1  lukem } _utbm_char_t;
     60      1.1  lukem 
     61      1.1  lukem typedef struct {
     62      1.1  lukem     _utbm_char_t *ch;
     63      1.1  lukem     unsigned long skip;
     64      1.1  lukem } _utbm_skip_t;
     65      1.1  lukem 
     66      1.1  lukem typedef struct _utbm_pattern_t {
     67      1.1  lukem     unsigned long flags;
     68      1.1  lukem 
     69      1.1  lukem     _utbm_char_t *pat;
     70      1.1  lukem     unsigned long pat_used;
     71      1.1  lukem     unsigned long pat_size;
     72      1.1  lukem     unsigned long patlen;
     73      1.1  lukem 
     74      1.1  lukem     _utbm_skip_t *skip;
     75      1.1  lukem     unsigned long skip_used;
     76      1.1  lukem     unsigned long skip_size;
     77      1.1  lukem 
     78      1.1  lukem     unsigned long md4;
     79      1.1  lukem } _utbm_pattern_t;
     80      1.1  lukem 
     81      1.1  lukem /*************************************************************************
     82      1.1  lukem  *
     83      1.1  lukem  * Support functions.
     84      1.1  lukem  *
     85      1.1  lukem  *************************************************************************/
     86      1.1  lukem 
     87      1.1  lukem /*
     88      1.1  lukem  * Routine to look up the skip value for a character.
     89      1.1  lukem  */
     90      1.1  lukem static unsigned long
     91      1.1  lukem _utbm_skip(utbm_pattern_t p, ucs2_t *start, ucs2_t *end)
     92      1.1  lukem {
     93      1.1  lukem     unsigned long i;
     94      1.1  lukem     ucs4_t c1, c2;
     95      1.1  lukem     _utbm_skip_t *sp;
     96      1.1  lukem 
     97      1.1  lukem     if (start >= end)
     98      1.1  lukem       return 0;
     99      1.1  lukem 
    100      1.1  lukem     c1 = *start;
    101      1.1  lukem     c2 = (start + 1 < end) ? *(start + 1) : ~0;
    102      1.1  lukem     if (0xd800 <= c1 && c1 <= 0xdbff && 0xdc00 <= c2 && c2 <= 0xdfff)
    103      1.1  lukem       c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
    104      1.1  lukem 
    105      1.1  lukem     for (i = 0, sp = p->skip; i < p->skip_used; i++, sp++) {
    106      1.1  lukem         if (!((c1 ^ sp->ch->uc) & (c1 ^ sp->ch->lc) & (c1 ^ sp->ch->tc))) {
    107      1.1  lukem             return ((unsigned long) (end - start) < sp->skip) ?
    108      1.1  lukem                 end - start : sp->skip;
    109      1.1  lukem         }
    110      1.1  lukem     }
    111      1.1  lukem     return p->patlen;
    112      1.1  lukem }
    113      1.1  lukem 
    114      1.1  lukem static int
    115      1.1  lukem _utbm_match(utbm_pattern_t pat, ucs2_t *text, ucs2_t *start, ucs2_t *end,
    116      1.1  lukem             unsigned long *match_start, unsigned long *match_end)
    117      1.1  lukem {
    118      1.1  lukem     int check_space;
    119      1.1  lukem     ucs4_t c1, c2;
    120      1.1  lukem     unsigned long count;
    121      1.1  lukem     _utbm_char_t *cp;
    122      1.1  lukem 
    123      1.1  lukem     /*
    124      1.1  lukem      * Set the potential match endpoint first.
    125      1.1  lukem      */
    126      1.1  lukem     *match_end = (start - text) + 1;
    127      1.1  lukem 
    128      1.1  lukem     c1 = *start;
    129      1.1  lukem     c2 = (start + 1 < end) ? *(start + 1) : ~0;
    130      1.1  lukem     if (0xd800 <= c1 && c1 <= 0xdbff && 0xdc00 <= c2 && c2 <= 0xdfff) {
    131      1.1  lukem         c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
    132      1.1  lukem         /*
    133      1.1  lukem          * Adjust the match end point to occur after the UTF-16 character.
    134      1.1  lukem          */
    135      1.1  lukem         *match_end = *match_end + 1;
    136      1.1  lukem     }
    137      1.1  lukem 
    138      1.1  lukem     if (pat->pat_used == 1) {
    139      1.1  lukem         *match_start = start - text;
    140      1.1  lukem         return 1;
    141      1.1  lukem     }
    142      1.1  lukem 
    143      1.1  lukem     /*
    144      1.1  lukem      * Compare backward.
    145      1.1  lukem      */
    146      1.1  lukem     cp = pat->pat + (pat->pat_used - 1);
    147      1.1  lukem 
    148      1.1  lukem     for (count = pat->patlen; start > text && count > 0;) {
    149      1.1  lukem         /*
    150      1.1  lukem          * Ignore non-spacing characters if indicated.
    151      1.1  lukem          */
    152      1.1  lukem         if (pat->flags & UTBM_IGNORE_NONSPACING) {
    153      1.1  lukem             while (start > text && _utbm_nonspacing(c1)) {
    154      1.1  lukem                 c2 = *--start;
    155      1.1  lukem                 c1 = (start - 1 > text) ? *(start - 1) : ~0;
    156      1.1  lukem                 if (0xdc00 <= c2 && c2 <= 0xdfff &&
    157      1.1  lukem                     0xd800 <= c1 && c1 <= 0xdbff) {
    158      1.1  lukem                     c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
    159      1.1  lukem                     start--;
    160      1.1  lukem                 } else
    161      1.1  lukem                   c1 = c2;
    162      1.1  lukem             }
    163      1.1  lukem         }
    164      1.1  lukem 
    165      1.1  lukem         /*
    166      1.1  lukem          * Handle space compression if indicated.
    167      1.1  lukem          */
    168      1.1  lukem         if (pat->flags & UTBM_SPACE_COMPRESS) {
    169      1.1  lukem             check_space = 0;
    170      1.1  lukem             while (start > text &&
    171      1.1  lukem                    (_utbm_isspace(c1, 1) || _utbm_iscntrl(c1))) {
    172      1.1  lukem                 check_space = _utbm_isspace(c1, 1);
    173      1.1  lukem                 c2 = *--start;
    174      1.1  lukem                 c1 = (start - 1 > text) ? *(start - 1) : ~0;
    175      1.1  lukem                 if (0xdc00 <= c2 && c2 <= 0xdfff &&
    176      1.1  lukem                     0xd800 <= c1 && c1 <= 0xdbff) {
    177      1.1  lukem                     c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
    178      1.1  lukem                     start--;
    179      1.1  lukem                 } else
    180      1.1  lukem                   c1 = c2;
    181      1.1  lukem             }
    182      1.1  lukem             /*
    183      1.1  lukem              * Handle things if space compression was indicated and one or
    184      1.1  lukem              * more member characters were found.
    185      1.1  lukem              */
    186      1.1  lukem             if (check_space) {
    187      1.1  lukem                 if (cp->uc != ' ')
    188      1.1  lukem                   return 0;
    189      1.1  lukem                 cp--;
    190      1.1  lukem                 count--;
    191      1.1  lukem             }
    192      1.1  lukem         }
    193      1.1  lukem 
    194      1.1  lukem         /*
    195      1.1  lukem          * Handle the normal comparison cases.
    196      1.1  lukem          */
    197      1.1  lukem         if (count > 0 && ((c1 ^ cp->uc) & (c1 ^ cp->lc) & (c1 ^ cp->tc)))
    198      1.1  lukem           return 0;
    199      1.1  lukem 
    200      1.1  lukem         count -= (c1 >= 0x10000) ? 2 : 1;
    201      1.1  lukem         if (count > 0) {
    202      1.1  lukem             cp--;
    203      1.1  lukem 
    204      1.1  lukem             /*
    205      1.1  lukem              * Get the next preceding character.
    206      1.1  lukem              */
    207      1.1  lukem             if (start > text) {
    208      1.1  lukem                 c2 = *--start;
    209      1.1  lukem                 c1 = (start - 1 > text) ? *(start - 1) : ~0;
    210      1.1  lukem                 if (0xdc00 <= c2 && c2 <= 0xdfff &&
    211      1.1  lukem                     0xd800 <= c1 && c1 <= 0xdbff) {
    212      1.1  lukem                     c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
    213      1.1  lukem                     start--;
    214      1.1  lukem                 } else
    215      1.1  lukem                   c1 = c2;
    216      1.1  lukem             }
    217      1.1  lukem         }
    218      1.1  lukem     }
    219      1.1  lukem 
    220      1.1  lukem     /*
    221      1.1  lukem      * Set the match start position.
    222      1.1  lukem      */
    223      1.1  lukem     *match_start = start - text;
    224      1.1  lukem     return 1;
    225      1.1  lukem }
    226      1.1  lukem 
    227      1.1  lukem /*************************************************************************
    228      1.1  lukem  *
    229      1.1  lukem  * API.
    230      1.1  lukem  *
    231      1.1  lukem  *************************************************************************/
    232      1.1  lukem 
    233      1.1  lukem utbm_pattern_t
    234      1.1  lukem utbm_create_pattern(void)
    235      1.1  lukem {
    236      1.1  lukem     utbm_pattern_t p;
    237      1.1  lukem 
    238      1.1  lukem     p = (utbm_pattern_t) malloc(sizeof(_utbm_pattern_t));
    239      1.1  lukem     (void) memset((char *) p, '\0', sizeof(_utbm_pattern_t));
    240      1.1  lukem     return p;
    241      1.1  lukem }
    242      1.1  lukem 
    243      1.1  lukem void
    244      1.1  lukem utbm_free_pattern(utbm_pattern_t pattern)
    245      1.1  lukem {
    246      1.1  lukem     if (pattern == 0)
    247      1.1  lukem       return;
    248      1.1  lukem 
    249      1.1  lukem     if (pattern->pat_size > 0)
    250      1.1  lukem       free((char *) pattern->pat);
    251      1.1  lukem 
    252      1.1  lukem     if (pattern->skip_size > 0)
    253      1.1  lukem       free((char *) pattern->skip);
    254      1.1  lukem 
    255      1.1  lukem     free((char *) pattern);
    256      1.1  lukem }
    257      1.1  lukem 
    258      1.1  lukem void
    259      1.1  lukem utbm_compile(ucs2_t *pat, unsigned long patlen, unsigned long flags,
    260      1.1  lukem              utbm_pattern_t p)
    261      1.1  lukem {
    262      1.1  lukem     int have_space;
    263      1.1  lukem     unsigned long i, j, k, slen;
    264      1.1  lukem     _utbm_char_t *cp;
    265      1.1  lukem     _utbm_skip_t *sp;
    266      1.1  lukem     ucs4_t c1, c2, sentinel;
    267      1.1  lukem 
    268      1.1  lukem     if (p == 0 || pat == 0 || *pat == 0 || patlen == 0)
    269      1.1  lukem       return;
    270      1.1  lukem 
    271      1.1  lukem     /*
    272      1.1  lukem      * Reset the pattern buffer.
    273      1.1  lukem      */
    274      1.1  lukem     p->patlen = p->pat_used = p->skip_used = 0;
    275      1.1  lukem 
    276      1.1  lukem     /*
    277      1.1  lukem      * Set the flags.
    278      1.1  lukem      */
    279      1.1  lukem     p->flags = flags;
    280      1.1  lukem 
    281      1.1  lukem     /*
    282      1.1  lukem      * Initialize the extra skip flag.
    283      1.1  lukem      */
    284      1.1  lukem     p->md4 = 1;
    285      1.1  lukem 
    286      1.1  lukem     /*
    287      1.1  lukem      * Allocate more storage if necessary.
    288      1.1  lukem      */
    289      1.1  lukem     if (patlen > p->pat_size) {
    290      1.1  lukem         if (p->pat_size == 0) {
    291      1.1  lukem             p->pat = (_utbm_char_t *) malloc(sizeof(_utbm_char_t) * patlen);
    292      1.1  lukem             p->skip = (_utbm_skip_t *) malloc(sizeof(_utbm_skip_t) * patlen);
    293      1.1  lukem         } else {
    294      1.1  lukem             p->pat = (_utbm_char_t *)
    295      1.1  lukem                 realloc((char *) p->pat, sizeof(_utbm_char_t) * patlen);
    296      1.1  lukem             p->skip = (_utbm_skip_t *)
    297      1.1  lukem                 realloc((char *) p->skip, sizeof(_utbm_skip_t) * patlen);
    298      1.1  lukem         }
    299      1.1  lukem         p->pat_size = p->skip_size = patlen;
    300      1.1  lukem     }
    301      1.1  lukem 
    302      1.1  lukem     /*
    303      1.1  lukem      * Preprocess the pattern to remove controls (if specified) and determine
    304      1.1  lukem      * case.
    305      1.1  lukem      */
    306      1.1  lukem     for (have_space = 0, cp = p->pat, i = 0; i < patlen; i++) {
    307      1.1  lukem         c1 = pat[i];
    308      1.1  lukem         c2 = (i + 1 < patlen) ? pat[i + 1] : ~0;
    309      1.1  lukem         if (0xd800 <= c1 && c1 <= 0xdbff && 0xdc00 <= c2 && c2 <= 0xdfff)
    310      1.1  lukem           c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
    311      1.1  lukem 
    312      1.1  lukem         /*
    313      1.1  lukem          * Make sure the `have_space' flag is turned off if the character
    314      1.1  lukem          * is not an appropriate one.
    315      1.1  lukem          */
    316      1.1  lukem         if (!_utbm_isspace(c1, flags & UTBM_SPACE_COMPRESS))
    317      1.1  lukem           have_space = 0;
    318      1.1  lukem 
    319      1.1  lukem         /*
    320      1.1  lukem          * If non-spacing characters should be ignored, do it here.
    321      1.1  lukem          */
    322      1.1  lukem         if ((flags & UTBM_IGNORE_NONSPACING) && _utbm_nonspacing(c1))
    323      1.1  lukem           continue;
    324      1.1  lukem 
    325      1.1  lukem         /*
    326      1.1  lukem          * Check if spaces and controls need to be compressed.
    327      1.1  lukem          */
    328      1.1  lukem         if (flags & UTBM_SPACE_COMPRESS) {
    329      1.1  lukem             if (_utbm_isspace(c1, 1)) {
    330      1.1  lukem                 if (!have_space) {
    331      1.1  lukem                     /*
    332      1.1  lukem                      * Add a space and set the flag.
    333      1.1  lukem                      */
    334      1.1  lukem                     cp->uc = cp->lc = cp->tc = ' ';
    335      1.1  lukem                     cp++;
    336      1.1  lukem 
    337      1.1  lukem                     /*
    338      1.1  lukem                      * Increase the real pattern length.
    339      1.1  lukem                      */
    340      1.1  lukem                     p->patlen++;
    341      1.1  lukem                     sentinel = ' ';
    342      1.1  lukem                     have_space = 1;
    343      1.1  lukem                 }
    344      1.1  lukem                 continue;
    345      1.1  lukem             }
    346      1.1  lukem 
    347      1.1  lukem             /*
    348      1.1  lukem              * Ignore all control characters.
    349      1.1  lukem              */
    350      1.1  lukem             if (_utbm_iscntrl(c1))
    351      1.1  lukem               continue;
    352      1.1  lukem         }
    353      1.1  lukem 
    354      1.1  lukem         /*
    355      1.1  lukem          * Add the character.
    356      1.1  lukem          */
    357      1.1  lukem         if (flags & UTBM_CASEFOLD) {
    358      1.1  lukem             cp->uc = _utbm_toupper(c1);
    359      1.1  lukem             cp->lc = _utbm_tolower(c1);
    360      1.1  lukem             cp->tc = _utbm_totitle(c1);
    361      1.1  lukem         } else
    362      1.1  lukem           cp->uc = cp->lc = cp->tc = c1;
    363      1.1  lukem 
    364      1.1  lukem         /*
    365      1.1  lukem          * Set the sentinel character.
    366      1.1  lukem          */
    367      1.1  lukem         sentinel = cp->uc;
    368      1.1  lukem 
    369      1.1  lukem         /*
    370      1.1  lukem          * Move to the next character.
    371      1.1  lukem          */
    372      1.1  lukem         cp++;
    373      1.1  lukem 
    374      1.1  lukem         /*
    375      1.1  lukem          * Increase the real pattern length appropriately.
    376      1.1  lukem          */
    377      1.1  lukem         p->patlen += (c1 >= 0x10000) ? 2 : 1;
    378      1.1  lukem 
    379      1.1  lukem         /*
    380      1.1  lukem          * Increment the loop index for UTF-16 characters.
    381      1.1  lukem          */
    382      1.1  lukem         i += (c1 >= 0x10000) ? 1 : 0;
    383      1.1  lukem 
    384      1.1  lukem     }
    385      1.1  lukem 
    386      1.1  lukem     /*
    387      1.1  lukem      * Set the number of characters actually used.
    388      1.1  lukem      */
    389      1.1  lukem     p->pat_used = cp - p->pat;
    390      1.1  lukem 
    391      1.1  lukem     /*
    392      1.1  lukem      * Go through and construct the skip array and determine the actual length
    393      1.1  lukem      * of the pattern in UCS2 terms.
    394      1.1  lukem      */
    395      1.1  lukem     slen = p->patlen - 1;
    396      1.1  lukem     cp = p->pat;
    397      1.1  lukem     for (i = k = 0; i < p->pat_used; i++, cp++) {
    398      1.1  lukem         /*
    399      1.1  lukem          * Locate the character in the skip array.
    400      1.1  lukem          */
    401      1.1  lukem         for (sp = p->skip, j = 0;
    402      1.1  lukem              j < p->skip_used && sp->ch->uc != cp->uc; j++, sp++) ;
    403      1.1  lukem 
    404      1.1  lukem         /*
    405      1.1  lukem          * If the character is not found, set the new skip element and
    406      1.1  lukem          * increase the number of skip elements.
    407      1.1  lukem          */
    408      1.1  lukem         if (j == p->skip_used) {
    409      1.1  lukem             sp->ch = cp;
    410      1.1  lukem             p->skip_used++;
    411      1.1  lukem         }
    412      1.1  lukem 
    413      1.1  lukem         /*
    414      1.1  lukem          * Set the updated skip value.  If the character is UTF-16 and is
    415      1.1  lukem          * not the last one in the pattern, add one to its skip value.
    416      1.1  lukem          */
    417      1.1  lukem         sp->skip = slen - k;
    418      1.1  lukem         if (cp->uc >= 0x10000 && k + 2 < slen)
    419      1.1  lukem           sp->skip++;
    420      1.1  lukem 
    421      1.1  lukem         /*
    422      1.1  lukem          * Set the new extra skip for the sentinel character.
    423      1.1  lukem          */
    424      1.1  lukem         if (((cp->uc >= 0x10000 && k + 2 <= slen) || k + 1 <= slen) &&
    425      1.1  lukem             cp->uc == sentinel)
    426      1.1  lukem           p->md4 = slen - k;
    427      1.1  lukem 
    428      1.1  lukem         /*
    429      1.1  lukem          * Increase the actual index.
    430      1.1  lukem          */
    431      1.1  lukem         k += (cp->uc >= 0x10000) ? 2 : 1;
    432      1.1  lukem     }
    433      1.1  lukem }
    434      1.1  lukem 
    435      1.1  lukem int
    436      1.1  lukem utbm_exec(utbm_pattern_t pat, ucs2_t *text, unsigned long textlen,
    437      1.1  lukem           unsigned long *match_start, unsigned long *match_end)
    438      1.1  lukem {
    439      1.1  lukem     unsigned long k;
    440      1.1  lukem     ucs2_t *start, *end;
    441      1.1  lukem 
    442      1.1  lukem     if (pat == 0 || pat->pat_used == 0 || text == 0 || textlen == 0 ||
    443      1.1  lukem         textlen < pat->patlen)
    444      1.1  lukem       return 0;
    445      1.1  lukem 
    446      1.1  lukem     start = text + pat->patlen;
    447      1.1  lukem     end = text + textlen;
    448      1.1  lukem 
    449      1.1  lukem     /*
    450      1.1  lukem      * Adjust the start point if it points to a low surrogate.
    451      1.1  lukem      */
    452      1.1  lukem     if (0xdc00 <= *start && *start <= 0xdfff &&
    453      1.1  lukem         0xd800 <= *(start - 1) && *(start - 1) <= 0xdbff)
    454      1.1  lukem       start--;
    455      1.1  lukem 
    456      1.1  lukem     while (start < end) {
    457      1.1  lukem         while ((k = _utbm_skip(pat, start, end))) {
    458      1.1  lukem             start += k;
    459      1.1  lukem             if (start < end && 0xdc00 <= *start && *start <= 0xdfff &&
    460      1.1  lukem                 0xd800 <= *(start - 1) && *(start - 1) <= 0xdbff)
    461      1.1  lukem               start--;
    462      1.1  lukem         }
    463      1.1  lukem 
    464      1.1  lukem         if (start < end &&
    465      1.1  lukem             _utbm_match(pat, text, start, end, match_start, match_end))
    466      1.1  lukem           return 1;
    467      1.1  lukem 
    468      1.1  lukem         start += pat->md4;
    469      1.1  lukem         if (start < end && 0xdc00 <= *start && *start <= 0xdfff &&
    470      1.1  lukem             0xd800 <= *(start - 1) && *(start - 1) <= 0xdbff)
    471      1.1  lukem           start--;
    472      1.1  lukem     }
    473      1.1  lukem     return 0;
    474      1.1  lukem }
    475