Home | History | Annotate | Line # | Download | only in utbm
utbm.c revision 1.4
      1  1.3  christos /*	$NetBSD: utbm.c,v 1.4 2025/09/05 21:16:23 christos Exp $	*/
      2  1.2  christos 
      3  1.2  christos /* $OpenLDAP$ */
      4  1.1     lukem /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
      5  1.1     lukem  *
      6  1.4  christos  * Copyright 1998-2024 The OpenLDAP Foundation.
      7  1.1     lukem  * All rights reserved.
      8  1.1     lukem  *
      9  1.1     lukem  * Redistribution and use in source and binary forms, with or without
     10  1.1     lukem  * modification, are permitted only as authorized by the OpenLDAP
     11  1.1     lukem  * Public License.
     12  1.1     lukem  *
     13  1.1     lukem  * A copy of this license is available in file LICENSE in the
     14  1.1     lukem  * top-level directory of the distribution or, alternatively, at
     15  1.1     lukem  * <http://www.OpenLDAP.org/license.html>.
     16  1.1     lukem  */
     17  1.1     lukem /* Copyright 1997, 1998, 1999 Computing Research Labs,
     18  1.1     lukem  * New Mexico State University
     19  1.1     lukem  *
     20  1.1     lukem  * Permission is hereby granted, free of charge, to any person obtaining a
     21  1.1     lukem  * copy of this software and associated documentation files (the "Software"),
     22  1.1     lukem  * to deal in the Software without restriction, including without limitation
     23  1.1     lukem  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
     24  1.1     lukem  * and/or sell copies of the Software, and to permit persons to whom the
     25  1.1     lukem  * Software is furnished to do so, subject to the following conditions:
     26  1.1     lukem  *
     27  1.1     lukem  * The above copyright notice and this permission notice shall be included in
     28  1.1     lukem  * all copies or substantial portions of the Software.
     29  1.1     lukem  *
     30  1.1     lukem  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     31  1.1     lukem  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     32  1.1     lukem  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     33  1.1     lukem  * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
     34  1.1     lukem  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
     35  1.1     lukem  * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
     36  1.1     lukem  * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
     37  1.1     lukem  */
     38  1.2  christos /* Id: utbm.c,v 1.1 1999/09/21 15:45:17 mleisher Exp  */
     39  1.1     lukem 
     40  1.1     lukem /*
     41  1.1     lukem  * Assumptions:
     42  1.1     lukem  * 1. Case conversions of UTF-16 characters must also be UTF-16 characters.
     43  1.1     lukem  * 2. Case conversions are all one-to-one.
     44  1.1     lukem  * 3. Text and pattern have already been normalized in some fashion.
     45  1.1     lukem  */
     46  1.1     lukem 
     47  1.1     lukem #include <stdlib.h>
     48  1.1     lukem #include <unistd.h>
     49  1.1     lukem #include <string.h>
     50  1.1     lukem #include "utbm.h"
     51  1.1     lukem 
     52  1.1     lukem /*
     53  1.1     lukem  * Single pattern character.
     54  1.1     lukem  */
     55  1.1     lukem typedef struct {
     56  1.1     lukem     ucs4_t lc;
     57  1.1     lukem     ucs4_t uc;
     58  1.1     lukem     ucs4_t tc;
     59  1.1     lukem } _utbm_char_t;
     60  1.1     lukem 
     61  1.1     lukem typedef struct {
     62  1.1     lukem     _utbm_char_t *ch;
     63  1.1     lukem     unsigned long skip;
     64  1.1     lukem } _utbm_skip_t;
     65  1.1     lukem 
     66  1.1     lukem typedef struct _utbm_pattern_t {
     67  1.1     lukem     unsigned long flags;
     68  1.1     lukem 
     69  1.1     lukem     _utbm_char_t *pat;
     70  1.1     lukem     unsigned long pat_used;
     71  1.1     lukem     unsigned long pat_size;
     72  1.1     lukem     unsigned long patlen;
     73  1.1     lukem 
     74  1.1     lukem     _utbm_skip_t *skip;
     75  1.1     lukem     unsigned long skip_used;
     76  1.1     lukem     unsigned long skip_size;
     77  1.1     lukem 
     78  1.1     lukem     unsigned long md4;
     79  1.1     lukem } _utbm_pattern_t;
     80  1.1     lukem 
     81  1.1     lukem /*************************************************************************
     82  1.1     lukem  *
     83  1.1     lukem  * Support functions.
     84  1.1     lukem  *
     85  1.1     lukem  *************************************************************************/
     86  1.1     lukem 
     87  1.1     lukem /*
     88  1.1     lukem  * Routine to look up the skip value for a character.
     89  1.1     lukem  */
     90  1.1     lukem static unsigned long
     91  1.1     lukem _utbm_skip(utbm_pattern_t p, ucs2_t *start, ucs2_t *end)
     92  1.1     lukem {
     93  1.1     lukem     unsigned long i;
     94  1.1     lukem     ucs4_t c1, c2;
     95  1.1     lukem     _utbm_skip_t *sp;
     96  1.1     lukem 
     97  1.1     lukem     if (start >= end)
     98  1.1     lukem       return 0;
     99  1.1     lukem 
    100  1.1     lukem     c1 = *start;
    101  1.1     lukem     c2 = (start + 1 < end) ? *(start + 1) : ~0;
    102  1.1     lukem     if (0xd800 <= c1 && c1 <= 0xdbff && 0xdc00 <= c2 && c2 <= 0xdfff)
    103  1.1     lukem       c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
    104  1.1     lukem 
    105  1.1     lukem     for (i = 0, sp = p->skip; i < p->skip_used; i++, sp++) {
    106  1.1     lukem         if (!((c1 ^ sp->ch->uc) & (c1 ^ sp->ch->lc) & (c1 ^ sp->ch->tc))) {
    107  1.1     lukem             return ((unsigned long) (end - start) < sp->skip) ?
    108  1.1     lukem                 end - start : sp->skip;
    109  1.1     lukem         }
    110  1.1     lukem     }
    111  1.1     lukem     return p->patlen;
    112  1.1     lukem }
    113  1.1     lukem 
    114  1.1     lukem static int
    115  1.1     lukem _utbm_match(utbm_pattern_t pat, ucs2_t *text, ucs2_t *start, ucs2_t *end,
    116  1.1     lukem             unsigned long *match_start, unsigned long *match_end)
    117  1.1     lukem {
    118  1.1     lukem     int check_space;
    119  1.1     lukem     ucs4_t c1, c2;
    120  1.1     lukem     unsigned long count;
    121  1.1     lukem     _utbm_char_t *cp;
    122  1.1     lukem 
    123  1.1     lukem     /*
    124  1.1     lukem      * Set the potential match endpoint first.
    125  1.1     lukem      */
    126  1.1     lukem     *match_end = (start - text) + 1;
    127  1.1     lukem 
    128  1.1     lukem     c1 = *start;
    129  1.1     lukem     c2 = (start + 1 < end) ? *(start + 1) : ~0;
    130  1.1     lukem     if (0xd800 <= c1 && c1 <= 0xdbff && 0xdc00 <= c2 && c2 <= 0xdfff) {
    131  1.1     lukem         c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
    132  1.1     lukem         /*
    133  1.1     lukem          * Adjust the match end point to occur after the UTF-16 character.
    134  1.1     lukem          */
    135  1.1     lukem         *match_end = *match_end + 1;
    136  1.1     lukem     }
    137  1.1     lukem 
    138  1.1     lukem     if (pat->pat_used == 1) {
    139  1.1     lukem         *match_start = start - text;
    140  1.1     lukem         return 1;
    141  1.1     lukem     }
    142  1.1     lukem 
    143  1.1     lukem     /*
    144  1.1     lukem      * Compare backward.
    145  1.1     lukem      */
    146  1.1     lukem     cp = pat->pat + (pat->pat_used - 1);
    147  1.1     lukem 
    148  1.1     lukem     for (count = pat->patlen; start > text && count > 0;) {
    149  1.1     lukem         /*
    150  1.1     lukem          * Ignore non-spacing characters if indicated.
    151  1.1     lukem          */
    152  1.1     lukem         if (pat->flags & UTBM_IGNORE_NONSPACING) {
    153  1.1     lukem             while (start > text && _utbm_nonspacing(c1)) {
    154  1.1     lukem                 c2 = *--start;
    155  1.1     lukem                 c1 = (start - 1 > text) ? *(start - 1) : ~0;
    156  1.1     lukem                 if (0xdc00 <= c2 && c2 <= 0xdfff &&
    157  1.1     lukem                     0xd800 <= c1 && c1 <= 0xdbff) {
    158  1.1     lukem                     c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
    159  1.1     lukem                     start--;
    160  1.1     lukem                 } else
    161  1.1     lukem                   c1 = c2;
    162  1.1     lukem             }
    163  1.1     lukem         }
    164  1.1     lukem 
    165  1.1     lukem         /*
    166  1.1     lukem          * Handle space compression if indicated.
    167  1.1     lukem          */
    168  1.1     lukem         if (pat->flags & UTBM_SPACE_COMPRESS) {
    169  1.1     lukem             check_space = 0;
    170  1.1     lukem             while (start > text &&
    171  1.1     lukem                    (_utbm_isspace(c1, 1) || _utbm_iscntrl(c1))) {
    172  1.1     lukem                 check_space = _utbm_isspace(c1, 1);
    173  1.1     lukem                 c2 = *--start;
    174  1.1     lukem                 c1 = (start - 1 > text) ? *(start - 1) : ~0;
    175  1.1     lukem                 if (0xdc00 <= c2 && c2 <= 0xdfff &&
    176  1.1     lukem                     0xd800 <= c1 && c1 <= 0xdbff) {
    177  1.1     lukem                     c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
    178  1.1     lukem                     start--;
    179  1.1     lukem                 } else
    180  1.1     lukem                   c1 = c2;
    181  1.1     lukem             }
    182  1.1     lukem             /*
    183  1.1     lukem              * Handle things if space compression was indicated and one or
    184  1.1     lukem              * more member characters were found.
    185  1.1     lukem              */
    186  1.1     lukem             if (check_space) {
    187  1.1     lukem                 if (cp->uc != ' ')
    188  1.1     lukem                   return 0;
    189  1.1     lukem                 cp--;
    190  1.1     lukem                 count--;
    191  1.1     lukem             }
    192  1.1     lukem         }
    193  1.1     lukem 
    194  1.1     lukem         /*
    195  1.1     lukem          * Handle the normal comparison cases.
    196  1.1     lukem          */
    197  1.1     lukem         if (count > 0 && ((c1 ^ cp->uc) & (c1 ^ cp->lc) & (c1 ^ cp->tc)))
    198  1.1     lukem           return 0;
    199  1.1     lukem 
    200  1.1     lukem         count -= (c1 >= 0x10000) ? 2 : 1;
    201  1.1     lukem         if (count > 0) {
    202  1.1     lukem             cp--;
    203  1.1     lukem 
    204  1.1     lukem             /*
    205  1.1     lukem              * Get the next preceding character.
    206  1.1     lukem              */
    207  1.1     lukem             if (start > text) {
    208  1.1     lukem                 c2 = *--start;
    209  1.1     lukem                 c1 = (start - 1 > text) ? *(start - 1) : ~0;
    210  1.1     lukem                 if (0xdc00 <= c2 && c2 <= 0xdfff &&
    211  1.1     lukem                     0xd800 <= c1 && c1 <= 0xdbff) {
    212  1.1     lukem                     c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
    213  1.1     lukem                     start--;
    214  1.1     lukem                 } else
    215  1.1     lukem                   c1 = c2;
    216  1.1     lukem             }
    217  1.1     lukem         }
    218  1.1     lukem     }
    219  1.1     lukem 
    220  1.1     lukem     /*
    221  1.1     lukem      * Set the match start position.
    222  1.1     lukem      */
    223  1.1     lukem     *match_start = start - text;
    224  1.1     lukem     return 1;
    225  1.1     lukem }
    226  1.1     lukem 
    227  1.1     lukem /*************************************************************************
    228  1.1     lukem  *
    229  1.1     lukem  * API.
    230  1.1     lukem  *
    231  1.1     lukem  *************************************************************************/
    232  1.1     lukem 
    233  1.1     lukem utbm_pattern_t
    234  1.1     lukem utbm_create_pattern(void)
    235  1.1     lukem {
    236  1.1     lukem     utbm_pattern_t p;
    237  1.1     lukem 
    238  1.1     lukem     p = (utbm_pattern_t) malloc(sizeof(_utbm_pattern_t));
    239  1.1     lukem     (void) memset((char *) p, '\0', sizeof(_utbm_pattern_t));
    240  1.1     lukem     return p;
    241  1.1     lukem }
    242  1.1     lukem 
    243  1.1     lukem void
    244  1.1     lukem utbm_free_pattern(utbm_pattern_t pattern)
    245  1.1     lukem {
    246  1.1     lukem     if (pattern == 0)
    247  1.1     lukem       return;
    248  1.1     lukem 
    249  1.1     lukem     if (pattern->pat_size > 0)
    250  1.1     lukem       free((char *) pattern->pat);
    251  1.1     lukem 
    252  1.1     lukem     if (pattern->skip_size > 0)
    253  1.1     lukem       free((char *) pattern->skip);
    254  1.1     lukem 
    255  1.1     lukem     free((char *) pattern);
    256  1.1     lukem }
    257  1.1     lukem 
    258  1.1     lukem void
    259  1.1     lukem utbm_compile(ucs2_t *pat, unsigned long patlen, unsigned long flags,
    260  1.1     lukem              utbm_pattern_t p)
    261  1.1     lukem {
    262  1.1     lukem     int have_space;
    263  1.1     lukem     unsigned long i, j, k, slen;
    264  1.1     lukem     _utbm_char_t *cp;
    265  1.1     lukem     _utbm_skip_t *sp;
    266  1.1     lukem     ucs4_t c1, c2, sentinel;
    267  1.1     lukem 
    268  1.1     lukem     if (p == 0 || pat == 0 || *pat == 0 || patlen == 0)
    269  1.1     lukem       return;
    270  1.1     lukem 
    271  1.1     lukem     /*
    272  1.1     lukem      * Reset the pattern buffer.
    273  1.1     lukem      */
    274  1.1     lukem     p->patlen = p->pat_used = p->skip_used = 0;
    275  1.1     lukem 
    276  1.1     lukem     /*
    277  1.1     lukem      * Set the flags.
    278  1.1     lukem      */
    279  1.1     lukem     p->flags = flags;
    280  1.1     lukem 
    281  1.1     lukem     /*
    282  1.1     lukem      * Initialize the extra skip flag.
    283  1.1     lukem      */
    284  1.1     lukem     p->md4 = 1;
    285  1.1     lukem 
    286  1.1     lukem     /*
    287  1.1     lukem      * Allocate more storage if necessary.
    288  1.1     lukem      */
    289  1.1     lukem     if (patlen > p->pat_size) {
    290  1.1     lukem         if (p->pat_size == 0) {
    291  1.1     lukem             p->pat = (_utbm_char_t *) malloc(sizeof(_utbm_char_t) * patlen);
    292  1.1     lukem             p->skip = (_utbm_skip_t *) malloc(sizeof(_utbm_skip_t) * patlen);
    293  1.1     lukem         } else {
    294  1.1     lukem             p->pat = (_utbm_char_t *)
    295  1.1     lukem                 realloc((char *) p->pat, sizeof(_utbm_char_t) * patlen);
    296  1.1     lukem             p->skip = (_utbm_skip_t *)
    297  1.1     lukem                 realloc((char *) p->skip, sizeof(_utbm_skip_t) * patlen);
    298  1.1     lukem         }
    299  1.1     lukem         p->pat_size = p->skip_size = patlen;
    300  1.1     lukem     }
    301  1.1     lukem 
    302  1.1     lukem     /*
    303  1.1     lukem      * Preprocess the pattern to remove controls (if specified) and determine
    304  1.1     lukem      * case.
    305  1.1     lukem      */
    306  1.1     lukem     for (have_space = 0, cp = p->pat, i = 0; i < patlen; i++) {
    307  1.1     lukem         c1 = pat[i];
    308  1.1     lukem         c2 = (i + 1 < patlen) ? pat[i + 1] : ~0;
    309  1.1     lukem         if (0xd800 <= c1 && c1 <= 0xdbff && 0xdc00 <= c2 && c2 <= 0xdfff)
    310  1.1     lukem           c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
    311  1.1     lukem 
    312  1.1     lukem         /*
    313  1.1     lukem          * Make sure the `have_space' flag is turned off if the character
    314  1.1     lukem          * is not an appropriate one.
    315  1.1     lukem          */
    316  1.1     lukem         if (!_utbm_isspace(c1, flags & UTBM_SPACE_COMPRESS))
    317  1.1     lukem           have_space = 0;
    318  1.1     lukem 
    319  1.1     lukem         /*
    320  1.1     lukem          * If non-spacing characters should be ignored, do it here.
    321  1.1     lukem          */
    322  1.1     lukem         if ((flags & UTBM_IGNORE_NONSPACING) && _utbm_nonspacing(c1))
    323  1.1     lukem           continue;
    324  1.1     lukem 
    325  1.1     lukem         /*
    326  1.1     lukem          * Check if spaces and controls need to be compressed.
    327  1.1     lukem          */
    328  1.1     lukem         if (flags & UTBM_SPACE_COMPRESS) {
    329  1.1     lukem             if (_utbm_isspace(c1, 1)) {
    330  1.1     lukem                 if (!have_space) {
    331  1.1     lukem                     /*
    332  1.1     lukem                      * Add a space and set the flag.
    333  1.1     lukem                      */
    334  1.1     lukem                     cp->uc = cp->lc = cp->tc = ' ';
    335  1.1     lukem                     cp++;
    336  1.1     lukem 
    337  1.1     lukem                     /*
    338  1.1     lukem                      * Increase the real pattern length.
    339  1.1     lukem                      */
    340  1.1     lukem                     p->patlen++;
    341  1.1     lukem                     sentinel = ' ';
    342  1.1     lukem                     have_space = 1;
    343  1.1     lukem                 }
    344  1.1     lukem                 continue;
    345  1.1     lukem             }
    346  1.1     lukem 
    347  1.1     lukem             /*
    348  1.1     lukem              * Ignore all control characters.
    349  1.1     lukem              */
    350  1.1     lukem             if (_utbm_iscntrl(c1))
    351  1.1     lukem               continue;
    352  1.1     lukem         }
    353  1.1     lukem 
    354  1.1     lukem         /*
    355  1.1     lukem          * Add the character.
    356  1.1     lukem          */
    357  1.1     lukem         if (flags & UTBM_CASEFOLD) {
    358  1.1     lukem             cp->uc = _utbm_toupper(c1);
    359  1.1     lukem             cp->lc = _utbm_tolower(c1);
    360  1.1     lukem             cp->tc = _utbm_totitle(c1);
    361  1.1     lukem         } else
    362  1.1     lukem           cp->uc = cp->lc = cp->tc = c1;
    363  1.1     lukem 
    364  1.1     lukem         /*
    365  1.1     lukem          * Set the sentinel character.
    366  1.1     lukem          */
    367  1.1     lukem         sentinel = cp->uc;
    368  1.1     lukem 
    369  1.1     lukem         /*
    370  1.1     lukem          * Move to the next character.
    371  1.1     lukem          */
    372  1.1     lukem         cp++;
    373  1.1     lukem 
    374  1.1     lukem         /*
    375  1.1     lukem          * Increase the real pattern length appropriately.
    376  1.1     lukem          */
    377  1.1     lukem         p->patlen += (c1 >= 0x10000) ? 2 : 1;
    378  1.1     lukem 
    379  1.1     lukem         /*
    380  1.1     lukem          * Increment the loop index for UTF-16 characters.
    381  1.1     lukem          */
    382  1.1     lukem         i += (c1 >= 0x10000) ? 1 : 0;
    383  1.1     lukem 
    384  1.1     lukem     }
    385  1.1     lukem 
    386  1.1     lukem     /*
    387  1.1     lukem      * Set the number of characters actually used.
    388  1.1     lukem      */
    389  1.1     lukem     p->pat_used = cp - p->pat;
    390  1.1     lukem 
    391  1.1     lukem     /*
    392  1.1     lukem      * Go through and construct the skip array and determine the actual length
    393  1.1     lukem      * of the pattern in UCS2 terms.
    394  1.1     lukem      */
    395  1.1     lukem     slen = p->patlen - 1;
    396  1.1     lukem     cp = p->pat;
    397  1.1     lukem     for (i = k = 0; i < p->pat_used; i++, cp++) {
    398  1.1     lukem         /*
    399  1.1     lukem          * Locate the character in the skip array.
    400  1.1     lukem          */
    401  1.1     lukem         for (sp = p->skip, j = 0;
    402  1.1     lukem              j < p->skip_used && sp->ch->uc != cp->uc; j++, sp++) ;
    403  1.1     lukem 
    404  1.1     lukem         /*
    405  1.1     lukem          * If the character is not found, set the new skip element and
    406  1.1     lukem          * increase the number of skip elements.
    407  1.1     lukem          */
    408  1.1     lukem         if (j == p->skip_used) {
    409  1.1     lukem             sp->ch = cp;
    410  1.1     lukem             p->skip_used++;
    411  1.1     lukem         }
    412  1.1     lukem 
    413  1.1     lukem         /*
    414  1.1     lukem          * Set the updated skip value.  If the character is UTF-16 and is
    415  1.1     lukem          * not the last one in the pattern, add one to its skip value.
    416  1.1     lukem          */
    417  1.1     lukem         sp->skip = slen - k;
    418  1.1     lukem         if (cp->uc >= 0x10000 && k + 2 < slen)
    419  1.1     lukem           sp->skip++;
    420  1.1     lukem 
    421  1.1     lukem         /*
    422  1.1     lukem          * Set the new extra skip for the sentinel character.
    423  1.1     lukem          */
    424  1.1     lukem         if (((cp->uc >= 0x10000 && k + 2 <= slen) || k + 1 <= slen) &&
    425  1.1     lukem             cp->uc == sentinel)
    426  1.1     lukem           p->md4 = slen - k;
    427  1.1     lukem 
    428  1.1     lukem         /*
    429  1.1     lukem          * Increase the actual index.
    430  1.1     lukem          */
    431  1.1     lukem         k += (cp->uc >= 0x10000) ? 2 : 1;
    432  1.1     lukem     }
    433  1.1     lukem }
    434  1.1     lukem 
    435  1.1     lukem int
    436  1.1     lukem utbm_exec(utbm_pattern_t pat, ucs2_t *text, unsigned long textlen,
    437  1.1     lukem           unsigned long *match_start, unsigned long *match_end)
    438  1.1     lukem {
    439  1.1     lukem     unsigned long k;
    440  1.1     lukem     ucs2_t *start, *end;
    441  1.1     lukem 
    442  1.1     lukem     if (pat == 0 || pat->pat_used == 0 || text == 0 || textlen == 0 ||
    443  1.1     lukem         textlen < pat->patlen)
    444  1.1     lukem       return 0;
    445  1.1     lukem 
    446  1.1     lukem     start = text + pat->patlen;
    447  1.1     lukem     end = text + textlen;
    448  1.1     lukem 
    449  1.1     lukem     /*
    450  1.1     lukem      * Adjust the start point if it points to a low surrogate.
    451  1.1     lukem      */
    452  1.1     lukem     if (0xdc00 <= *start && *start <= 0xdfff &&
    453  1.1     lukem         0xd800 <= *(start - 1) && *(start - 1) <= 0xdbff)
    454  1.1     lukem       start--;
    455  1.1     lukem 
    456  1.1     lukem     while (start < end) {
    457  1.1     lukem         while ((k = _utbm_skip(pat, start, end))) {
    458  1.1     lukem             start += k;
    459  1.1     lukem             if (start < end && 0xdc00 <= *start && *start <= 0xdfff &&
    460  1.1     lukem                 0xd800 <= *(start - 1) && *(start - 1) <= 0xdbff)
    461  1.1     lukem               start--;
    462  1.1     lukem         }
    463  1.1     lukem 
    464  1.1     lukem         if (start < end &&
    465  1.1     lukem             _utbm_match(pat, text, start, end, match_start, match_end))
    466  1.1     lukem           return 1;
    467  1.1     lukem 
    468  1.1     lukem         start += pat->md4;
    469  1.1     lukem         if (start < end && 0xdc00 <= *start && *start <= 0xdfff &&
    470  1.1     lukem             0xd800 <= *(start - 1) && *(start - 1) <= 0xdbff)
    471  1.1     lukem           start--;
    472  1.1     lukem     }
    473  1.1     lukem     return 0;
    474  1.1     lukem }
    475