Home | History | Annotate | Line # | Download | only in utbm
utbm.c revision 1.1.1.1.8.2
      1 /* $OpenLDAP: pkg/ldap/libraries/liblunicode/utbm/utbm.c,v 1.7.2.3 2008/02/11 23:26:42 kurt Exp $ */
      2 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
      3  *
      4  * Copyright 1998-2008 The OpenLDAP Foundation.
      5  * All rights reserved.
      6  *
      7  * Redistribution and use in source and binary forms, with or without
      8  * modification, are permitted only as authorized by the OpenLDAP
      9  * Public License.
     10  *
     11  * A copy of this license is available in file LICENSE in the
     12  * top-level directory of the distribution or, alternatively, at
     13  * <http://www.OpenLDAP.org/license.html>.
     14  */
     15 /* Copyright 1997, 1998, 1999 Computing Research Labs,
     16  * New Mexico State University
     17  *
     18  * Permission is hereby granted, free of charge, to any person obtaining a
     19  * copy of this software and associated documentation files (the "Software"),
     20  * to deal in the Software without restriction, including without limitation
     21  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
     22  * and/or sell copies of the Software, and to permit persons to whom the
     23  * Software is furnished to do so, subject to the following conditions:
     24  *
     25  * The above copyright notice and this permission notice shall be included in
     26  * all copies or substantial portions of the Software.
     27  *
     28  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     29  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     30  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     31  * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
     32  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
     33  * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
     34  * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
     35  */
     36 /* $Id: utbm.c,v 1.1.1.1.8.2 2008/05/22 14:20:37 lukem Exp $ */
     37 
     38 /*
     39  * Assumptions:
     40  * 1. Case conversions of UTF-16 characters must also be UTF-16 characters.
     41  * 2. Case conversions are all one-to-one.
     42  * 3. Text and pattern have already been normalized in some fashion.
     43  */
     44 
     45 #include <stdlib.h>
     46 #include <unistd.h>
     47 #include <string.h>
     48 #include "utbm.h"
     49 
     50 /*
     51  * Single pattern character.
     52  */
     53 typedef struct {
     54     ucs4_t lc;
     55     ucs4_t uc;
     56     ucs4_t tc;
     57 } _utbm_char_t;
     58 
     59 typedef struct {
     60     _utbm_char_t *ch;
     61     unsigned long skip;
     62 } _utbm_skip_t;
     63 
     64 typedef struct _utbm_pattern_t {
     65     unsigned long flags;
     66 
     67     _utbm_char_t *pat;
     68     unsigned long pat_used;
     69     unsigned long pat_size;
     70     unsigned long patlen;
     71 
     72     _utbm_skip_t *skip;
     73     unsigned long skip_used;
     74     unsigned long skip_size;
     75 
     76     unsigned long md4;
     77 } _utbm_pattern_t;
     78 
     79 /*************************************************************************
     80  *
     81  * Support functions.
     82  *
     83  *************************************************************************/
     84 
     85 /*
     86  * Routine to look up the skip value for a character.
     87  */
     88 static unsigned long
     89 _utbm_skip(utbm_pattern_t p, ucs2_t *start, ucs2_t *end)
     90 {
     91     unsigned long i;
     92     ucs4_t c1, c2;
     93     _utbm_skip_t *sp;
     94 
     95     if (start >= end)
     96       return 0;
     97 
     98     c1 = *start;
     99     c2 = (start + 1 < end) ? *(start + 1) : ~0;
    100     if (0xd800 <= c1 && c1 <= 0xdbff && 0xdc00 <= c2 && c2 <= 0xdfff)
    101       c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
    102 
    103     for (i = 0, sp = p->skip; i < p->skip_used; i++, sp++) {
    104         if (!((c1 ^ sp->ch->uc) & (c1 ^ sp->ch->lc) & (c1 ^ sp->ch->tc))) {
    105             return ((unsigned long) (end - start) < sp->skip) ?
    106                 end - start : sp->skip;
    107         }
    108     }
    109     return p->patlen;
    110 }
    111 
    112 static int
    113 _utbm_match(utbm_pattern_t pat, ucs2_t *text, ucs2_t *start, ucs2_t *end,
    114             unsigned long *match_start, unsigned long *match_end)
    115 {
    116     int check_space;
    117     ucs4_t c1, c2;
    118     unsigned long count;
    119     _utbm_char_t *cp;
    120 
    121     /*
    122      * Set the potential match endpoint first.
    123      */
    124     *match_end = (start - text) + 1;
    125 
    126     c1 = *start;
    127     c2 = (start + 1 < end) ? *(start + 1) : ~0;
    128     if (0xd800 <= c1 && c1 <= 0xdbff && 0xdc00 <= c2 && c2 <= 0xdfff) {
    129         c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
    130         /*
    131          * Adjust the match end point to occur after the UTF-16 character.
    132          */
    133         *match_end = *match_end + 1;
    134     }
    135 
    136     if (pat->pat_used == 1) {
    137         *match_start = start - text;
    138         return 1;
    139     }
    140 
    141     /*
    142      * Compare backward.
    143      */
    144     cp = pat->pat + (pat->pat_used - 1);
    145 
    146     for (count = pat->patlen; start > text && count > 0;) {
    147         /*
    148          * Ignore non-spacing characters if indicated.
    149          */
    150         if (pat->flags & UTBM_IGNORE_NONSPACING) {
    151             while (start > text && _utbm_nonspacing(c1)) {
    152                 c2 = *--start;
    153                 c1 = (start - 1 > text) ? *(start - 1) : ~0;
    154                 if (0xdc00 <= c2 && c2 <= 0xdfff &&
    155                     0xd800 <= c1 && c1 <= 0xdbff) {
    156                     c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
    157                     start--;
    158                 } else
    159                   c1 = c2;
    160             }
    161         }
    162 
    163         /*
    164          * Handle space compression if indicated.
    165          */
    166         if (pat->flags & UTBM_SPACE_COMPRESS) {
    167             check_space = 0;
    168             while (start > text &&
    169                    (_utbm_isspace(c1, 1) || _utbm_iscntrl(c1))) {
    170                 check_space = _utbm_isspace(c1, 1);
    171                 c2 = *--start;
    172                 c1 = (start - 1 > text) ? *(start - 1) : ~0;
    173                 if (0xdc00 <= c2 && c2 <= 0xdfff &&
    174                     0xd800 <= c1 && c1 <= 0xdbff) {
    175                     c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
    176                     start--;
    177                 } else
    178                   c1 = c2;
    179             }
    180             /*
    181              * Handle things if space compression was indicated and one or
    182              * more member characters were found.
    183              */
    184             if (check_space) {
    185                 if (cp->uc != ' ')
    186                   return 0;
    187                 cp--;
    188                 count--;
    189             }
    190         }
    191 
    192         /*
    193          * Handle the normal comparison cases.
    194          */
    195         if (count > 0 && ((c1 ^ cp->uc) & (c1 ^ cp->lc) & (c1 ^ cp->tc)))
    196           return 0;
    197 
    198         count -= (c1 >= 0x10000) ? 2 : 1;
    199         if (count > 0) {
    200             cp--;
    201 
    202             /*
    203              * Get the next preceding character.
    204              */
    205             if (start > text) {
    206                 c2 = *--start;
    207                 c1 = (start - 1 > text) ? *(start - 1) : ~0;
    208                 if (0xdc00 <= c2 && c2 <= 0xdfff &&
    209                     0xd800 <= c1 && c1 <= 0xdbff) {
    210                     c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
    211                     start--;
    212                 } else
    213                   c1 = c2;
    214             }
    215         }
    216     }
    217 
    218     /*
    219      * Set the match start position.
    220      */
    221     *match_start = start - text;
    222     return 1;
    223 }
    224 
    225 /*************************************************************************
    226  *
    227  * API.
    228  *
    229  *************************************************************************/
    230 
    231 utbm_pattern_t
    232 utbm_create_pattern(void)
    233 {
    234     utbm_pattern_t p;
    235 
    236     p = (utbm_pattern_t) malloc(sizeof(_utbm_pattern_t));
    237     (void) memset((char *) p, '\0', sizeof(_utbm_pattern_t));
    238     return p;
    239 }
    240 
    241 void
    242 utbm_free_pattern(utbm_pattern_t pattern)
    243 {
    244     if (pattern == 0)
    245       return;
    246 
    247     if (pattern->pat_size > 0)
    248       free((char *) pattern->pat);
    249 
    250     if (pattern->skip_size > 0)
    251       free((char *) pattern->skip);
    252 
    253     free((char *) pattern);
    254 }
    255 
    256 void
    257 utbm_compile(ucs2_t *pat, unsigned long patlen, unsigned long flags,
    258              utbm_pattern_t p)
    259 {
    260     int have_space;
    261     unsigned long i, j, k, slen;
    262     _utbm_char_t *cp;
    263     _utbm_skip_t *sp;
    264     ucs4_t c1, c2, sentinel;
    265 
    266     if (p == 0 || pat == 0 || *pat == 0 || patlen == 0)
    267       return;
    268 
    269     /*
    270      * Reset the pattern buffer.
    271      */
    272     p->patlen = p->pat_used = p->skip_used = 0;
    273 
    274     /*
    275      * Set the flags.
    276      */
    277     p->flags = flags;
    278 
    279     /*
    280      * Initialize the extra skip flag.
    281      */
    282     p->md4 = 1;
    283 
    284     /*
    285      * Allocate more storage if necessary.
    286      */
    287     if (patlen > p->pat_size) {
    288         if (p->pat_size == 0) {
    289             p->pat = (_utbm_char_t *) malloc(sizeof(_utbm_char_t) * patlen);
    290             p->skip = (_utbm_skip_t *) malloc(sizeof(_utbm_skip_t) * patlen);
    291         } else {
    292             p->pat = (_utbm_char_t *)
    293                 realloc((char *) p->pat, sizeof(_utbm_char_t) * patlen);
    294             p->skip = (_utbm_skip_t *)
    295                 realloc((char *) p->skip, sizeof(_utbm_skip_t) * patlen);
    296         }
    297         p->pat_size = p->skip_size = patlen;
    298     }
    299 
    300     /*
    301      * Preprocess the pattern to remove controls (if specified) and determine
    302      * case.
    303      */
    304     for (have_space = 0, cp = p->pat, i = 0; i < patlen; i++) {
    305         c1 = pat[i];
    306         c2 = (i + 1 < patlen) ? pat[i + 1] : ~0;
    307         if (0xd800 <= c1 && c1 <= 0xdbff && 0xdc00 <= c2 && c2 <= 0xdfff)
    308           c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
    309 
    310         /*
    311          * Make sure the `have_space' flag is turned off if the character
    312          * is not an appropriate one.
    313          */
    314         if (!_utbm_isspace(c1, flags & UTBM_SPACE_COMPRESS))
    315           have_space = 0;
    316 
    317         /*
    318          * If non-spacing characters should be ignored, do it here.
    319          */
    320         if ((flags & UTBM_IGNORE_NONSPACING) && _utbm_nonspacing(c1))
    321           continue;
    322 
    323         /*
    324          * Check if spaces and controls need to be compressed.
    325          */
    326         if (flags & UTBM_SPACE_COMPRESS) {
    327             if (_utbm_isspace(c1, 1)) {
    328                 if (!have_space) {
    329                     /*
    330                      * Add a space and set the flag.
    331                      */
    332                     cp->uc = cp->lc = cp->tc = ' ';
    333                     cp++;
    334 
    335                     /*
    336                      * Increase the real pattern length.
    337                      */
    338                     p->patlen++;
    339                     sentinel = ' ';
    340                     have_space = 1;
    341                 }
    342                 continue;
    343             }
    344 
    345             /*
    346              * Ignore all control characters.
    347              */
    348             if (_utbm_iscntrl(c1))
    349               continue;
    350         }
    351 
    352         /*
    353          * Add the character.
    354          */
    355         if (flags & UTBM_CASEFOLD) {
    356             cp->uc = _utbm_toupper(c1);
    357             cp->lc = _utbm_tolower(c1);
    358             cp->tc = _utbm_totitle(c1);
    359         } else
    360           cp->uc = cp->lc = cp->tc = c1;
    361 
    362         /*
    363          * Set the sentinel character.
    364          */
    365         sentinel = cp->uc;
    366 
    367         /*
    368          * Move to the next character.
    369          */
    370         cp++;
    371 
    372         /*
    373          * Increase the real pattern length appropriately.
    374          */
    375         p->patlen += (c1 >= 0x10000) ? 2 : 1;
    376 
    377         /*
    378          * Increment the loop index for UTF-16 characters.
    379          */
    380         i += (c1 >= 0x10000) ? 1 : 0;
    381 
    382     }
    383 
    384     /*
    385      * Set the number of characters actually used.
    386      */
    387     p->pat_used = cp - p->pat;
    388 
    389     /*
    390      * Go through and construct the skip array and determine the actual length
    391      * of the pattern in UCS2 terms.
    392      */
    393     slen = p->patlen - 1;
    394     cp = p->pat;
    395     for (i = k = 0; i < p->pat_used; i++, cp++) {
    396         /*
    397          * Locate the character in the skip array.
    398          */
    399         for (sp = p->skip, j = 0;
    400              j < p->skip_used && sp->ch->uc != cp->uc; j++, sp++) ;
    401 
    402         /*
    403          * If the character is not found, set the new skip element and
    404          * increase the number of skip elements.
    405          */
    406         if (j == p->skip_used) {
    407             sp->ch = cp;
    408             p->skip_used++;
    409         }
    410 
    411         /*
    412          * Set the updated skip value.  If the character is UTF-16 and is
    413          * not the last one in the pattern, add one to its skip value.
    414          */
    415         sp->skip = slen - k;
    416         if (cp->uc >= 0x10000 && k + 2 < slen)
    417           sp->skip++;
    418 
    419         /*
    420          * Set the new extra skip for the sentinel character.
    421          */
    422         if (((cp->uc >= 0x10000 && k + 2 <= slen) || k + 1 <= slen) &&
    423             cp->uc == sentinel)
    424           p->md4 = slen - k;
    425 
    426         /*
    427          * Increase the actual index.
    428          */
    429         k += (cp->uc >= 0x10000) ? 2 : 1;
    430     }
    431 }
    432 
    433 int
    434 utbm_exec(utbm_pattern_t pat, ucs2_t *text, unsigned long textlen,
    435           unsigned long *match_start, unsigned long *match_end)
    436 {
    437     unsigned long k;
    438     ucs2_t *start, *end;
    439 
    440     if (pat == 0 || pat->pat_used == 0 || text == 0 || textlen == 0 ||
    441         textlen < pat->patlen)
    442       return 0;
    443 
    444     start = text + pat->patlen;
    445     end = text + textlen;
    446 
    447     /*
    448      * Adjust the start point if it points to a low surrogate.
    449      */
    450     if (0xdc00 <= *start && *start <= 0xdfff &&
    451         0xd800 <= *(start - 1) && *(start - 1) <= 0xdbff)
    452       start--;
    453 
    454     while (start < end) {
    455         while ((k = _utbm_skip(pat, start, end))) {
    456             start += k;
    457             if (start < end && 0xdc00 <= *start && *start <= 0xdfff &&
    458                 0xd800 <= *(start - 1) && *(start - 1) <= 0xdbff)
    459               start--;
    460         }
    461 
    462         if (start < end &&
    463             _utbm_match(pat, text, start, end, match_start, match_end))
    464           return 1;
    465 
    466         start += pat->md4;
    467         if (start < end && 0xdc00 <= *start && *start <= 0xdfff &&
    468             0xd800 <= *(start - 1) && *(start - 1) <= 0xdbff)
    469           start--;
    470     }
    471     return 0;
    472 }
    473