1 1.1 christos /* Extended regular expression matching and search library. 2 1.1 christos Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc. 3 1.1 christos This file is part of the GNU C Library. 4 1.1 christos Contributed by Isamu Hasegawa <isamu (at) yamato.ibm.com>. 5 1.1 christos 6 1.1 christos This program is free software; you can redistribute it and/or modify 7 1.1 christos it under the terms of the GNU General Public License as published by 8 1.1 christos the Free Software Foundation; either version 2, or (at your option) 9 1.1 christos any later version. 10 1.1 christos 11 1.1 christos This program is distributed in the hope that it will be useful, 12 1.1 christos but WITHOUT ANY WARRANTY; without even the implied warranty of 13 1.1 christos MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 1.1 christos GNU General Public License for more details. 15 1.1 christos 16 1.1 christos You should have received a copy of the GNU General Public License along 17 1.1 christos with this program; if not, write to the Free Software Foundation, 18 1.1 christos Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ 19 1.2 christos #include <sys/cdefs.h> 20 1.2 christos __RCSID("$NetBSD: regex_internal.c,v 1.2 2016/05/17 14:00:09 christos Exp $"); 21 1.2 christos 22 1.1 christos 23 1.1 christos static void re_string_construct_common (const char *str, Idx len, 24 1.1 christos re_string_t *pstr, 25 1.1 christos REG_TRANSLATE_TYPE trans, bool icase, 26 1.1 christos const re_dfa_t *dfa) internal_function; 27 1.1 christos static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa, 28 1.1 christos const re_node_set *nodes, 29 1.1 christos re_hashval_t hash) internal_function; 30 1.1 christos static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa, 31 1.1 christos const re_node_set *nodes, 32 1.1 christos unsigned int context, 33 1.1 christos re_hashval_t hash) internal_function; 34 1.1 christos 35 1.1 christos /* Functions for string operation. */ 37 1.1 christos 38 1.1 christos /* This function allocate the buffers. It is necessary to call 39 1.1 christos re_string_reconstruct before using the object. */ 40 1.1 christos 41 1.1 christos static reg_errcode_t 42 1.1 christos internal_function 43 1.1 christos re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len, 44 1.1 christos REG_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa) 45 1.1 christos { 46 1.1 christos reg_errcode_t ret; 47 1.1 christos Idx init_buf_len; 48 1.1 christos 49 1.1 christos /* Ensure at least one character fits into the buffers. */ 50 1.1 christos if (init_len < dfa->mb_cur_max) 51 1.1 christos init_len = dfa->mb_cur_max; 52 1.1 christos init_buf_len = (len + 1 < init_len) ? len + 1: init_len; 53 1.1 christos re_string_construct_common (str, len, pstr, trans, icase, dfa); 54 1.1 christos 55 1.1 christos ret = re_string_realloc_buffers (pstr, init_buf_len); 56 1.1 christos if (BE (ret != REG_NOERROR, 0)) 57 1.1 christos return ret; 58 1.1 christos 59 1.1 christos pstr->word_char = dfa->word_char; 60 1.1 christos pstr->word_ops_used = dfa->word_ops_used; 61 1.1 christos pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str; 62 1.1 christos pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len; 63 1.1 christos pstr->valid_raw_len = pstr->valid_len; 64 1.1 christos return REG_NOERROR; 65 1.1 christos } 66 1.1 christos 67 1.1 christos /* This function allocate the buffers, and initialize them. */ 68 1.1 christos 69 1.1 christos static reg_errcode_t 70 1.1 christos internal_function 71 1.1 christos re_string_construct (re_string_t *pstr, const char *str, Idx len, 72 1.1 christos REG_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa) 73 1.1 christos { 74 1.1 christos reg_errcode_t ret; 75 1.1 christos memset (pstr, '\0', sizeof (re_string_t)); 76 1.1 christos re_string_construct_common (str, len, pstr, trans, icase, dfa); 77 1.1 christos 78 1.1 christos if (len > 0) 79 1.1 christos { 80 1.1 christos ret = re_string_realloc_buffers (pstr, len + 1); 81 1.1 christos if (BE (ret != REG_NOERROR, 0)) 82 1.1 christos return ret; 83 1.1 christos } 84 1.1 christos pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str; 85 1.1 christos 86 1.1 christos if (icase) 87 1.1 christos { 88 1.1 christos #ifdef RE_ENABLE_I18N 89 1.1 christos if (dfa->mb_cur_max > 1) 90 1.1 christos { 91 1.1 christos while (1) 92 1.1 christos { 93 1.1 christos ret = build_wcs_upper_buffer (pstr); 94 1.1 christos if (BE (ret != REG_NOERROR, 0)) 95 1.1 christos return ret; 96 1.1 christos if (pstr->valid_raw_len >= len) 97 1.1 christos break; 98 1.1 christos if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max) 99 1.1 christos break; 100 1.1 christos ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2); 101 1.1 christos if (BE (ret != REG_NOERROR, 0)) 102 1.1 christos return ret; 103 1.1 christos } 104 1.1 christos } 105 1.1 christos else 106 1.1 christos #endif /* RE_ENABLE_I18N */ 107 1.1 christos build_upper_buffer (pstr); 108 1.1 christos } 109 1.1 christos else 110 1.1 christos { 111 1.1 christos #ifdef RE_ENABLE_I18N 112 1.1 christos if (dfa->mb_cur_max > 1) 113 1.1 christos build_wcs_buffer (pstr); 114 1.1 christos else 115 1.1 christos #endif /* RE_ENABLE_I18N */ 116 1.1 christos { 117 1.1 christos if (trans != NULL) 118 1.1 christos re_string_translate_buffer (pstr); 119 1.1 christos else 120 1.1 christos { 121 1.1 christos pstr->valid_len = pstr->bufs_len; 122 1.1 christos pstr->valid_raw_len = pstr->bufs_len; 123 1.1 christos } 124 1.1 christos } 125 1.1 christos } 126 1.1 christos 127 1.1 christos return REG_NOERROR; 128 1.1 christos } 129 1.1 christos 130 1.1 christos /* Helper functions for re_string_allocate, and re_string_construct. */ 131 1.1 christos 132 1.1 christos static reg_errcode_t 133 1.1 christos internal_function 134 1.1 christos re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len) 135 1.1 christos { 136 1.1 christos #ifdef RE_ENABLE_I18N 137 1.1 christos if (pstr->mb_cur_max > 1) 138 1.1 christos { 139 1.1 christos wint_t *new_wcs = re_xrealloc (pstr->wcs, wint_t, new_buf_len); 140 1.1 christos if (BE (new_wcs == NULL, 0)) 141 1.1 christos return REG_ESPACE; 142 1.1 christos pstr->wcs = new_wcs; 143 1.1 christos if (pstr->offsets != NULL) 144 1.1 christos { 145 1.1 christos Idx *new_offsets = re_xrealloc (pstr->offsets, Idx, new_buf_len); 146 1.1 christos if (BE (new_offsets == NULL, 0)) 147 1.1 christos return REG_ESPACE; 148 1.1 christos pstr->offsets = new_offsets; 149 1.1 christos } 150 1.1 christos } 151 1.1 christos #endif /* RE_ENABLE_I18N */ 152 1.1 christos if (pstr->mbs_allocated) 153 1.1 christos { 154 1.1 christos unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char, 155 1.1 christos new_buf_len); 156 1.1 christos if (BE (new_mbs == NULL, 0)) 157 1.1 christos return REG_ESPACE; 158 1.1 christos pstr->mbs = new_mbs; 159 1.1 christos } 160 1.1 christos pstr->bufs_len = new_buf_len; 161 1.1 christos return REG_NOERROR; 162 1.1 christos } 163 1.1 christos 164 1.1 christos 165 1.1 christos static void 166 1.1 christos internal_function 167 1.1 christos re_string_construct_common (const char *str, Idx len, re_string_t *pstr, 168 1.1 christos REG_TRANSLATE_TYPE trans, bool icase, 169 1.1 christos const re_dfa_t *dfa) 170 1.1 christos { 171 1.1 christos pstr->raw_mbs = (const unsigned char *) str; 172 1.1 christos pstr->len = len; 173 1.1 christos pstr->raw_len = len; 174 1.1 christos pstr->trans = (unsigned REG_TRANSLATE_TYPE) trans; 175 1.1 christos pstr->icase = icase; 176 1.1 christos pstr->mbs_allocated = (trans != NULL || icase); 177 1.1 christos pstr->mb_cur_max = dfa->mb_cur_max; 178 1.1 christos pstr->is_utf8 = dfa->is_utf8; 179 1.1 christos pstr->map_notascii = dfa->map_notascii; 180 1.1 christos pstr->stop = pstr->len; 181 1.1 christos pstr->raw_stop = pstr->stop; 182 1.1 christos } 183 1.1 christos 184 1.1 christos #ifdef RE_ENABLE_I18N 185 1.1 christos 186 1.1 christos /* Build wide character buffer PSTR->WCS. 187 1.1 christos If the byte sequence of the string are: 188 1.1 christos <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3> 189 1.1 christos Then wide character buffer will be: 190 1.1 christos <wc1> , WEOF , <wc2> , WEOF , <wc3> 191 1.1 christos We use WEOF for padding, they indicate that the position isn't 192 1.1 christos a first byte of a multibyte character. 193 1.1 christos 194 1.1 christos Note that this function assumes PSTR->VALID_LEN elements are already 195 1.1 christos built and starts from PSTR->VALID_LEN. */ 196 1.1 christos 197 1.1 christos static void 198 1.1 christos internal_function 199 1.1 christos build_wcs_buffer (re_string_t *pstr) 200 1.1 christos { 201 1.1 christos #ifdef _LIBC 202 1.1 christos unsigned char buf[MB_LEN_MAX]; 203 1.1 christos assert (MB_LEN_MAX >= pstr->mb_cur_max); 204 1.1 christos #else 205 1.1 christos unsigned char buf[64]; 206 1.1 christos #endif 207 1.1 christos mbstate_t prev_st; 208 1.1 christos Idx byte_idx, end_idx, remain_len; 209 1.1 christos size_t mbclen; 210 1.1 christos 211 1.1 christos /* Build the buffers from pstr->valid_len to either pstr->len or 212 1.1 christos pstr->bufs_len. */ 213 1.1 christos end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; 214 1.1 christos for (byte_idx = pstr->valid_len; byte_idx < end_idx;) 215 1.1 christos { 216 1.1 christos wchar_t wc; 217 1.1 christos const char *p; 218 1.1 christos 219 1.1 christos remain_len = end_idx - byte_idx; 220 1.1 christos prev_st = pstr->cur_state; 221 1.1 christos /* Apply the translation if we need. */ 222 1.1 christos if (BE (pstr->trans != NULL, 0)) 223 1.1 christos { 224 1.1 christos int i, ch; 225 1.1 christos 226 1.1 christos for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i) 227 1.1 christos { 228 1.1 christos ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i]; 229 1.1 christos buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch]; 230 1.1 christos } 231 1.1 christos p = (const char *) buf; 232 1.1 christos } 233 1.1 christos else 234 1.1 christos p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx; 235 1.1 christos mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state); 236 1.1 christos if (BE (mbclen == (size_t) -2, 0)) 237 1.1 christos { 238 1.1 christos /* The buffer doesn't have enough space, finish to build. */ 239 1.1 christos pstr->cur_state = prev_st; 240 1.1 christos break; 241 1.1 christos } 242 1.1 christos else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0)) 243 1.1 christos { 244 1.1 christos /* We treat these cases as a singlebyte character. */ 245 1.1 christos mbclen = 1; 246 1.1 christos wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]; 247 1.1 christos if (BE (pstr->trans != NULL, 0)) 248 1.1 christos wc = pstr->trans[wc]; 249 1.1 christos pstr->cur_state = prev_st; 250 1.1 christos } 251 1.1 christos 252 1.1 christos /* Write wide character and padding. */ 253 1.1 christos pstr->wcs[byte_idx++] = wc; 254 1.1 christos /* Write paddings. */ 255 1.1 christos for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;) 256 1.1 christos pstr->wcs[byte_idx++] = WEOF; 257 1.1 christos } 258 1.1 christos pstr->valid_len = byte_idx; 259 1.1 christos pstr->valid_raw_len = byte_idx; 260 1.1 christos } 261 1.1 christos 262 1.1 christos /* Build wide character buffer PSTR->WCS like build_wcs_buffer, 263 1.1 christos but for REG_ICASE. */ 264 1.1 christos 265 1.1 christos static reg_errcode_t 266 1.1 christos internal_function 267 1.1 christos build_wcs_upper_buffer (re_string_t *pstr) 268 1.1 christos { 269 1.1 christos mbstate_t prev_st; 270 1.1 christos Idx src_idx, byte_idx, end_idx, remain_len; 271 1.1 christos size_t mbclen; 272 1.1 christos #ifdef _LIBC 273 1.1 christos char buf[MB_LEN_MAX]; 274 1.1 christos assert (MB_LEN_MAX >= pstr->mb_cur_max); 275 1.1 christos #else 276 1.1 christos char buf[64]; 277 1.1 christos #endif 278 1.1 christos 279 1.1 christos byte_idx = pstr->valid_len; 280 1.1 christos end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; 281 1.1 christos 282 1.1 christos /* The following optimization assumes that ASCII characters can be 283 1.1 christos mapped to wide characters with a simple cast. */ 284 1.1 christos if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed) 285 1.1 christos { 286 1.1 christos while (byte_idx < end_idx) 287 1.1 christos { 288 1.1 christos wchar_t wc; 289 1.1 christos 290 1.1 christos if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]) 291 1.1 christos && mbsinit (&pstr->cur_state)) 292 1.1 christos { 293 1.1 christos /* In case of a singlebyte character. */ 294 1.1 christos pstr->mbs[byte_idx] 295 1.1 christos = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]); 296 1.1 christos /* The next step uses the assumption that wchar_t is encoded 297 1.1 christos ASCII-safe: all ASCII values can be converted like this. */ 298 1.1 christos pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx]; 299 1.1 christos ++byte_idx; 300 1.1 christos continue; 301 1.1 christos } 302 1.1 christos 303 1.1 christos remain_len = end_idx - byte_idx; 304 1.1 christos prev_st = pstr->cur_state; 305 1.1 christos mbclen = mbrtowc (&wc, 306 1.1 christos ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx 307 1.1 christos + byte_idx), remain_len, &pstr->cur_state); 308 1.1 christos if (BE ((size_t) (mbclen + 2) > 2, 1)) 309 1.1 christos { 310 1.1 christos wchar_t wcu = wc; 311 1.1 christos if (iswlower (wc)) 312 1.1 christos { 313 1.1 christos size_t mbcdlen; 314 1.1 christos 315 1.1 christos wcu = towupper (wc); 316 1.1 christos mbcdlen = wcrtomb (buf, wcu, &prev_st); 317 1.1 christos if (BE (mbclen == mbcdlen, 1)) 318 1.1 christos memcpy (pstr->mbs + byte_idx, buf, mbclen); 319 1.1 christos else 320 1.1 christos { 321 1.1 christos src_idx = byte_idx; 322 1.1 christos goto offsets_needed; 323 1.1 christos } 324 1.1 christos } 325 1.1 christos else 326 1.1 christos memcpy (pstr->mbs + byte_idx, 327 1.1 christos pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen); 328 1.1 christos pstr->wcs[byte_idx++] = wcu; 329 1.1 christos /* Write paddings. */ 330 1.1 christos for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;) 331 1.1 christos pstr->wcs[byte_idx++] = WEOF; 332 1.1 christos } 333 1.1 christos else if (mbclen == (size_t) -1 || mbclen == 0) 334 1.1 christos { 335 1.1 christos /* It is an invalid character or '\0'. Just use the byte. */ 336 1.1 christos int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]; 337 1.1 christos pstr->mbs[byte_idx] = ch; 338 1.1 christos /* And also cast it to wide char. */ 339 1.1 christos pstr->wcs[byte_idx++] = (wchar_t) ch; 340 1.1 christos if (BE (mbclen == (size_t) -1, 0)) 341 1.1 christos pstr->cur_state = prev_st; 342 1.1 christos } 343 1.1 christos else 344 1.1 christos { 345 1.1 christos /* The buffer doesn't have enough space, finish to build. */ 346 1.1 christos pstr->cur_state = prev_st; 347 1.1 christos break; 348 1.1 christos } 349 1.1 christos } 350 1.1 christos pstr->valid_len = byte_idx; 351 1.1 christos pstr->valid_raw_len = byte_idx; 352 1.1 christos return REG_NOERROR; 353 1.1 christos } 354 1.1 christos else 355 1.1 christos for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;) 356 1.1 christos { 357 1.1 christos wchar_t wc; 358 1.1 christos const char *p; 359 1.1 christos offsets_needed: 360 1.1 christos remain_len = end_idx - byte_idx; 361 1.1 christos prev_st = pstr->cur_state; 362 1.1 christos if (BE (pstr->trans != NULL, 0)) 363 1.1 christos { 364 1.1 christos int i, ch; 365 1.1 christos 366 1.1 christos for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i) 367 1.1 christos { 368 1.1 christos ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i]; 369 1.1 christos buf[i] = pstr->trans[ch]; 370 1.1 christos } 371 1.1 christos p = (const char *) buf; 372 1.1 christos } 373 1.1 christos else 374 1.1 christos p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx; 375 1.1 christos mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state); 376 1.1 christos if (BE ((size_t) (mbclen + 2) > 2, 1)) 377 1.1 christos { 378 1.1 christos wchar_t wcu = wc; 379 1.1 christos if (iswlower (wc)) 380 1.1 christos { 381 1.1 christos size_t mbcdlen; 382 1.1 christos 383 1.1 christos wcu = towupper (wc); 384 1.1 christos mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st); 385 1.1 christos if (BE (mbclen == mbcdlen, 1)) 386 1.1 christos memcpy (pstr->mbs + byte_idx, buf, mbclen); 387 1.1 christos else if (mbcdlen != (size_t) -1) 388 1.1 christos { 389 1.1 christos size_t i; 390 1.1 christos 391 1.1 christos if (byte_idx + mbcdlen > pstr->bufs_len) 392 1.1 christos { 393 1.1 christos pstr->cur_state = prev_st; 394 1.1 christos break; 395 1.1 christos } 396 1.1 christos 397 1.1 christos if (pstr->offsets == NULL) 398 1.1 christos { 399 1.1 christos pstr->offsets = re_xmalloc (Idx, pstr->bufs_len); 400 1.1 christos 401 1.1 christos if (pstr->offsets == NULL) 402 1.1 christos return REG_ESPACE; 403 1.1 christos } 404 1.1 christos if (!pstr->offsets_needed) 405 1.1 christos { 406 1.1 christos for (i = 0; i < (size_t) byte_idx; ++i) 407 1.1 christos pstr->offsets[i] = i; 408 1.1 christos pstr->offsets_needed = 1; 409 1.1 christos } 410 1.1 christos 411 1.1 christos memcpy (pstr->mbs + byte_idx, buf, mbcdlen); 412 1.1 christos pstr->wcs[byte_idx] = wcu; 413 1.1 christos pstr->offsets[byte_idx] = src_idx; 414 1.1 christos for (i = 1; i < mbcdlen; ++i) 415 1.1 christos { 416 1.1 christos pstr->offsets[byte_idx + i] 417 1.1 christos = src_idx + (i < mbclen ? i : mbclen - 1); 418 1.1 christos pstr->wcs[byte_idx + i] = WEOF; 419 1.1 christos } 420 1.1 christos pstr->len += mbcdlen - mbclen; 421 1.1 christos if (pstr->raw_stop > src_idx) 422 1.1 christos pstr->stop += mbcdlen - mbclen; 423 1.1 christos end_idx = (pstr->bufs_len > pstr->len) 424 1.1 christos ? pstr->len : pstr->bufs_len; 425 1.1 christos byte_idx += mbcdlen; 426 1.1 christos src_idx += mbclen; 427 1.1 christos continue; 428 1.1 christos } 429 1.1 christos else 430 1.1 christos memcpy (pstr->mbs + byte_idx, p, mbclen); 431 1.1 christos } 432 1.1 christos else 433 1.1 christos memcpy (pstr->mbs + byte_idx, p, mbclen); 434 1.1 christos 435 1.1 christos if (BE (pstr->offsets_needed != 0, 0)) 436 1.1 christos { 437 1.1 christos size_t i; 438 1.1 christos for (i = 0; i < mbclen; ++i) 439 1.1 christos pstr->offsets[byte_idx + i] = src_idx + i; 440 1.1 christos } 441 1.1 christos src_idx += mbclen; 442 1.1 christos 443 1.1 christos pstr->wcs[byte_idx++] = wcu; 444 1.1 christos /* Write paddings. */ 445 1.1 christos for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;) 446 1.1 christos pstr->wcs[byte_idx++] = WEOF; 447 1.1 christos } 448 1.1 christos else if (mbclen == (size_t) -1 || mbclen == 0) 449 1.1 christos { 450 1.1 christos /* It is an invalid character or '\0'. Just use the byte. */ 451 1.1 christos int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx]; 452 1.1 christos 453 1.1 christos if (BE (pstr->trans != NULL, 0)) 454 1.1 christos ch = pstr->trans [ch]; 455 1.1 christos pstr->mbs[byte_idx] = ch; 456 1.1 christos 457 1.1 christos if (BE (pstr->offsets_needed != 0, 0)) 458 1.1 christos pstr->offsets[byte_idx] = src_idx; 459 1.1 christos ++src_idx; 460 1.1 christos 461 1.1 christos /* And also cast it to wide char. */ 462 1.1 christos pstr->wcs[byte_idx++] = (wchar_t) ch; 463 1.1 christos if (BE (mbclen == (size_t) -1, 0)) 464 1.1 christos pstr->cur_state = prev_st; 465 1.1 christos } 466 1.1 christos else 467 1.1 christos { 468 1.1 christos /* The buffer doesn't have enough space, finish to build. */ 469 1.1 christos pstr->cur_state = prev_st; 470 1.1 christos break; 471 1.1 christos } 472 1.1 christos } 473 1.1 christos pstr->valid_len = byte_idx; 474 1.1 christos pstr->valid_raw_len = src_idx; 475 1.1 christos return REG_NOERROR; 476 1.1 christos } 477 1.1 christos 478 1.1 christos /* Skip characters until the index becomes greater than NEW_RAW_IDX. 479 1.1 christos Return the index. */ 480 1.1 christos 481 1.1 christos static Idx 482 1.1 christos internal_function 483 1.1 christos re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc) 484 1.1 christos { 485 1.1 christos mbstate_t prev_st; 486 1.1 christos Idx rawbuf_idx; 487 1.1 christos size_t mbclen; 488 1.1 christos wchar_t wc = 0; 489 1.1 christos 490 1.1 christos /* Skip the characters which are not necessary to check. */ 491 1.1 christos for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len; 492 1.1 christos rawbuf_idx < new_raw_idx;) 493 1.1 christos { 494 1.1 christos Idx remain_len; 495 1.1 christos remain_len = pstr->len - rawbuf_idx; 496 1.1 christos prev_st = pstr->cur_state; 497 1.1 christos mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx, 498 1.1 christos remain_len, &pstr->cur_state); 499 1.1 christos if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0)) 500 1.1 christos { 501 1.1 christos /* We treat these cases as a singlebyte character. */ 502 1.1 christos mbclen = 1; 503 1.1 christos pstr->cur_state = prev_st; 504 1.1 christos } 505 1.1 christos /* Then proceed the next character. */ 506 1.1 christos rawbuf_idx += mbclen; 507 1.1 christos } 508 1.1 christos *last_wc = (wint_t) wc; 509 1.1 christos return rawbuf_idx; 510 1.1 christos } 511 1.1 christos #endif /* RE_ENABLE_I18N */ 512 1.1 christos 513 1.1 christos /* Build the buffer PSTR->MBS, and apply the translation if we need. 514 1.1 christos This function is used in case of REG_ICASE. */ 515 1.1 christos 516 1.1 christos static void 517 1.1 christos internal_function 518 1.1 christos build_upper_buffer (re_string_t *pstr) 519 1.1 christos { 520 1.1 christos Idx char_idx, end_idx; 521 1.1 christos end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; 522 1.1 christos 523 1.1 christos for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx) 524 1.1 christos { 525 1.1 christos int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx]; 526 1.1 christos if (BE (pstr->trans != NULL, 0)) 527 1.1 christos ch = pstr->trans[ch]; 528 1.1 christos if (islower (ch)) 529 1.1 christos pstr->mbs[char_idx] = toupper (ch); 530 1.1 christos else 531 1.1 christos pstr->mbs[char_idx] = ch; 532 1.1 christos } 533 1.1 christos pstr->valid_len = char_idx; 534 1.1 christos pstr->valid_raw_len = char_idx; 535 1.1 christos } 536 1.1 christos 537 1.1 christos /* Apply TRANS to the buffer in PSTR. */ 538 1.1 christos 539 1.1 christos static void 540 1.1 christos internal_function 541 1.1 christos re_string_translate_buffer (re_string_t *pstr) 542 1.1 christos { 543 1.1 christos Idx buf_idx, end_idx; 544 1.1 christos end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; 545 1.1 christos 546 1.1 christos for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx) 547 1.1 christos { 548 1.1 christos int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx]; 549 1.1 christos pstr->mbs[buf_idx] = pstr->trans[ch]; 550 1.1 christos } 551 1.1 christos 552 1.1 christos pstr->valid_len = buf_idx; 553 1.1 christos pstr->valid_raw_len = buf_idx; 554 1.1 christos } 555 1.1 christos 556 1.1 christos /* This function re-construct the buffers. 557 1.1 christos Concretely, convert to wide character in case of pstr->mb_cur_max > 1, 558 1.1 christos convert to upper case in case of REG_ICASE, apply translation. */ 559 1.1 christos 560 1.1 christos static reg_errcode_t 561 1.1 christos internal_function 562 1.1 christos re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags) 563 1.1 christos { 564 1.1 christos Idx offset; 565 1.1 christos 566 1.1 christos if (BE (pstr->raw_mbs_idx <= idx, 0)) 567 1.1 christos offset = idx - pstr->raw_mbs_idx; 568 1.1 christos else 569 1.1 christos { 570 1.1 christos /* Reset buffer. */ 571 1.1 christos #ifdef RE_ENABLE_I18N 572 1.1 christos if (pstr->mb_cur_max > 1) 573 1.1 christos memset (&pstr->cur_state, '\0', sizeof (mbstate_t)); 574 1.1 christos #endif /* RE_ENABLE_I18N */ 575 1.1 christos pstr->len = pstr->raw_len; 576 1.1 christos pstr->stop = pstr->raw_stop; 577 1.1 christos pstr->valid_len = 0; 578 1.1 christos pstr->raw_mbs_idx = 0; 579 1.1 christos pstr->valid_raw_len = 0; 580 1.1 christos pstr->offsets_needed = 0; 581 1.1 christos pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF 582 1.1 christos : CONTEXT_NEWLINE | CONTEXT_BEGBUF); 583 1.1 christos if (!pstr->mbs_allocated) 584 1.1 christos pstr->mbs = (unsigned char *) pstr->raw_mbs; 585 1.1 christos offset = idx; 586 1.1 christos } 587 1.1 christos 588 1.1 christos if (BE (offset != 0, 1)) 589 1.1 christos { 590 1.1 christos /* Are the characters which are already checked remain? */ 591 1.1 christos if (BE (offset < pstr->valid_raw_len, 1) 592 1.1 christos #ifdef RE_ENABLE_I18N 593 1.1 christos /* Handling this would enlarge the code too much. 594 1.1 christos Accept a slowdown in that case. */ 595 1.1 christos && pstr->offsets_needed == 0 596 1.1 christos #endif 597 1.1 christos ) 598 1.1 christos { 599 1.1 christos /* Yes, move them to the front of the buffer. */ 600 1.1 christos pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags); 601 1.1 christos #ifdef RE_ENABLE_I18N 602 1.1 christos if (pstr->mb_cur_max > 1) 603 1.1 christos memmove (pstr->wcs, pstr->wcs + offset, 604 1.1 christos (pstr->valid_len - offset) * sizeof (wint_t)); 605 1.1 christos #endif /* RE_ENABLE_I18N */ 606 1.1 christos if (BE (pstr->mbs_allocated, 0)) 607 1.1 christos memmove (pstr->mbs, pstr->mbs + offset, 608 1.1 christos pstr->valid_len - offset); 609 1.1 christos pstr->valid_len -= offset; 610 1.1 christos pstr->valid_raw_len -= offset; 611 1.1 christos #if DEBUG 612 1.1 christos assert (pstr->valid_len > 0); 613 1.1 christos #endif 614 1.1 christos } 615 1.1 christos else 616 1.1 christos { 617 1.1 christos /* No, skip all characters until IDX. */ 618 1.1 christos #ifdef RE_ENABLE_I18N 619 1.1 christos if (BE (pstr->offsets_needed, 0)) 620 1.1 christos { 621 1.1 christos pstr->len = pstr->raw_len - idx + offset; 622 1.1 christos pstr->stop = pstr->raw_stop - idx + offset; 623 1.1 christos pstr->offsets_needed = 0; 624 1.1 christos } 625 1.1 christos #endif 626 1.1 christos pstr->valid_len = 0; 627 1.1 christos pstr->valid_raw_len = 0; 628 1.1 christos #ifdef RE_ENABLE_I18N 629 1.1 christos if (pstr->mb_cur_max > 1) 630 1.1 christos { 631 1.1 christos Idx wcs_idx; 632 1.1 christos wint_t wc = WEOF; 633 1.1 christos 634 1.1 christos if (pstr->is_utf8) 635 1.1 christos { 636 1.1 christos const unsigned char *raw, *p, *q, *end; 637 1.1 christos 638 1.1 christos /* Special case UTF-8. Multi-byte chars start with any 639 1.1 christos byte other than 0x80 - 0xbf. */ 640 1.1 christos raw = pstr->raw_mbs + pstr->raw_mbs_idx; 641 1.1 christos end = raw + (offset - pstr->mb_cur_max); 642 1.1 christos for (p = raw + offset - 1; p >= end; --p) 643 1.1 christos if ((*p & 0xc0) != 0x80) 644 1.1 christos { 645 1.1 christos mbstate_t cur_state; 646 1.1 christos wchar_t wc2; 647 1.1 christos Idx mlen = raw + pstr->len - p; 648 1.1 christos unsigned char buf[6]; 649 1.1 christos size_t mbclen; 650 1.1 christos 651 1.1 christos q = p; 652 1.1 christos if (BE (pstr->trans != NULL, 0)) 653 1.1 christos { 654 1.1 christos int i = mlen < 6 ? mlen : 6; 655 1.1 christos while (--i >= 0) 656 1.1 christos buf[i] = pstr->trans[p[i]]; 657 1.1 christos q = buf; 658 1.1 christos } 659 1.1 christos /* XXX Don't use mbrtowc, we know which conversion 660 1.1 christos to use (UTF-8 -> UCS4). */ 661 1.1 christos memset (&cur_state, 0, sizeof (cur_state)); 662 1.1 christos mbclen = mbrtowc (&wc2, (const char *) p, mlen, 663 1.1 christos &cur_state); 664 1.1 christos if (raw + offset - p <= mbclen && mbclen < (size_t) -2) 665 1.1 christos { 666 1.1 christos memset (&pstr->cur_state, '\0', 667 1.1 christos sizeof (mbstate_t)); 668 1.1 christos pstr->valid_len = mbclen - (raw + offset - p); 669 1.1 christos wc = wc2; 670 1.1 christos } 671 1.1 christos break; 672 1.1 christos } 673 1.1 christos } 674 1.1 christos 675 1.1 christos if (wc == WEOF) 676 1.1 christos pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx; 677 1.1 christos if (BE (pstr->valid_len, 0)) 678 1.1 christos { 679 1.1 christos for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx) 680 1.1 christos pstr->wcs[wcs_idx] = WEOF; 681 1.1 christos if (pstr->mbs_allocated) 682 1.1 christos memset (pstr->mbs, -1, pstr->valid_len); 683 1.1 christos } 684 1.1 christos pstr->valid_raw_len = pstr->valid_len; 685 1.1 christos pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0) 686 1.1 christos && IS_WIDE_WORD_CHAR (wc)) 687 1.1 christos ? CONTEXT_WORD 688 1.1 christos : ((IS_WIDE_NEWLINE (wc) 689 1.1 christos && pstr->newline_anchor) 690 1.1 christos ? CONTEXT_NEWLINE : 0)); 691 1.1 christos } 692 1.1 christos else 693 1.1 christos #endif /* RE_ENABLE_I18N */ 694 1.1 christos { 695 1.1 christos int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1]; 696 1.1 christos if (pstr->trans) 697 1.1 christos c = pstr->trans[c]; 698 1.1 christos pstr->tip_context = (bitset_contain (pstr->word_char, c) 699 1.1 christos ? CONTEXT_WORD 700 1.1 christos : ((IS_NEWLINE (c) && pstr->newline_anchor) 701 1.1 christos ? CONTEXT_NEWLINE : 0)); 702 1.1 christos } 703 1.1 christos } 704 1.1 christos if (!BE (pstr->mbs_allocated, 0)) 705 1.1 christos pstr->mbs += offset; 706 1.1 christos } 707 1.1 christos pstr->raw_mbs_idx = idx; 708 1.1 christos pstr->len -= offset; 709 1.1 christos pstr->stop -= offset; 710 1.1 christos 711 1.1 christos /* Then build the buffers. */ 712 1.1 christos #ifdef RE_ENABLE_I18N 713 1.1 christos if (pstr->mb_cur_max > 1) 714 1.1 christos { 715 1.1 christos if (pstr->icase) 716 1.1 christos { 717 1.1 christos reg_errcode_t ret = build_wcs_upper_buffer (pstr); 718 1.1 christos if (BE (ret != REG_NOERROR, 0)) 719 1.1 christos return ret; 720 1.1 christos } 721 1.1 christos else 722 1.1 christos build_wcs_buffer (pstr); 723 1.1 christos } 724 1.1 christos else 725 1.1 christos #endif /* RE_ENABLE_I18N */ 726 1.1 christos if (BE (pstr->mbs_allocated, 0)) 727 1.1 christos { 728 1.1 christos if (pstr->icase) 729 1.1 christos build_upper_buffer (pstr); 730 1.1 christos else if (pstr->trans != NULL) 731 1.1 christos re_string_translate_buffer (pstr); 732 1.1 christos } 733 1.1 christos else 734 1.1 christos pstr->valid_len = pstr->len; 735 1.1 christos 736 1.1 christos pstr->cur_idx = 0; 737 1.1 christos return REG_NOERROR; 738 1.1 christos } 739 1.1 christos 740 1.1 christos static unsigned char 741 1.1 christos internal_function __attribute ((pure)) 742 1.1 christos re_string_peek_byte_case (const re_string_t *pstr, Idx idx) 743 1.1 christos { 744 1.1 christos int ch; 745 1.1 christos Idx off; 746 1.1 christos 747 1.1 christos /* Handle the common (easiest) cases first. */ 748 1.1 christos if (BE (!pstr->mbs_allocated, 1)) 749 1.1 christos return re_string_peek_byte (pstr, idx); 750 1.1 christos 751 1.1 christos #ifdef RE_ENABLE_I18N 752 1.1 christos if (pstr->mb_cur_max > 1 753 1.1 christos && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx)) 754 1.1 christos return re_string_peek_byte (pstr, idx); 755 1.1 christos #endif 756 1.1 christos 757 1.1 christos off = pstr->cur_idx + idx; 758 1.1 christos #ifdef RE_ENABLE_I18N 759 1.1 christos if (pstr->offsets_needed) 760 1.1 christos off = pstr->offsets[off]; 761 1.1 christos #endif 762 1.1 christos 763 1.1 christos ch = pstr->raw_mbs[pstr->raw_mbs_idx + off]; 764 1.1 christos 765 1.1 christos #ifdef RE_ENABLE_I18N 766 1.1 christos /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I 767 1.1 christos this function returns CAPITAL LETTER I instead of first byte of 768 1.1 christos DOTLESS SMALL LETTER I. The latter would confuse the parser, 769 1.1 christos since peek_byte_case doesn't advance cur_idx in any way. */ 770 1.1 christos if (pstr->offsets_needed && !isascii (ch)) 771 1.1 christos return re_string_peek_byte (pstr, idx); 772 1.1 christos #endif 773 1.1 christos 774 1.1 christos return ch; 775 1.1 christos } 776 1.1 christos 777 1.1 christos static unsigned char 778 1.1 christos internal_function __attribute ((pure)) 779 1.1 christos re_string_fetch_byte_case (re_string_t *pstr) 780 1.1 christos { 781 1.1 christos if (BE (!pstr->mbs_allocated, 1)) 782 1.1 christos return re_string_fetch_byte (pstr); 783 1.1 christos 784 1.1 christos #ifdef RE_ENABLE_I18N 785 1.1 christos if (pstr->offsets_needed) 786 1.1 christos { 787 1.1 christos Idx off; 788 1.1 christos int ch; 789 1.1 christos 790 1.1 christos /* For tr_TR.UTF-8 [[:islower:]] there is 791 1.1 christos [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip 792 1.1 christos in that case the whole multi-byte character and return 793 1.1 christos the original letter. On the other side, with 794 1.1 christos [[: DOTLESS SMALL LETTER I return [[:I, as doing 795 1.1 christos anything else would complicate things too much. */ 796 1.1 christos 797 1.1 christos if (!re_string_first_byte (pstr, pstr->cur_idx)) 798 1.1 christos return re_string_fetch_byte (pstr); 799 1.1 christos 800 1.1 christos off = pstr->offsets[pstr->cur_idx]; 801 1.1 christos ch = pstr->raw_mbs[pstr->raw_mbs_idx + off]; 802 1.1 christos 803 1.1 christos if (! isascii (ch)) 804 1.1 christos return re_string_fetch_byte (pstr); 805 1.1 christos 806 1.1 christos re_string_skip_bytes (pstr, 807 1.1 christos re_string_char_size_at (pstr, pstr->cur_idx)); 808 1.1 christos return ch; 809 1.1 christos } 810 1.1 christos #endif 811 1.1 christos 812 1.1 christos return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++]; 813 1.1 christos } 814 1.1 christos 815 1.1 christos static void 816 1.1 christos internal_function 817 1.1 christos re_string_destruct (re_string_t *pstr) 818 1.1 christos { 819 1.1 christos #ifdef RE_ENABLE_I18N 820 1.1 christos re_free (pstr->wcs); 821 1.1 christos re_free (pstr->offsets); 822 1.1 christos #endif /* RE_ENABLE_I18N */ 823 1.1 christos if (pstr->mbs_allocated) 824 1.1 christos re_free (pstr->mbs); 825 1.1 christos } 826 1.1 christos 827 1.1 christos /* Return the context at IDX in INPUT. */ 828 1.1 christos 829 1.1 christos static unsigned int 830 1.1 christos internal_function 831 1.1 christos re_string_context_at (const re_string_t *input, Idx idx, int eflags) 832 1.1 christos { 833 1.1 christos int c; 834 1.1 christos if (BE (! REG_VALID_INDEX (idx), 0)) 835 1.1 christos /* In this case, we use the value stored in input->tip_context, 836 1.1 christos since we can't know the character in input->mbs[-1] here. */ 837 1.1 christos return input->tip_context; 838 1.1 christos if (BE (idx == input->len, 0)) 839 1.1 christos return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF 840 1.1 christos : CONTEXT_NEWLINE | CONTEXT_ENDBUF); 841 1.1 christos #ifdef RE_ENABLE_I18N 842 1.1 christos if (input->mb_cur_max > 1) 843 1.1 christos { 844 1.1 christos wint_t wc; 845 1.1 christos Idx wc_idx = idx; 846 1.1 christos while(input->wcs[wc_idx] == WEOF) 847 1.1 christos { 848 1.1 christos #ifdef DEBUG 849 1.1 christos /* It must not happen. */ 850 1.1 christos assert (REG_VALID_INDEX (wc_idx)); 851 1.1 christos #endif 852 1.1 christos --wc_idx; 853 1.1 christos if (! REG_VALID_INDEX (wc_idx)) 854 1.1 christos return input->tip_context; 855 1.1 christos } 856 1.1 christos wc = input->wcs[wc_idx]; 857 1.1 christos if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc)) 858 1.1 christos return CONTEXT_WORD; 859 1.1 christos return (IS_WIDE_NEWLINE (wc) && input->newline_anchor 860 1.1 christos ? CONTEXT_NEWLINE : 0); 861 1.1 christos } 862 1.1 christos else 863 1.1 christos #endif 864 1.1 christos { 865 1.1 christos c = re_string_byte_at (input, idx); 866 1.1 christos if (bitset_contain (input->word_char, c)) 867 1.1 christos return CONTEXT_WORD; 868 1.1 christos return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0; 869 1.1 christos } 870 1.1 christos } 871 1.1 christos 872 1.1 christos /* Functions for set operation. */ 874 1.1 christos 875 1.1 christos static reg_errcode_t 876 1.1 christos internal_function 877 1.1 christos re_node_set_alloc (re_node_set *set, Idx size) 878 1.1 christos { 879 1.1 christos set->alloc = size; 880 1.1 christos set->nelem = 0; 881 1.1 christos set->elems = re_xmalloc (Idx, size); 882 1.1 christos if (BE (set->elems == NULL, 0)) 883 1.1 christos return REG_ESPACE; 884 1.1 christos return REG_NOERROR; 885 1.1 christos } 886 1.1 christos 887 1.1 christos static reg_errcode_t 888 1.1 christos internal_function 889 1.1 christos re_node_set_init_1 (re_node_set *set, Idx elem) 890 1.1 christos { 891 1.1 christos set->alloc = 1; 892 1.1 christos set->nelem = 1; 893 1.1 christos set->elems = re_malloc (Idx, 1); 894 1.1 christos if (BE (set->elems == NULL, 0)) 895 1.1 christos { 896 1.1 christos set->alloc = set->nelem = 0; 897 1.1 christos return REG_ESPACE; 898 1.1 christos } 899 1.1 christos set->elems[0] = elem; 900 1.1 christos return REG_NOERROR; 901 1.1 christos } 902 1.1 christos 903 1.1 christos static reg_errcode_t 904 1.1 christos internal_function 905 1.1 christos re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2) 906 1.1 christos { 907 1.1 christos set->alloc = 2; 908 1.1 christos set->elems = re_malloc (Idx, 2); 909 1.1 christos if (BE (set->elems == NULL, 0)) 910 1.1 christos return REG_ESPACE; 911 1.1 christos if (elem1 == elem2) 912 1.1 christos { 913 1.1 christos set->nelem = 1; 914 1.1 christos set->elems[0] = elem1; 915 1.1 christos } 916 1.1 christos else 917 1.1 christos { 918 1.1 christos set->nelem = 2; 919 1.1 christos if (elem1 < elem2) 920 1.1 christos { 921 1.1 christos set->elems[0] = elem1; 922 1.1 christos set->elems[1] = elem2; 923 1.1 christos } 924 1.1 christos else 925 1.1 christos { 926 1.1 christos set->elems[0] = elem2; 927 1.1 christos set->elems[1] = elem1; 928 1.1 christos } 929 1.1 christos } 930 1.1 christos return REG_NOERROR; 931 1.1 christos } 932 1.1 christos 933 1.1 christos static reg_errcode_t 934 1.1 christos internal_function 935 1.1 christos re_node_set_init_copy (re_node_set *dest, const re_node_set *src) 936 1.1 christos { 937 1.1 christos dest->nelem = src->nelem; 938 1.1 christos if (src->nelem > 0) 939 1.1 christos { 940 1.1 christos dest->alloc = dest->nelem; 941 1.1 christos dest->elems = re_malloc (Idx, dest->alloc); 942 1.1 christos if (BE (dest->elems == NULL, 0)) 943 1.1 christos { 944 1.1 christos dest->alloc = dest->nelem = 0; 945 1.1 christos return REG_ESPACE; 946 1.1 christos } 947 1.1 christos memcpy (dest->elems, src->elems, src->nelem * sizeof dest->elems[0]); 948 1.1 christos } 949 1.1 christos else 950 1.1 christos re_node_set_init_empty (dest); 951 1.1 christos return REG_NOERROR; 952 1.1 christos } 953 1.1 christos 954 1.1 christos /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to 955 1.1 christos DEST. Return value indicate the error code or REG_NOERROR if succeeded. 956 1.1 christos Note: We assume dest->elems is NULL, when dest->alloc is 0. */ 957 1.1 christos 958 1.1 christos static reg_errcode_t 959 1.1 christos internal_function 960 1.1 christos re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1, 961 1.1 christos const re_node_set *src2) 962 1.1 christos { 963 1.1 christos Idx i1, i2, is, id, delta, sbase; 964 1.1 christos if (src1->nelem == 0 || src2->nelem == 0) 965 1.1 christos return REG_NOERROR; 966 1.1 christos 967 1.1 christos /* We need dest->nelem + 2 * elems_in_intersection; this is a 968 1.1 christos conservative estimate. */ 969 1.1 christos if (src1->nelem + src2->nelem + dest->nelem > dest->alloc) 970 1.1 christos { 971 1.1 christos Idx new_alloc = src1->nelem + src2->nelem + dest->alloc; 972 1.1 christos Idx *new_elems; 973 1.1 christos if (sizeof (Idx) < 3 974 1.1 christos && (new_alloc < dest->alloc 975 1.1 christos || ((Idx) (src1->nelem + src2->nelem) < src1->nelem))) 976 1.1 christos return REG_ESPACE; 977 1.1 christos new_elems = re_xrealloc (dest->elems, Idx, new_alloc); 978 1.1 christos if (BE (new_elems == NULL, 0)) 979 1.1 christos return REG_ESPACE; 980 1.1 christos dest->elems = new_elems; 981 1.1 christos dest->alloc = new_alloc; 982 1.1 christos } 983 1.1 christos 984 1.1 christos /* Find the items in the intersection of SRC1 and SRC2, and copy 985 1.1 christos into the top of DEST those that are not already in DEST itself. */ 986 1.1 christos sbase = dest->nelem + src1->nelem + src2->nelem; 987 1.1 christos i1 = src1->nelem - 1; 988 1.1 christos i2 = src2->nelem - 1; 989 1.1 christos id = dest->nelem - 1; 990 1.1 christos for (;;) 991 1.1 christos { 992 1.1 christos if (src1->elems[i1] == src2->elems[i2]) 993 1.1 christos { 994 1.1 christos /* Try to find the item in DEST. Maybe we could binary search? */ 995 1.1 christos while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1]) 996 1.1 christos --id; 997 1.1 christos 998 1.1 christos if (! REG_VALID_INDEX (id) || dest->elems[id] != src1->elems[i1]) 999 1.1 christos dest->elems[--sbase] = src1->elems[i1]; 1000 1.1 christos 1001 1.1 christos if (! REG_VALID_INDEX (--i1) || ! REG_VALID_INDEX (--i2)) 1002 1.1 christos break; 1003 1.1 christos } 1004 1.1 christos 1005 1.1 christos /* Lower the highest of the two items. */ 1006 1.1 christos else if (src1->elems[i1] < src2->elems[i2]) 1007 1.1 christos { 1008 1.1 christos if (! REG_VALID_INDEX (--i2)) 1009 1.1 christos break; 1010 1.1 christos } 1011 1.1 christos else 1012 1.1 christos { 1013 1.1 christos if (! REG_VALID_INDEX (--i1)) 1014 1.1 christos break; 1015 1.1 christos } 1016 1.1 christos } 1017 1.1 christos 1018 1.1 christos id = dest->nelem - 1; 1019 1.1 christos is = dest->nelem + src1->nelem + src2->nelem - 1; 1020 1.1 christos delta = is - sbase + 1; 1021 1.1 christos 1022 1.1 christos /* Now copy. When DELTA becomes zero, the remaining 1023 1.1 christos DEST elements are already in place; this is more or 1024 1.1 christos less the same loop that is in re_node_set_merge. */ 1025 1.1 christos dest->nelem += delta; 1026 1.1 christos if (delta > 0 && REG_VALID_INDEX (id)) 1027 1.1 christos for (;;) 1028 1.1 christos { 1029 1.1 christos if (dest->elems[is] > dest->elems[id]) 1030 1.1 christos { 1031 1.1 christos /* Copy from the top. */ 1032 1.1 christos dest->elems[id + delta--] = dest->elems[is--]; 1033 1.1 christos if (delta == 0) 1034 1.1 christos break; 1035 1.1 christos } 1036 1.1 christos else 1037 1.1 christos { 1038 1.1 christos /* Slide from the bottom. */ 1039 1.1 christos dest->elems[id + delta] = dest->elems[id]; 1040 1.1 christos if (! REG_VALID_INDEX (--id)) 1041 1.1 christos break; 1042 1.1 christos } 1043 1.1 christos } 1044 1.1 christos 1045 1.1 christos /* Copy remaining SRC elements. */ 1046 1.1 christos memcpy (dest->elems, dest->elems + sbase, delta * sizeof dest->elems[0]); 1047 1.1 christos 1048 1.1 christos return REG_NOERROR; 1049 1.1 christos } 1050 1.1 christos 1051 1.1 christos /* Calculate the union set of the sets SRC1 and SRC2. And store it to 1052 1.1 christos DEST. Return value indicate the error code or REG_NOERROR if succeeded. */ 1053 1.1 christos 1054 1.1 christos static reg_errcode_t 1055 1.1 christos internal_function 1056 1.1 christos re_node_set_init_union (re_node_set *dest, const re_node_set *src1, 1057 1.1 christos const re_node_set *src2) 1058 1.1 christos { 1059 1.1 christos Idx i1, i2, id; 1060 1.1 christos if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0) 1061 1.1 christos { 1062 1.1 christos dest->alloc = src1->nelem + src2->nelem; 1063 1.1 christos if (sizeof (Idx) < 2 && dest->alloc < src1->nelem) 1064 1.1 christos return REG_ESPACE; 1065 1.1 christos dest->elems = re_xmalloc (Idx, dest->alloc); 1066 1.1 christos if (BE (dest->elems == NULL, 0)) 1067 1.1 christos return REG_ESPACE; 1068 1.1 christos } 1069 1.1 christos else 1070 1.1 christos { 1071 1.1 christos if (src1 != NULL && src1->nelem > 0) 1072 1.1 christos return re_node_set_init_copy (dest, src1); 1073 1.1 christos else if (src2 != NULL && src2->nelem > 0) 1074 1.1 christos return re_node_set_init_copy (dest, src2); 1075 1.1 christos else 1076 1.1 christos re_node_set_init_empty (dest); 1077 1.1 christos return REG_NOERROR; 1078 1.1 christos } 1079 1.1 christos for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;) 1080 1.1 christos { 1081 1.1 christos if (src1->elems[i1] > src2->elems[i2]) 1082 1.1 christos { 1083 1.1 christos dest->elems[id++] = src2->elems[i2++]; 1084 1.1 christos continue; 1085 1.1 christos } 1086 1.1 christos if (src1->elems[i1] == src2->elems[i2]) 1087 1.1 christos ++i2; 1088 1.1 christos dest->elems[id++] = src1->elems[i1++]; 1089 1.1 christos } 1090 1.1 christos if (i1 < src1->nelem) 1091 1.1 christos { 1092 1.1 christos memcpy (dest->elems + id, src1->elems + i1, 1093 1.1 christos (src1->nelem - i1) * sizeof dest->elems[0]); 1094 1.1 christos id += src1->nelem - i1; 1095 1.1 christos } 1096 1.1 christos else if (i2 < src2->nelem) 1097 1.1 christos { 1098 1.1 christos memcpy (dest->elems + id, src2->elems + i2, 1099 1.1 christos (src2->nelem - i2) * sizeof dest->elems[0]); 1100 1.1 christos id += src2->nelem - i2; 1101 1.1 christos } 1102 1.1 christos dest->nelem = id; 1103 1.1 christos return REG_NOERROR; 1104 1.1 christos } 1105 1.1 christos 1106 1.1 christos /* Calculate the union set of the sets DEST and SRC. And store it to 1107 1.1 christos DEST. Return value indicate the error code or REG_NOERROR if succeeded. */ 1108 1.1 christos 1109 1.1 christos static reg_errcode_t 1110 1.1 christos internal_function 1111 1.1 christos re_node_set_merge (re_node_set *dest, const re_node_set *src) 1112 1.1 christos { 1113 1.1 christos Idx is, id, sbase, delta; 1114 1.1 christos if (src == NULL || src->nelem == 0) 1115 1.1 christos return REG_NOERROR; 1116 1.1 christos if (sizeof (Idx) < 3 1117 1.1 christos && ((Idx) (2 * src->nelem) < src->nelem 1118 1.1 christos || (Idx) (2 * src->nelem + dest->nelem) < dest->nelem)) 1119 1.1 christos return REG_ESPACE; 1120 1.1 christos if (dest->alloc < 2 * src->nelem + dest->nelem) 1121 1.1 christos { 1122 1.1 christos Idx new_alloc = src->nelem + dest->alloc; 1123 1.1 christos Idx *new_buffer; 1124 1.1 christos if (sizeof (Idx) < 4 && new_alloc < dest->alloc) 1125 1.1 christos return REG_ESPACE; 1126 1.1 christos new_buffer = re_x2realloc (dest->elems, Idx, &new_alloc); 1127 1.1 christos if (BE (new_buffer == NULL, 0)) 1128 1.1 christos return REG_ESPACE; 1129 1.1 christos dest->elems = new_buffer; 1130 1.1 christos dest->alloc = new_alloc; 1131 1.1 christos } 1132 1.1 christos 1133 1.1 christos if (BE (dest->nelem == 0, 0)) 1134 1.1 christos { 1135 1.1 christos dest->nelem = src->nelem; 1136 1.1 christos memcpy (dest->elems, src->elems, src->nelem * sizeof dest->elems[0]); 1137 1.1 christos return REG_NOERROR; 1138 1.1 christos } 1139 1.1 christos 1140 1.1 christos /* Copy into the top of DEST the items of SRC that are not 1141 1.1 christos found in DEST. Maybe we could binary search in DEST? */ 1142 1.1 christos for (sbase = dest->nelem + 2 * src->nelem, 1143 1.1 christos is = src->nelem - 1, id = dest->nelem - 1; 1144 1.1 christos REG_VALID_INDEX (is) && REG_VALID_INDEX (id); ) 1145 1.1 christos { 1146 1.1 christos if (dest->elems[id] == src->elems[is]) 1147 1.1 christos is--, id--; 1148 1.1 christos else if (dest->elems[id] < src->elems[is]) 1149 1.1 christos dest->elems[--sbase] = src->elems[is--]; 1150 1.1 christos else /* if (dest->elems[id] > src->elems[is]) */ 1151 1.1 christos --id; 1152 1.1 christos } 1153 1.1 christos 1154 1.1 christos if (REG_VALID_INDEX (is)) 1155 1.1 christos { 1156 1.1 christos /* If DEST is exhausted, the remaining items of SRC must be unique. */ 1157 1.1 christos sbase -= is + 1; 1158 1.1 christos memcpy (dest->elems + sbase, src->elems, 1159 1.1 christos (is + 1) * sizeof dest->elems[0]); 1160 1.1 christos } 1161 1.1 christos 1162 1.1 christos id = dest->nelem - 1; 1163 1.1 christos is = dest->nelem + 2 * src->nelem - 1; 1164 1.1 christos delta = is - sbase + 1; 1165 1.1 christos if (delta == 0) 1166 1.1 christos return REG_NOERROR; 1167 1.1 christos 1168 1.1 christos /* Now copy. When DELTA becomes zero, the remaining 1169 1.1 christos DEST elements are already in place. */ 1170 1.1 christos dest->nelem += delta; 1171 1.1 christos for (;;) 1172 1.1 christos { 1173 1.1 christos if (dest->elems[is] > dest->elems[id]) 1174 1.1 christos { 1175 1.1 christos /* Copy from the top. */ 1176 1.1 christos dest->elems[id + delta--] = dest->elems[is--]; 1177 1.1 christos if (delta == 0) 1178 1.1 christos break; 1179 1.1 christos } 1180 1.1 christos else 1181 1.1 christos { 1182 1.1 christos /* Slide from the bottom. */ 1183 1.1 christos dest->elems[id + delta] = dest->elems[id]; 1184 1.1 christos if (! REG_VALID_INDEX (--id)) 1185 1.1 christos { 1186 1.1 christos /* Copy remaining SRC elements. */ 1187 1.1 christos memcpy (dest->elems, dest->elems + sbase, 1188 1.1 christos delta * sizeof dest->elems[0]); 1189 1.1 christos break; 1190 1.1 christos } 1191 1.1 christos } 1192 1.1 christos } 1193 1.1 christos 1194 1.1 christos return REG_NOERROR; 1195 1.1 christos } 1196 1.1 christos 1197 1.1 christos /* Insert the new element ELEM to the re_node_set* SET. 1198 1.1 christos SET should not already have ELEM. 1199 1.1 christos Return true if successful. */ 1200 1.1 christos 1201 1.1 christos static bool 1202 1.1 christos internal_function 1203 1.1 christos re_node_set_insert (re_node_set *set, Idx elem) 1204 1.1 christos { 1205 1.1 christos Idx idx; 1206 1.1 christos /* In case the set is empty. */ 1207 1.1 christos if (set->alloc == 0) 1208 1.1 christos return re_node_set_init_1 (set, elem) == REG_NOERROR; 1209 1.1 christos 1210 1.1 christos if (BE (set->nelem, 0) == 0) 1211 1.1 christos { 1212 1.1 christos /* We already guaranteed above that set->alloc != 0. */ 1213 1.1 christos set->elems[0] = elem; 1214 1.1 christos ++set->nelem; 1215 1.1 christos return true; 1216 1.1 christos } 1217 1.1 christos 1218 1.1 christos /* Realloc if we need. */ 1219 1.1 christos if (set->alloc == set->nelem) 1220 1.1 christos { 1221 1.1 christos Idx *new_elems = re_x2realloc (set->elems, Idx, &set->alloc); 1222 1.1 christos if (BE (new_elems == NULL, 0)) 1223 1.1 christos return false; 1224 1.1 christos set->elems = new_elems; 1225 1.1 christos } 1226 1.1 christos 1227 1.1 christos /* Move the elements which follows the new element. Test the 1228 1.1 christos first element separately to skip a check in the inner loop. */ 1229 1.1 christos if (elem < set->elems[0]) 1230 1.1 christos { 1231 1.1 christos idx = 0; 1232 1.1 christos for (idx = set->nelem; idx > 0; idx--) 1233 1.1 christos set->elems[idx] = set->elems[idx - 1]; 1234 1.1 christos } 1235 1.1 christos else 1236 1.1 christos { 1237 1.1 christos for (idx = set->nelem; set->elems[idx - 1] > elem; idx--) 1238 1.1 christos set->elems[idx] = set->elems[idx - 1]; 1239 1.1 christos } 1240 1.1 christos 1241 1.1 christos /* Insert the new element. */ 1242 1.1 christos set->elems[idx] = elem; 1243 1.1 christos ++set->nelem; 1244 1.1 christos return true; 1245 1.1 christos } 1246 1.1 christos 1247 1.1 christos /* Insert the new element ELEM to the re_node_set* SET. 1248 1.1 christos SET should not already have any element greater than or equal to ELEM. 1249 1.1 christos Return true if successful. */ 1250 1.1 christos 1251 1.1 christos static bool 1252 1.1 christos internal_function 1253 1.1 christos re_node_set_insert_last (re_node_set *set, Idx elem) 1254 1.1 christos { 1255 1.1 christos /* Realloc if we need. */ 1256 1.1 christos if (set->alloc == set->nelem) 1257 1.1 christos { 1258 1.1 christos Idx *new_elems; 1259 1.1 christos new_elems = re_x2realloc (set->elems, Idx, &set->alloc); 1260 1.1 christos if (BE (new_elems == NULL, 0)) 1261 1.1 christos return false; 1262 1.1 christos set->elems = new_elems; 1263 1.1 christos } 1264 1.1 christos 1265 1.1 christos /* Insert the new element. */ 1266 1.1 christos set->elems[set->nelem++] = elem; 1267 1.1 christos return true; 1268 1.1 christos } 1269 1.1 christos 1270 1.1 christos /* Compare two node sets SET1 and SET2. 1271 1.1 christos Return true if SET1 and SET2 are equivalent. */ 1272 1.1 christos 1273 1.1 christos static bool 1274 1.1 christos internal_function __attribute ((pure)) 1275 1.1 christos re_node_set_compare (const re_node_set *set1, const re_node_set *set2) 1276 1.1 christos { 1277 1.1 christos Idx i; 1278 1.1 christos if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem) 1279 1.1 christos return false; 1280 1.1 christos for (i = set1->nelem ; REG_VALID_INDEX (--i) ; ) 1281 1.1 christos if (set1->elems[i] != set2->elems[i]) 1282 1.1 christos return false; 1283 1.1 christos return true; 1284 1.1 christos } 1285 1.1 christos 1286 1.1 christos /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */ 1287 1.1 christos 1288 1.1 christos static Idx 1289 1.1 christos internal_function __attribute ((pure)) 1290 1.1 christos re_node_set_contains (const re_node_set *set, Idx elem) 1291 1.1 christos { 1292 1.1 christos __re_size_t idx, right, mid; 1293 1.1 christos if (! REG_VALID_NONZERO_INDEX (set->nelem)) 1294 1.1 christos return 0; 1295 1.1 christos 1296 1.1 christos /* Binary search the element. */ 1297 1.1 christos idx = 0; 1298 1.1 christos right = set->nelem - 1; 1299 1.1 christos while (idx < right) 1300 1.1 christos { 1301 1.1 christos mid = (idx + right) / 2; 1302 1.1 christos if (set->elems[mid] < elem) 1303 1.1 christos idx = mid + 1; 1304 1.1 christos else 1305 1.1 christos right = mid; 1306 1.1 christos } 1307 1.1 christos return set->elems[idx] == elem ? idx + 1 : 0; 1308 1.1 christos } 1309 1.1 christos 1310 1.1 christos static void 1311 1.1 christos internal_function 1312 1.1 christos re_node_set_remove_at (re_node_set *set, Idx idx) 1313 1.1 christos { 1314 1.1 christos if (idx < 0 || idx >= set->nelem) 1315 1.1 christos return; 1316 1.1 christos --set->nelem; 1317 1.1 christos for (; idx < set->nelem; idx++) 1318 1.1 christos set->elems[idx] = set->elems[idx + 1]; 1319 1.1 christos } 1320 1.1 christos 1321 1.1 christos 1323 1.1 christos /* Add the token TOKEN to dfa->nodes, and return the index of the token. 1324 1.1 christos Or return REG_MISSING if an error occurred. */ 1325 1.1 christos 1326 1.1 christos static Idx 1327 1.1 christos internal_function 1328 1.1 christos re_dfa_add_node (re_dfa_t *dfa, re_token_t token) 1329 1.1 christos { 1330 1.1 christos int type = token.type; 1331 1.1 christos if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0)) 1332 1.1 christos { 1333 1.1 christos Idx new_nodes_alloc = dfa->nodes_alloc; 1334 1.1 christos Idx *new_nexts, *new_indices; 1335 1.1 christos re_node_set *new_edests, *new_eclosures; 1336 1.1 christos 1337 1.1 christos re_token_t *new_nodes = re_x2realloc (dfa->nodes, re_token_t, 1338 1.1 christos &new_nodes_alloc); 1339 1.1 christos if (BE (new_nodes == NULL, 0)) 1340 1.1 christos return REG_MISSING; 1341 1.1 christos dfa->nodes = new_nodes; 1342 1.1 christos new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc); 1343 1.1 christos new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc); 1344 1.1 christos new_edests = re_xrealloc (dfa->edests, re_node_set, new_nodes_alloc); 1345 1.1 christos new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc); 1346 1.1 christos if (BE (new_nexts == NULL || new_indices == NULL 1347 1.1 christos || new_edests == NULL || new_eclosures == NULL, 0)) 1348 1.1 christos return REG_MISSING; 1349 1.1 christos dfa->nexts = new_nexts; 1350 1.1 christos dfa->org_indices = new_indices; 1351 1.1 christos dfa->edests = new_edests; 1352 1.1 christos dfa->eclosures = new_eclosures; 1353 1.1 christos dfa->nodes_alloc = new_nodes_alloc; 1354 1.1 christos } 1355 1.1 christos dfa->nodes[dfa->nodes_len] = token; 1356 1.1 christos dfa->nodes[dfa->nodes_len].constraint = 0; 1357 1.1 christos #ifdef RE_ENABLE_I18N 1358 1.1 christos dfa->nodes[dfa->nodes_len].accept_mb = 1359 1.1 christos (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET; 1360 1.1 christos #endif 1361 1.1 christos dfa->nexts[dfa->nodes_len] = REG_MISSING; 1362 1.1 christos re_node_set_init_empty (dfa->edests + dfa->nodes_len); 1363 1.1 christos re_node_set_init_empty (dfa->eclosures + dfa->nodes_len); 1364 1.1 christos return dfa->nodes_len++; 1365 1.1 christos } 1366 1.1 christos 1367 1.1 christos static inline re_hashval_t 1368 1.1 christos internal_function 1369 1.1 christos calc_state_hash (const re_node_set *nodes, unsigned int context) 1370 1.1 christos { 1371 1.1 christos re_hashval_t hash = nodes->nelem + context; 1372 1.1 christos Idx i; 1373 1.1 christos for (i = 0 ; i < nodes->nelem ; i++) 1374 1.1 christos hash += nodes->elems[i]; 1375 1.1 christos return hash; 1376 1.1 christos } 1377 1.1 christos 1378 1.1 christos /* Search for the state whose node_set is equivalent to NODES. 1379 1.1 christos Return the pointer to the state, if we found it in the DFA. 1380 1.1 christos Otherwise create the new one and return it. In case of an error 1381 1.1 christos return NULL and set the error code in ERR. 1382 1.1 christos Note: - We assume NULL as the invalid state, then it is possible that 1383 1.1 christos return value is NULL and ERR is REG_NOERROR. 1384 1.1 christos - We never return non-NULL value in case of any errors, it is for 1385 1.1 christos optimization. */ 1386 1.1 christos 1387 1.1 christos static re_dfastate_t* 1388 1.1 christos internal_function 1389 1.1 christos re_acquire_state (reg_errcode_t *err, re_dfa_t *dfa, const re_node_set *nodes) 1390 1.1 christos { 1391 1.1 christos re_hashval_t hash; 1392 1.1 christos re_dfastate_t *new_state; 1393 1.1 christos struct re_state_table_entry *spot; 1394 1.1 christos Idx i; 1395 1.1 christos #ifdef lint 1396 1.1 christos /* Suppress bogus uninitialized-variable warnings. */ 1397 1.1 christos *err = REG_NOERROR; 1398 1.1 christos #endif 1399 1.1 christos if (BE (nodes->nelem == 0, 0)) 1400 1.1 christos { 1401 1.1 christos *err = REG_NOERROR; 1402 1.1 christos return NULL; 1403 1.1 christos } 1404 1.1 christos hash = calc_state_hash (nodes, 0); 1405 1.1 christos spot = dfa->state_table + (hash & dfa->state_hash_mask); 1406 1.1 christos 1407 1.1 christos for (i = 0 ; i < spot->num ; i++) 1408 1.1 christos { 1409 1.1 christos re_dfastate_t *state = spot->array[i]; 1410 1.1 christos if (hash != state->hash) 1411 1.1 christos continue; 1412 1.1 christos if (re_node_set_compare (&state->nodes, nodes)) 1413 1.1 christos return state; 1414 1.1 christos } 1415 1.1 christos 1416 1.1 christos /* There are no appropriate state in the dfa, create the new one. */ 1417 1.1 christos new_state = create_ci_newstate (dfa, nodes, hash); 1418 1.1 christos if (BE (new_state != NULL, 1)) 1419 1.1 christos return new_state; 1420 1.1 christos else 1421 1.1 christos { 1422 1.1 christos *err = REG_ESPACE; 1423 1.1 christos return NULL; 1424 1.1 christos } 1425 1.1 christos } 1426 1.1 christos 1427 1.1 christos /* Search for the state whose node_set is equivalent to NODES and 1428 1.1 christos whose context is equivalent to CONTEXT. 1429 1.1 christos Return the pointer to the state, if we found it in the DFA. 1430 1.1 christos Otherwise create the new one and return it. In case of an error 1431 1.1 christos return NULL and set the error code in ERR. 1432 1.1 christos Note: - We assume NULL as the invalid state, then it is possible that 1433 1.1 christos return value is NULL and ERR is REG_NOERROR. 1434 1.1 christos - We never return non-NULL value in case of any errors, it is for 1435 1.1 christos optimization. */ 1436 1.1 christos 1437 1.1 christos static re_dfastate_t* 1438 1.1 christos internal_function 1439 1.1 christos re_acquire_state_context (reg_errcode_t *err, re_dfa_t *dfa, 1440 1.1 christos const re_node_set *nodes, unsigned int context) 1441 1.1 christos { 1442 1.1 christos re_hashval_t hash; 1443 1.1 christos re_dfastate_t *new_state; 1444 1.1 christos struct re_state_table_entry *spot; 1445 1.1 christos Idx i; 1446 1.1 christos #ifdef lint 1447 1.1 christos /* Suppress bogus uninitialized-variable warnings. */ 1448 1.1 christos *err = REG_NOERROR; 1449 1.1 christos #endif 1450 1.1 christos if (nodes->nelem == 0) 1451 1.1 christos { 1452 1.1 christos *err = REG_NOERROR; 1453 1.1 christos return NULL; 1454 1.1 christos } 1455 1.1 christos hash = calc_state_hash (nodes, context); 1456 1.1 christos spot = dfa->state_table + (hash & dfa->state_hash_mask); 1457 1.1 christos 1458 1.1 christos for (i = 0 ; i < spot->num ; i++) 1459 1.1 christos { 1460 1.1 christos re_dfastate_t *state = spot->array[i]; 1461 1.1 christos if (state->hash == hash 1462 1.1 christos && state->context == context 1463 1.1 christos && re_node_set_compare (state->entrance_nodes, nodes)) 1464 1.1 christos return state; 1465 1.1 christos } 1466 1.1 christos /* There are no appropriate state in `dfa', create the new one. */ 1467 1.1 christos new_state = create_cd_newstate (dfa, nodes, context, hash); 1468 1.1 christos if (BE (new_state != NULL, 1)) 1469 1.1 christos return new_state; 1470 1.1 christos else 1471 1.1 christos { 1472 1.1 christos *err = REG_ESPACE; 1473 1.1 christos return NULL; 1474 1.1 christos } 1475 1.1 christos } 1476 1.1 christos 1477 1.1 christos /* Finish initialization of the new state NEWSTATE, and using its hash value 1478 1.1 christos HASH put in the appropriate bucket of DFA's state table. Return value 1479 1.1 christos indicates the error code if failed. */ 1480 1.1 christos 1481 1.1 christos static reg_errcode_t 1482 1.1 christos internal_function 1483 1.1 christos register_state (const re_dfa_t *dfa, re_dfastate_t *newstate, re_hashval_t hash) 1484 1.1 christos { 1485 1.1 christos struct re_state_table_entry *spot; 1486 1.1 christos reg_errcode_t err; 1487 1.1 christos Idx i; 1488 1.1 christos 1489 1.1 christos newstate->hash = hash; 1490 1.1 christos err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem); 1491 1.1 christos if (BE (err != REG_NOERROR, 0)) 1492 1.1 christos return REG_ESPACE; 1493 1.1 christos for (i = 0; i < newstate->nodes.nelem; i++) 1494 1.1 christos { 1495 1.1 christos Idx elem = newstate->nodes.elems[i]; 1496 1.1 christos if (!IS_EPSILON_NODE (dfa->nodes[elem].type)) 1497 1.1 christos { 1498 1.1 christos bool ok = re_node_set_insert_last (&newstate->non_eps_nodes, elem); 1499 1.1 christos if (BE (! ok, 0)) 1500 1.1 christos return REG_ESPACE; 1501 1.1 christos } 1502 1.1 christos } 1503 1.1 christos 1504 1.1 christos spot = dfa->state_table + (hash & dfa->state_hash_mask); 1505 1.1 christos if (BE (spot->alloc <= spot->num, 0)) 1506 1.1 christos { 1507 1.1 christos Idx new_alloc = spot->num; 1508 1.1 christos re_dfastate_t **new_array = re_x2realloc (spot->array, re_dfastate_t *, 1509 1.1 christos &new_alloc); 1510 1.1 christos if (BE (new_array == NULL, 0)) 1511 1.1 christos return REG_ESPACE; 1512 1.1 christos spot->array = new_array; 1513 1.1 christos spot->alloc = new_alloc; 1514 1.1 christos } 1515 1.1 christos spot->array[spot->num++] = newstate; 1516 1.1 christos return REG_NOERROR; 1517 1.1 christos } 1518 1.1 christos 1519 1.1 christos /* Create the new state which is independ of contexts. 1520 1.1 christos Return the new state if succeeded, otherwise return NULL. */ 1521 1.1 christos 1522 1.1 christos static re_dfastate_t * 1523 1.1 christos internal_function 1524 1.1 christos create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes, 1525 1.1 christos re_hashval_t hash) 1526 1.1 christos { 1527 1.1 christos Idx i; 1528 1.1 christos reg_errcode_t err; 1529 1.1 christos re_dfastate_t *newstate; 1530 1.1 christos 1531 1.1 christos newstate = re_calloc (re_dfastate_t, 1); 1532 1.1 christos if (BE (newstate == NULL, 0)) 1533 1.1 christos return NULL; 1534 1.1 christos err = re_node_set_init_copy (&newstate->nodes, nodes); 1535 1.1 christos if (BE (err != REG_NOERROR, 0)) 1536 1.1 christos { 1537 1.1 christos re_free (newstate); 1538 1.1 christos return NULL; 1539 1.1 christos } 1540 1.1 christos 1541 1.1 christos newstate->entrance_nodes = &newstate->nodes; 1542 1.1 christos for (i = 0 ; i < nodes->nelem ; i++) 1543 1.1 christos { 1544 1.1 christos re_token_t *node = dfa->nodes + nodes->elems[i]; 1545 1.1 christos re_token_type_t type = node->type; 1546 1.1 christos if (type == CHARACTER && !node->constraint) 1547 1.1 christos continue; 1548 1.1 christos #ifdef RE_ENABLE_I18N 1549 1.1 christos newstate->accept_mb |= node->accept_mb; 1550 1.1 christos #endif /* RE_ENABLE_I18N */ 1551 1.1 christos 1552 1.1 christos /* If the state has the halt node, the state is a halt state. */ 1553 1.1 christos if (type == END_OF_RE) 1554 1.1 christos newstate->halt = 1; 1555 1.1 christos else if (type == OP_BACK_REF) 1556 1.1 christos newstate->has_backref = 1; 1557 1.1 christos else if (type == ANCHOR || node->constraint) 1558 1.1 christos newstate->has_constraint = 1; 1559 1.1 christos } 1560 1.1 christos err = register_state (dfa, newstate, hash); 1561 1.1 christos if (BE (err != REG_NOERROR, 0)) 1562 1.1 christos { 1563 1.1 christos free_state (newstate); 1564 1.1 christos newstate = NULL; 1565 1.1 christos } 1566 1.1 christos return newstate; 1567 1.1 christos } 1568 1.1 christos 1569 1.1 christos /* Create the new state which is depend on the context CONTEXT. 1570 1.1 christos Return the new state if succeeded, otherwise return NULL. */ 1571 1.1 christos 1572 1.1 christos static re_dfastate_t * 1573 1.1 christos internal_function 1574 1.1 christos create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes, 1575 1.1 christos unsigned int context, re_hashval_t hash) 1576 1.1 christos { 1577 1.1 christos Idx i, nctx_nodes = 0; 1578 1.1 christos reg_errcode_t err; 1579 1.1 christos re_dfastate_t *newstate; 1580 1.1 christos 1581 1.1 christos newstate = re_calloc (re_dfastate_t, 1); 1582 1.1 christos if (BE (newstate == NULL, 0)) 1583 1.1 christos return NULL; 1584 1.1 christos err = re_node_set_init_copy (&newstate->nodes, nodes); 1585 1.1 christos if (BE (err != REG_NOERROR, 0)) 1586 1.1 christos { 1587 1.1 christos re_free (newstate); 1588 1.1 christos return NULL; 1589 1.1 christos } 1590 1.1 christos 1591 1.1 christos newstate->context = context; 1592 1.1 christos newstate->entrance_nodes = &newstate->nodes; 1593 1.1 christos 1594 1.1 christos for (i = 0 ; i < nodes->nelem ; i++) 1595 1.1 christos { 1596 1.1 christos unsigned int constraint = 0; 1597 1.1 christos re_token_t *node = dfa->nodes + nodes->elems[i]; 1598 1.1 christos re_token_type_t type = node->type; 1599 1.1 christos if (node->constraint) 1600 1.1 christos constraint = node->constraint; 1601 1.1 christos 1602 1.1 christos if (type == CHARACTER && !constraint) 1603 1.1 christos continue; 1604 1.1 christos #ifdef RE_ENABLE_I18N 1605 1.1 christos newstate->accept_mb |= node->accept_mb; 1606 1.1 christos #endif /* RE_ENABLE_I18N */ 1607 1.1 christos 1608 1.1 christos /* If the state has the halt node, the state is a halt state. */ 1609 1.1 christos if (type == END_OF_RE) 1610 1.1 christos newstate->halt = 1; 1611 1.1 christos else if (type == OP_BACK_REF) 1612 1.1 christos newstate->has_backref = 1; 1613 1.1 christos else if (type == ANCHOR) 1614 1.1 christos constraint = node->opr.ctx_type; 1615 1.1 christos 1616 1.1 christos if (constraint) 1617 1.1 christos { 1618 1.1 christos if (newstate->entrance_nodes == &newstate->nodes) 1619 1.1 christos { 1620 1.1 christos newstate->entrance_nodes = re_malloc (re_node_set, 1); 1621 1.1 christos if (BE (newstate->entrance_nodes == NULL, 0)) 1622 1.1 christos { 1623 1.1 christos free_state (newstate); 1624 1.1 christos return NULL; 1625 1.1 christos } 1626 1.1 christos re_node_set_init_copy (newstate->entrance_nodes, nodes); 1627 1.1 christos nctx_nodes = 0; 1628 1.1 christos newstate->has_constraint = 1; 1629 1.1 christos } 1630 1.1 christos 1631 1.1 christos if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context)) 1632 1.1 christos { 1633 1.1 christos re_node_set_remove_at (&newstate->nodes, i - nctx_nodes); 1634 1.1 christos ++nctx_nodes; 1635 1.1 christos } 1636 1.1 christos } 1637 1.1 christos } 1638 1.1 christos err = register_state (dfa, newstate, hash); 1639 1.1 christos if (BE (err != REG_NOERROR, 0)) 1640 1.1 christos { 1641 1.1 christos free_state (newstate); 1642 1.1 christos newstate = NULL; 1643 1.1 christos } 1644 1.1 christos return newstate; 1645 1.1 christos } 1646 1.1 christos 1647 1.1 christos static void 1648 1.1 christos internal_function 1649 1.1 christos free_state (re_dfastate_t *state) 1650 1.1 christos { 1651 1.1 christos re_node_set_free (&state->non_eps_nodes); 1652 1.1 christos re_node_set_free (&state->inveclosure); 1653 1.1 christos if (state->entrance_nodes != &state->nodes) 1654 1.1 christos { 1655 1.1 christos re_node_set_free (state->entrance_nodes); 1656 1.1 christos re_free (state->entrance_nodes); 1657 1.1 christos } 1658 1.1 christos re_node_set_free (&state->nodes); 1659 1.1 christos re_free (state->word_trtable); 1660 re_free (state->trtable); 1661 re_free (state); 1662 } 1663