Home | History | Annotate | Line # | Download | only in lib
      1 /*
      2   tre_regexec.c - TRE POSIX compatible matching functions (and more).
      3 
      4   This software is released under a BSD-style license.
      5   See the file LICENSE for details and copyright.
      6 
      7 */
      8 
      9 #ifdef HAVE_CONFIG_H
     10 #include <config.h>
     11 #endif /* HAVE_CONFIG_H */
     12 
     13 #include "tre-alloca.h"
     14 
     15 #include <assert.h>
     16 #include <stdlib.h>
     17 #include <string.h>
     18 #ifdef HAVE_WCHAR_H
     19 #include <wchar.h>
     20 #endif /* HAVE_WCHAR_H */
     21 #ifdef HAVE_WCTYPE_H
     22 #include <wctype.h>
     23 #endif /* HAVE_WCTYPE_H */
     24 #ifndef TRE_WCHAR
     25 #include <ctype.h>
     26 #endif /* !TRE_WCHAR */
     27 #ifdef HAVE_MALLOC_H
     28 #include <malloc.h>
     29 #endif /* HAVE_MALLOC_H */
     30 #include <limits.h>
     31 
     32 #include "tre-internal.h"
     33 #include "tre.h"
     34 #include "xmalloc.h"
     35 
     36 /* Fills the POSIX.2 regmatch_t array according to the TNFA tag and match
     37    endpoint values. */
     38 void
     39 tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
     40 		const tre_tnfa_t *tnfa, int *tags, int match_eo)
     41 {
     42   tre_submatch_data_t *submatch_data;
     43   unsigned int i, j;
     44   int *parents;
     45 
     46   if (cflags & REG_NOSUB)
     47     return;
     48 
     49   i = 0;
     50   if (match_eo >= 0)
     51     {
     52       /* Construct submatch offsets from the tags. */
     53       DPRINT(("end tag = t%d = %d\n", tnfa->end_tag, match_eo));
     54       submatch_data = tnfa->submatch_data;
     55       while (i < tnfa->num_submatches && i < nmatch)
     56 	{
     57 	  if (submatch_data[i].so_tag == tnfa->end_tag)
     58 	    pmatch[i].rm_so = match_eo;
     59 	  else
     60 	    pmatch[i].rm_so = tags[submatch_data[i].so_tag];
     61 
     62 	  if (submatch_data[i].eo_tag == tnfa->end_tag)
     63 	    pmatch[i].rm_eo = match_eo;
     64 	  else
     65 	    pmatch[i].rm_eo = tags[submatch_data[i].eo_tag];
     66 
     67 	  /* If either of the endpoints were not used, this submatch
     68 	     was not part of the match. */
     69 	  if (pmatch[i].rm_so == -1 || pmatch[i].rm_eo == -1)
     70 	    pmatch[i].rm_so = pmatch[i].rm_eo = -1;
     71 
     72 	  DPRINT(("pmatch[%d] = {t%d = %d, t%d = %d}\n", i,
     73 		  submatch_data[i].so_tag, pmatch[i].rm_so,
     74 		  submatch_data[i].eo_tag, pmatch[i].rm_eo));
     75 	  i++;
     76 	}
     77       /* Reset all submatches that are not within all of their parent
     78 	 submatches. */
     79       i = 0;
     80       while (i < tnfa->num_submatches && i < nmatch)
     81 	{
     82 	  if (pmatch[i].rm_eo == -1)
     83 	    assert(pmatch[i].rm_so == -1);
     84 	  assert(pmatch[i].rm_so <= pmatch[i].rm_eo);
     85 
     86 	  parents = submatch_data[i].parents;
     87 	  if (parents != NULL)
     88 	    for (j = 0; parents[j] >= 0; j++)
     89 	      {
     90 		DPRINT(("pmatch[%d] parent %d\n", i, parents[j]));
     91 		if (pmatch[i].rm_so < pmatch[parents[j]].rm_so
     92 		    || pmatch[i].rm_eo > pmatch[parents[j]].rm_eo)
     93 		  pmatch[i].rm_so = pmatch[i].rm_eo = -1;
     94 	      }
     95 	  i++;
     96 	}
     97     }
     98 
     99   while (i < nmatch)
    100     {
    101       pmatch[i].rm_so = -1;
    102       pmatch[i].rm_eo = -1;
    103       i++;
    104     }
    105 }
    106 
    107 
    108 /*
    109   Wrapper functions for POSIX compatible regexp matching.
    110 */
    111 
    112 int
    113 tre_have_backrefs(const regex_t *preg)
    114 {
    115   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
    116   return tnfa->have_backrefs;
    117 }
    118 
    119 int
    120 tre_have_approx(const regex_t *preg)
    121 {
    122   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
    123   return tnfa->have_approx;
    124 }
    125 
    126 static int
    127 tre_match(const tre_tnfa_t *tnfa, const void *string, size_t len,
    128 	  tre_str_type_t type, size_t nmatch, regmatch_t pmatch[],
    129 	  int eflags)
    130 {
    131   reg_errcode_t status;
    132   int *tags = NULL, eo;
    133   if (tnfa->cflags & REG_NOSUB) nmatch = 0;
    134   if (tnfa->num_tags > 0 && nmatch > 0)
    135     {
    136 #ifdef TRE_USE_ALLOCA
    137       tags = alloca(sizeof(*tags) * tnfa->num_tags);
    138 #else /* !TRE_USE_ALLOCA */
    139       tags = xmalloc(sizeof(*tags) * tnfa->num_tags);
    140 #endif /* !TRE_USE_ALLOCA */
    141       if (tags == NULL)
    142 	return REG_ESPACE;
    143     }
    144 
    145   /* Dispatch to the appropriate matcher. */
    146   if (tnfa->have_backrefs || eflags & REG_BACKTRACKING_MATCHER)
    147     {
    148       /* The regex has back references, use the backtracking matcher. */
    149       if (type == STR_USER)
    150 	{
    151 	  const tre_str_source *source = string;
    152 	  if (source->rewind == NULL || source->compare == NULL)
    153 	    /* The backtracking matcher requires rewind and compare
    154 	       capabilities from the input stream. */
    155 	    return REG_BADPAT;
    156 	}
    157       status = tre_tnfa_run_backtrack(tnfa, string, (int)len, type,
    158 				      tags, eflags, &eo);
    159     }
    160 #ifdef TRE_APPROX
    161   else if (tnfa->have_approx || eflags & REG_APPROX_MATCHER)
    162     {
    163       /* The regex uses approximate matching, use the approximate matcher. */
    164       regamatch_t match;
    165       regaparams_t params;
    166       tre_regaparams_default(&params);
    167       params.max_err = 0;
    168       params.max_cost = 0;
    169       status = tre_tnfa_run_approx(tnfa, string, (int)len, type, tags,
    170 				   &match, params, eflags, &eo);
    171     }
    172 #endif /* TRE_APPROX */
    173   else
    174     {
    175       /* Exact matching, no back references, use the parallel matcher. */
    176       status = tre_tnfa_run_parallel(tnfa, string, (int)len, type,
    177 				     tags, eflags, &eo);
    178     }
    179 
    180   if (status == REG_OK)
    181     /* A match was found, so fill the submatch registers. */
    182     tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo);
    183 #ifndef TRE_USE_ALLOCA
    184   if (tags)
    185     xfree(tags);
    186 #endif /* !TRE_USE_ALLOCA */
    187   return status;
    188 }
    189 
    190 int
    191 tre_regnexec(const regex_t *preg, const char *str, size_t len,
    192 	 size_t nmatch, regmatch_t pmatch[], int eflags)
    193 {
    194   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
    195   /* CONSTCOND */
    196   tre_str_type_t type = (TRE_MB_CUR_MAX == 1) ? STR_BYTE : STR_MBS;
    197 
    198   return tre_match(tnfa, str, len, type, nmatch, pmatch, eflags);
    199 }
    200 
    201 int
    202 tre_regexec(const regex_t *preg, const char *str,
    203 	size_t nmatch, regmatch_t pmatch[], int eflags)
    204 {
    205 	size_t shift, len, i;
    206 	int ret;
    207 	tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
    208 	regmatch_t *p;
    209 
    210 	if (eflags & REG_STARTEND) {
    211 		if (pmatch == NULL || pmatch->rm_so < 0 || pmatch->rm_eo < 0
    212 		    || pmatch->rm_so > pmatch->rm_eo)
    213 			return REG_INVARG;
    214 		str += shift = pmatch->rm_so;
    215 		len = pmatch->rm_eo - pmatch->rm_so;
    216 		eflags &= ~REG_STARTEND;
    217 	} else {
    218 		shift = 0;
    219 		len = (size_t)-1;
    220 	}
    221 
    222 	ret = tre_regnexec(preg, str, len, nmatch, pmatch, eflags);
    223 
    224 	if (!ret && !(tnfa->cflags & REG_NOSUB) && len != (size_t)-1) {
    225 		for (i = nmatch, p = pmatch; i > 0; p++, i--) {
    226 			if (p->rm_so >= 0) p->rm_so += shift;
    227 			if (p->rm_eo >= 0) p->rm_eo += shift;
    228 		}
    229 	}
    230 
    231 	return ret;
    232 }
    233 
    234 #ifdef REG_USEBYTES
    235 int
    236 tre_regexecb(const regex_t *preg, const char *str,
    237         size_t nmatch, regmatch_t pmatch[], int eflags)
    238 {
    239   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
    240 
    241   return tre_match(tnfa, str, (unsigned)-1, STR_BYTE, nmatch, pmatch, eflags);
    242 }
    243 
    244 int
    245 tre_regnexecb(const regex_t *preg, const char *str, size_t len,
    246         size_t nmatch, regmatch_t pmatch[], int eflags)
    247 {
    248   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
    249 
    250   return tre_match(tnfa, str, len, STR_BYTE, nmatch, pmatch, eflags);
    251 }
    252 #endif /* REG_USEBYTES */
    253 
    254 
    255 #ifdef TRE_WCHAR
    256 
    257 int
    258 tre_regwnexec(const regex_t *preg, const wchar_t *str, size_t len,
    259 	  size_t nmatch, regmatch_t pmatch[], int eflags)
    260 {
    261   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
    262   return tre_match(tnfa, str, len, STR_WIDE, nmatch, pmatch, eflags);
    263 }
    264 
    265 int
    266 tre_regwexec(const regex_t *preg, const wchar_t *str,
    267 	 size_t nmatch, regmatch_t pmatch[], int eflags)
    268 {
    269 	size_t shift, len, i;
    270 	int ret;
    271 	tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
    272 	regmatch_t *p;
    273 
    274 	if (eflags & REG_STARTEND) {
    275 		if (pmatch == NULL || pmatch->rm_so < 0 || pmatch->rm_eo < 0
    276 		    || pmatch->rm_so > pmatch->rm_eo)
    277 			return REG_INVARG;
    278 		str += shift = pmatch->rm_so;
    279 		len = pmatch->rm_eo - pmatch->rm_so;
    280 		eflags &= ~REG_STARTEND;
    281 	} else {
    282 		shift = 0;
    283 		len = (size_t)-1;
    284 	}
    285 
    286 	ret = tre_regwnexec(preg, str, len, nmatch, pmatch, eflags);
    287 
    288 	if (!ret && !(tnfa->cflags & REG_NOSUB) && len != (size_t)-1) {
    289 		for (i = nmatch, p = pmatch; i > 0; p++, i--) {
    290 			if (p->rm_so >= 0) p->rm_so += shift;
    291 			if (p->rm_eo >= 0) p->rm_eo += shift;
    292 		}
    293 	}
    294 
    295 	return ret;
    296 }
    297 
    298 #endif /* TRE_WCHAR */
    299 
    300 int
    301 tre_reguexec(const regex_t *preg, const tre_str_source *str,
    302 	 size_t nmatch, regmatch_t pmatch[], int eflags)
    303 {
    304   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
    305   return tre_match(tnfa, str, (unsigned)-1, STR_USER, nmatch, pmatch, eflags);
    306 }
    307 
    308 
    309 #ifdef TRE_APPROX
    310 
    311 /*
    312   Wrapper functions for approximate regexp matching.
    313 */
    314 
    315 static int
    316 tre_match_approx(const tre_tnfa_t *tnfa, const void *string, size_t len,
    317 		 tre_str_type_t type, regamatch_t *match, regaparams_t params,
    318 		 int eflags)
    319 {
    320   reg_errcode_t status;
    321   int *tags = NULL, eo;
    322   size_t nmatch;
    323 
    324   if (tnfa->cflags & REG_NOSUB)
    325     nmatch = 0;
    326   else
    327     nmatch = match->nmatch;
    328 
    329   /* If the regexp does not use approximate matching features, the
    330      maximum cost is zero, and the approximate matcher isn't forced,
    331      use the exact matcher instead. */
    332   if (params.max_cost == 0 && !tnfa->have_approx
    333       && !(eflags & REG_APPROX_MATCHER))
    334     return tre_match(tnfa, string, len, type, nmatch, match->pmatch,
    335 		     eflags);
    336 
    337   /* Back references are not supported by the approximate matcher. */
    338   if (tnfa->have_backrefs)
    339     return REG_BADPAT;
    340 
    341   if (tnfa->num_tags > 0 && nmatch > 0)
    342     {
    343 #if TRE_USE_ALLOCA
    344       tags = alloca(sizeof(*tags) * tnfa->num_tags);
    345 #else /* !TRE_USE_ALLOCA */
    346       tags = xmalloc(sizeof(*tags) * tnfa->num_tags);
    347 #endif /* !TRE_USE_ALLOCA */
    348       if (tags == NULL)
    349 	return REG_ESPACE;
    350     }
    351   status = tre_tnfa_run_approx(tnfa, string, (int)len, type, tags,
    352 			       match, params, eflags, &eo);
    353   if (status == REG_OK)
    354     tre_fill_pmatch(nmatch, match->pmatch, tnfa->cflags, tnfa, tags, eo);
    355 #ifndef TRE_USE_ALLOCA
    356   if (tags)
    357     xfree(tags);
    358 #endif /* !TRE_USE_ALLOCA */
    359   return status;
    360 }
    361 
    362 int
    363 tre_reganexec(const regex_t *preg, const char *str, size_t len,
    364 	  regamatch_t *match, regaparams_t params, int eflags)
    365 {
    366   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
    367   tre_str_type_t type = (TRE_MB_CUR_MAX == 1) ? STR_BYTE : STR_MBS;
    368 
    369   return tre_match_approx(tnfa, str, len, type, match, params, eflags);
    370 }
    371 
    372 int
    373 tre_regaexec(const regex_t *preg, const char *str,
    374 	 regamatch_t *match, regaparams_t params, int eflags)
    375 {
    376   return tre_reganexec(preg, str, (unsigned)-1, match, params, eflags);
    377 }
    378 
    379 #ifdef REG_USEBYTES
    380 int
    381 tre_regaexecb(const regex_t *preg, const char *str,
    382           regamatch_t *match, regaparams_t params, int eflags)
    383 {
    384   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
    385 
    386   return tre_match_approx(tnfa, str, (unsigned)-1, STR_BYTE,
    387                           match, params, eflags);
    388 }
    389 #endif /* REG_USEBYTES */
    390 
    391 #ifdef TRE_WCHAR
    392 
    393 int
    394 tre_regawnexec(const regex_t *preg, const wchar_t *str, size_t len,
    395 	   regamatch_t *match, regaparams_t params, int eflags)
    396 {
    397   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
    398   return tre_match_approx(tnfa, str, len, STR_WIDE,
    399 			  match, params, eflags);
    400 }
    401 
    402 int
    403 tre_regawexec(const regex_t *preg, const wchar_t *str,
    404 	  regamatch_t *match, regaparams_t params, int eflags)
    405 {
    406   return tre_regawnexec(preg, str, (unsigned)-1, match, params, eflags);
    407 }
    408 
    409 #endif /* TRE_WCHAR */
    410 
    411 void
    412 tre_regaparams_default(regaparams_t *params)
    413 {
    414   memset(params, 0, sizeof(*params));
    415   params->cost_ins = 1;
    416   params->cost_del = 1;
    417   params->cost_subst = 1;
    418   params->max_cost = INT_MAX;
    419   params->max_ins = INT_MAX;
    420   params->max_del = INT_MAX;
    421   params->max_subst = INT_MAX;
    422   params->max_err = INT_MAX;
    423 }
    424 
    425 #endif /* TRE_APPROX */
    426 
    427 /* EOF */
    428