Home | History | Annotate | Line # | Download | only in grep
fastgrep.c revision 1.5.56.1
      1       1.1     joerg /*	$OpenBSD: util.c,v 1.36 2007/10/02 17:59:18 otto Exp $	*/
      2       1.1     joerg /*	$FreeBSD: head/usr.bin/grep/fastgrep.c 211496 2010-08-19 09:28:59Z des $ */
      3       1.1     joerg 
      4       1.1     joerg /*-
      5       1.1     joerg  * Copyright (c) 1999 James Howard and Dag-Erling Codan Smrgrav
      6       1.1     joerg  * Copyright (C) 2008 Gabor Kovesdan <gabor (at) FreeBSD.org>
      7       1.1     joerg  * All rights reserved.
      8       1.1     joerg  *
      9       1.1     joerg  * Redistribution and use in source and binary forms, with or without
     10       1.1     joerg  * modification, are permitted provided that the following conditions
     11       1.1     joerg  * are met:
     12       1.1     joerg  * 1. Redistributions of source code must retain the above copyright
     13       1.1     joerg  *    notice, this list of conditions and the following disclaimer.
     14       1.1     joerg  * 2. Redistributions in binary form must reproduce the above copyright
     15       1.1     joerg  *    notice, this list of conditions and the following disclaimer in the
     16       1.1     joerg  *    documentation and/or other materials provided with the distribution.
     17       1.1     joerg  *
     18       1.1     joerg  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
     19       1.1     joerg  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     20       1.1     joerg  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     21       1.1     joerg  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
     22       1.1     joerg  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     23       1.1     joerg  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     24       1.1     joerg  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     25       1.1     joerg  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     26       1.1     joerg  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     27       1.1     joerg  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     28       1.1     joerg  * SUCH DAMAGE.
     29       1.1     joerg  */
     30       1.1     joerg 
     31       1.1     joerg /*
     32       1.1     joerg  * XXX: This file is a speed up for grep to cover the defects of the
     33       1.1     joerg  * regex library.  These optimizations should practically be implemented
     34       1.1     joerg  * there keeping this code clean.  This is a future TODO, but for the
     35       1.1     joerg  * meantime, we need to use this workaround.
     36       1.1     joerg  */
     37       1.1     joerg 
     38       1.5     joerg #if HAVE_NBTOOL_CONFIG_H
     39       1.5     joerg #include "nbtool_config.h"
     40       1.5     joerg #endif
     41       1.5     joerg 
     42       1.1     joerg #include <sys/cdefs.h>
     43  1.5.56.1  perseant __RCSID("$NetBSD: fastgrep.c,v 1.5.56.1 2025/08/02 05:58:28 perseant Exp $");
     44       1.1     joerg 
     45       1.1     joerg #include <limits.h>
     46       1.1     joerg #include <stdbool.h>
     47       1.1     joerg #include <stdlib.h>
     48       1.1     joerg #include <string.h>
     49       1.1     joerg #include <wchar.h>
     50       1.1     joerg #include <wctype.h>
     51       1.1     joerg 
     52       1.1     joerg #include "grep.h"
     53       1.1     joerg 
     54       1.1     joerg static inline int	grep_cmp(const unsigned char *, const unsigned char *, size_t);
     55       1.1     joerg static inline void	grep_revstr(unsigned char *, int);
     56       1.1     joerg 
     57  1.5.56.1  perseant static void
     58  1.5.56.1  perseant alloc_pattern(fastgrep_t *fg, const char *pat)
     59  1.5.56.1  perseant {
     60  1.5.56.1  perseant 	unsigned int i;
     61  1.5.56.1  perseant 
     62  1.5.56.1  perseant 	fg->pattern = (unsigned char *)grep_strdup(pat);
     63  1.5.56.1  perseant 	if (!iflag)
     64  1.5.56.1  perseant 		return;
     65  1.5.56.1  perseant 	for (i = 0;  fg->pattern[i]; i++)
     66  1.5.56.1  perseant 		fg->pattern[i] = towupper((unsigned char)fg->pattern[i]);
     67  1.5.56.1  perseant }
     68  1.5.56.1  perseant 
     69  1.5.56.1  perseant static void
     70  1.5.56.1  perseant map_pattern(fastgrep_t *fg, int hasDot)
     71       1.1     joerg {
     72       1.1     joerg 	unsigned int i;
     73       1.1     joerg 
     74  1.5.56.1  perseant 	for (i = 0; i <= UCHAR_MAX; i++)
     75  1.5.56.1  perseant 		fg->qsBc[i] = fg->len - hasDot;
     76  1.5.56.1  perseant 	for (i = hasDot + 1; i < fg->len; i++) {
     77  1.5.56.1  perseant 		unsigned char ch = fg->pattern[i];
     78  1.5.56.1  perseant 		if (iflag) {
     79  1.5.56.1  perseant 			fg->qsBc[ch] = fg->len - i;
     80  1.5.56.1  perseant 			ch = towlower(ch);
     81  1.5.56.1  perseant 		}
     82  1.5.56.1  perseant 		fg->qsBc[ch] = fg->len - i;
     83  1.5.56.1  perseant 	}
     84  1.5.56.1  perseant }
     85  1.5.56.1  perseant 
     86  1.5.56.1  perseant void
     87  1.5.56.1  perseant fgrepcomp(fastgrep_t *fg, const char *pat)
     88  1.5.56.1  perseant {
     89       1.1     joerg 	/* Initialize. */
     90       1.1     joerg 	fg->len = strlen(pat);
     91       1.1     joerg 	fg->bol = false;
     92       1.1     joerg 	fg->eol = false;
     93       1.1     joerg 	fg->reversed = false;
     94       1.1     joerg 
     95  1.5.56.1  perseant 	alloc_pattern(fg, pat);
     96       1.1     joerg 
     97       1.1     joerg 	/* Preprocess pattern. */
     98  1.5.56.1  perseant 	map_pattern(fg, 0);
     99       1.1     joerg }
    100       1.1     joerg 
    101       1.1     joerg /*
    102       1.1     joerg  * Returns: -1 on failure, 0 on success
    103       1.1     joerg  */
    104       1.1     joerg int
    105       1.1     joerg fastcomp(fastgrep_t *fg, const char *pat)
    106       1.1     joerg {
    107       1.1     joerg 	unsigned int i;
    108       1.1     joerg 	int firstHalfDot = -1;
    109       1.1     joerg 	int firstLastHalfDot = -1;
    110       1.1     joerg 	int hasDot = 0;
    111       1.1     joerg 	int lastHalfDot = 0;
    112       1.1     joerg 
    113       1.1     joerg 	/* Initialize. */
    114       1.1     joerg 	fg->len = strlen(pat);
    115       1.1     joerg 	fg->bol = false;
    116       1.1     joerg 	fg->eol = false;
    117       1.1     joerg 	fg->reversed = false;
    118       1.4     joerg 	fg->word = wflag;
    119       1.1     joerg 
    120       1.1     joerg 	/* Remove end-of-line character ('$'). */
    121       1.1     joerg 	if (fg->len > 0 && pat[fg->len - 1] == '$') {
    122       1.1     joerg 		fg->eol = true;
    123       1.1     joerg 		fg->len--;
    124       1.1     joerg 	}
    125       1.1     joerg 
    126       1.1     joerg 	/* Remove beginning-of-line character ('^'). */
    127       1.1     joerg 	if (pat[0] == '^') {
    128       1.1     joerg 		fg->bol = true;
    129       1.1     joerg 		fg->len--;
    130       1.3     joerg 		pat++;
    131       1.1     joerg 	}
    132       1.1     joerg 
    133       1.1     joerg 	if (fg->len >= 14 &&
    134       1.3     joerg 	    memcmp(pat, "[[:<:]]", 7) == 0 &&
    135       1.3     joerg 	    memcmp(pat + fg->len - 7, "[[:>:]]", 7) == 0) {
    136       1.1     joerg 		fg->len -= 14;
    137       1.3     joerg 		pat += 7;
    138       1.1     joerg 		/* Word boundary is handled separately in util.c */
    139       1.4     joerg 		fg->word = true;
    140       1.1     joerg 	}
    141       1.1     joerg 
    142       1.1     joerg 	/*
    143       1.3     joerg 	 * pat has been adjusted earlier to not include '^', '$' or
    144       1.3     joerg 	 * the word match character classes at the beginning and ending
    145       1.3     joerg 	 * of the string respectively.
    146       1.1     joerg 	 */
    147  1.5.56.1  perseant 	alloc_pattern(fg, pat);
    148       1.1     joerg 
    149       1.1     joerg 	/* Look for ways to cheat...er...avoid the full regex engine. */
    150       1.1     joerg 	for (i = 0; i < fg->len; i++) {
    151       1.1     joerg 		/* Can still cheat? */
    152       1.1     joerg 		if (fg->pattern[i] == '.') {
    153       1.1     joerg 			hasDot = i;
    154       1.1     joerg 			if (i < fg->len / 2) {
    155       1.1     joerg 				if (firstHalfDot < 0)
    156       1.1     joerg 					/* Closest dot to the beginning */
    157       1.1     joerg 					firstHalfDot = i;
    158       1.1     joerg 			} else {
    159       1.1     joerg 				/* Closest dot to the end of the pattern. */
    160       1.1     joerg 				lastHalfDot = i;
    161       1.1     joerg 				if (firstLastHalfDot < 0)
    162       1.1     joerg 					firstLastHalfDot = i;
    163       1.1     joerg 			}
    164       1.1     joerg 		} else {
    165       1.1     joerg 			/* Free memory and let others know this is empty. */
    166       1.1     joerg 			free(fg->pattern);
    167       1.1     joerg 			fg->pattern = NULL;
    168       1.1     joerg 			return (-1);
    169       1.1     joerg 		}
    170       1.1     joerg 	}
    171       1.1     joerg 
    172       1.1     joerg 	/*
    173       1.1     joerg 	 * Determine if a reverse search would be faster based on the placement
    174       1.1     joerg 	 * of the dots.
    175       1.1     joerg 	 */
    176       1.3     joerg 	if ((!(lflag || cflag)) && ((!(fg->bol || fg->eol)) &&
    177       1.1     joerg 	    ((lastHalfDot) && ((firstHalfDot < 0) ||
    178       1.1     joerg 	    ((fg->len - (lastHalfDot + 1)) < (size_t)firstHalfDot)))) &&
    179       1.1     joerg 	    !oflag && !color) {
    180       1.1     joerg 		fg->reversed = true;
    181       1.1     joerg 		hasDot = fg->len - (firstHalfDot < 0 ?
    182       1.1     joerg 		    firstLastHalfDot : firstHalfDot) - 1;
    183       1.1     joerg 		grep_revstr(fg->pattern, fg->len);
    184       1.1     joerg 	}
    185       1.1     joerg 
    186       1.1     joerg 	/*
    187       1.1     joerg 	 * Normal Quick Search would require a shift based on the position the
    188       1.1     joerg 	 * next character after the comparison is within the pattern.  With
    189       1.1     joerg 	 * wildcards, the position of the last dot effects the maximum shift
    190       1.1     joerg 	 * distance.
    191       1.1     joerg 	 * The closer to the end the wild card is the slower the search.  A
    192       1.1     joerg 	 * reverse version of this algorithm would be useful for wildcards near
    193       1.1     joerg 	 * the end of the string.
    194       1.1     joerg 	 *
    195       1.1     joerg 	 * Examples:
    196       1.1     joerg 	 * Pattern	Max shift
    197       1.1     joerg 	 * -------	---------
    198       1.1     joerg 	 * this		5
    199       1.1     joerg 	 * .his		4
    200       1.1     joerg 	 * t.is		3
    201       1.1     joerg 	 * th.s		2
    202       1.1     joerg 	 * thi.		1
    203       1.1     joerg 	 */
    204       1.1     joerg 
    205       1.1     joerg 	/* Preprocess pattern. */
    206  1.5.56.1  perseant 	map_pattern(fg, hasDot);
    207       1.1     joerg 
    208       1.1     joerg 	/*
    209       1.1     joerg 	 * Put pattern back to normal after pre-processing to allow for easy
    210       1.1     joerg 	 * comparisons later.
    211       1.1     joerg 	 */
    212       1.1     joerg 	if (fg->reversed)
    213       1.1     joerg 		grep_revstr(fg->pattern, fg->len);
    214       1.1     joerg 
    215       1.1     joerg 	return (0);
    216       1.1     joerg }
    217       1.1     joerg 
    218       1.1     joerg int
    219       1.1     joerg grep_search(fastgrep_t *fg, const unsigned char *data, size_t len, regmatch_t *pmatch)
    220       1.1     joerg {
    221       1.1     joerg 	unsigned int j;
    222       1.1     joerg 	int ret = REG_NOMATCH;
    223       1.1     joerg 
    224       1.1     joerg 	if (pmatch->rm_so == (ssize_t)len)
    225       1.1     joerg 		return (ret);
    226       1.1     joerg 
    227       1.1     joerg 	if (fg->bol && pmatch->rm_so != 0) {
    228       1.1     joerg 		pmatch->rm_so = len;
    229       1.1     joerg 		pmatch->rm_eo = len;
    230       1.1     joerg 		return (ret);
    231       1.1     joerg 	}
    232       1.1     joerg 
    233       1.1     joerg 	/* No point in going farther if we do not have enough data. */
    234       1.1     joerg 	if (len < fg->len)
    235       1.1     joerg 		return (ret);
    236       1.1     joerg 
    237       1.1     joerg 	/* Only try once at the beginning or ending of the line. */
    238       1.1     joerg 	if (fg->bol || fg->eol) {
    239       1.1     joerg 		/* Simple text comparison. */
    240       1.1     joerg 		/* Verify data is >= pattern length before searching on it. */
    241       1.1     joerg 		if (len >= fg->len) {
    242       1.1     joerg 			/* Determine where in data to start search at. */
    243       1.1     joerg 			j = fg->eol ? len - fg->len : 0;
    244       1.1     joerg 			if (!((fg->bol && fg->eol) && (len != fg->len)))
    245       1.1     joerg 				if (grep_cmp(fg->pattern, data + j,
    246       1.1     joerg 				    fg->len) == -1) {
    247       1.1     joerg 					pmatch->rm_so = j;
    248       1.1     joerg 					pmatch->rm_eo = j + fg->len;
    249       1.1     joerg 						ret = 0;
    250       1.1     joerg 				}
    251       1.1     joerg 		}
    252       1.1     joerg 	} else if (fg->reversed) {
    253       1.1     joerg 		/* Quick Search algorithm. */
    254       1.1     joerg 		j = len;
    255       1.1     joerg 		do {
    256       1.1     joerg 			if (grep_cmp(fg->pattern, data + j - fg->len,
    257       1.1     joerg 				fg->len) == -1) {
    258       1.1     joerg 				pmatch->rm_so = j - fg->len;
    259       1.1     joerg 				pmatch->rm_eo = j;
    260       1.1     joerg 				ret = 0;
    261       1.1     joerg 				break;
    262       1.1     joerg 			}
    263       1.1     joerg 			/* Shift if within bounds, otherwise, we are done. */
    264       1.1     joerg 			if (j == fg->len)
    265       1.1     joerg 				break;
    266       1.1     joerg 			j -= fg->qsBc[data[j - fg->len - 1]];
    267       1.1     joerg 		} while (j >= fg->len);
    268       1.1     joerg 	} else {
    269       1.1     joerg 		/* Quick Search algorithm. */
    270       1.1     joerg 		j = pmatch->rm_so;
    271       1.1     joerg 		do {
    272       1.1     joerg 			if (grep_cmp(fg->pattern, data + j, fg->len) == -1) {
    273       1.1     joerg 				pmatch->rm_so = j;
    274       1.1     joerg 				pmatch->rm_eo = j + fg->len;
    275       1.1     joerg 				ret = 0;
    276       1.1     joerg 				break;
    277       1.1     joerg 			}
    278       1.1     joerg 
    279       1.1     joerg 			/* Shift if within bounds, otherwise, we are done. */
    280       1.1     joerg 			if (j + fg->len == len)
    281       1.1     joerg 				break;
    282       1.1     joerg 			else
    283       1.1     joerg 				j += fg->qsBc[data[j + fg->len]];
    284       1.1     joerg 		} while (j <= (len - fg->len));
    285       1.1     joerg 	}
    286       1.1     joerg 
    287       1.1     joerg 	return (ret);
    288       1.1     joerg }
    289       1.1     joerg 
    290       1.1     joerg /*
    291       1.1     joerg  * Returns:	i >= 0 on failure (position that it failed)
    292       1.1     joerg  *		-1 on success
    293       1.1     joerg  */
    294       1.1     joerg static inline int
    295       1.1     joerg grep_cmp(const unsigned char *pat, const unsigned char *data, size_t len)
    296       1.1     joerg {
    297       1.1     joerg 	size_t size;
    298       1.1     joerg 	wchar_t *wdata, *wpat;
    299       1.1     joerg 	unsigned int i;
    300       1.1     joerg 
    301       1.1     joerg 	if (iflag) {
    302       1.1     joerg 		if ((size = mbstowcs(NULL, (const char *)data, 0)) ==
    303       1.1     joerg 		    ((size_t) - 1))
    304       1.1     joerg 			return (-1);
    305       1.1     joerg 
    306       1.1     joerg 		wdata = grep_malloc(size * sizeof(wint_t));
    307       1.1     joerg 
    308       1.1     joerg 		if (mbstowcs(wdata, (const char *)data, size) ==
    309       1.1     joerg 		    ((size_t) - 1))
    310       1.1     joerg 			return (-1);
    311       1.1     joerg 
    312       1.1     joerg 		if ((size = mbstowcs(NULL, (const char *)pat, 0)) ==
    313       1.1     joerg 		    ((size_t) - 1))
    314       1.1     joerg 			return (-1);
    315       1.1     joerg 
    316       1.1     joerg 		wpat = grep_malloc(size * sizeof(wint_t));
    317       1.1     joerg 
    318       1.1     joerg 		if (mbstowcs(wpat, (const char *)pat, size) == ((size_t) - 1))
    319       1.1     joerg 			return (-1);
    320       1.1     joerg 		for (i = 0; i < len; i++) {
    321       1.1     joerg 			if ((towlower(wpat[i]) == towlower(wdata[i])) ||
    322       1.1     joerg 			    ((grepbehave != GREP_FIXED) && wpat[i] == L'.'))
    323       1.1     joerg 				continue;
    324       1.1     joerg 			free(wpat);
    325       1.1     joerg 			free(wdata);
    326  1.5.56.1  perseant 			return (i);
    327       1.1     joerg 		}
    328       1.1     joerg 	} else {
    329       1.1     joerg 		for (i = 0; i < len; i++) {
    330       1.1     joerg 			if ((pat[i] == data[i]) || ((grepbehave != GREP_FIXED) &&
    331       1.1     joerg 			    pat[i] == '.'))
    332       1.1     joerg 				continue;
    333       1.1     joerg 			return (i);
    334       1.1     joerg 		}
    335       1.1     joerg 	}
    336       1.1     joerg 	return (-1);
    337       1.1     joerg }
    338       1.1     joerg 
    339       1.1     joerg static inline void
    340       1.1     joerg grep_revstr(unsigned char *str, int len)
    341       1.1     joerg {
    342       1.1     joerg 	int i;
    343       1.1     joerg 	char c;
    344       1.1     joerg 
    345       1.1     joerg 	for (i = 0; i < len / 2; i++) {
    346       1.1     joerg 		c = str[i];
    347       1.1     joerg 		str[i] = str[len - i - 1];
    348       1.1     joerg 		str[len - i - 1] = c;
    349       1.1     joerg 	}
    350       1.1     joerg }
    351