Home | History | Annotate | Line # | Download | only in makemandb
custom_apropos_tokenizer.c revision 1.2
      1  1.2  abhinav /*	$NetBSD: custom_apropos_tokenizer.c,v 1.2 2017/10/31 10:14:27 abhinav Exp $	*/
      2  1.1  abhinav /*
      3  1.1  abhinav ** 2006 September 30
      4  1.1  abhinav **
      5  1.1  abhinav ** The author disclaims copyright to this source code.  In place of
      6  1.1  abhinav ** a legal notice, here is a blessing:
      7  1.1  abhinav **
      8  1.1  abhinav **    May you do good and not evil.
      9  1.1  abhinav **    May you find forgiveness for yourself and forgive others.
     10  1.1  abhinav **    May you share freely, never taking more than you give.
     11  1.1  abhinav **
     12  1.1  abhinav *************************************************************************
     13  1.1  abhinav ** Implementation of the full-text-search tokenizer that implements
     14  1.1  abhinav ** a Porter stemmer.
     15  1.1  abhinav */
     16  1.1  abhinav 
     17  1.1  abhinav /*
     18  1.1  abhinav ** The code in this file is only compiled if:
     19  1.1  abhinav **
     20  1.1  abhinav **     * The FTS3 module is being built as an extension
     21  1.1  abhinav **       (in which case SQLITE_CORE is not defined), or
     22  1.1  abhinav **
     23  1.1  abhinav **     * The FTS3 module is being built into the core of
     24  1.1  abhinav **       SQLite (in which case SQLITE_ENABLE_FTS3 is defined).
     25  1.1  abhinav */
     26  1.1  abhinav 
     27  1.1  abhinav #include <assert.h>
     28  1.1  abhinav #include <ctype.h>
     29  1.1  abhinav #include <stdlib.h>
     30  1.1  abhinav #include <stdio.h>
     31  1.1  abhinav #include <string.h>
     32  1.1  abhinav 
     33  1.1  abhinav #include "custom_apropos_tokenizer.h"
     34  1.1  abhinav #include "fts3_tokenizer.h"
     35  1.1  abhinav #include "nostem.c"
     36  1.1  abhinav 
/*
 * Class derived from sqlite3_tokenizer.
 *
 * The tokenizer keeps no state of its own.  `base' is the only member,
 * so the pointer handed out by aproposPorterCreate() (&t->base) is the
 * same address that aproposPorterDestroy() later passes to free().
 */
typedef struct custom_apropos_tokenizer {
	sqlite3_tokenizer base;	/* Base class */
} custom_apropos_tokenizer;
     43  1.1  abhinav 
/*
 * Class derived from sqlite3_tokenizer_cursor.
 *
 * One cursor walks one input string.  `base' is the first member
 * because the cursor callbacks in this file cast the
 * sqlite3_tokenizer_cursor pointer they are given straight back to this
 * type.  zInput is stored as passed to aproposPorterOpen() (it is not
 * copied and is not freed by aproposPorterClose()); zToken is owned by
 * the cursor and released when the cursor is closed.
 */
typedef struct custom_apropos_tokenizer_cursor {
	sqlite3_tokenizer_cursor base;
	const char *zInput;	/* input we are tokenizing */
	size_t nInput;		/* size of the input */
	size_t iOffset;		/* current position in zInput */
	size_t iToken;		/* index of next token to be returned */
	char *zToken;		/* storage for current token */
	size_t nAllocated;		/* space allocated to zToken buffer */
} custom_apropos_tokenizer_cursor;
     56  1.1  abhinav 
     57  1.1  abhinav /*
     58  1.1  abhinav  * Create a new tokenizer instance.
     59  1.1  abhinav  */
     60  1.1  abhinav static int
     61  1.1  abhinav aproposPorterCreate(int argc, const char *const * argv,
     62  1.1  abhinav     sqlite3_tokenizer ** ppTokenizer)
     63  1.1  abhinav {
     64  1.1  abhinav 	custom_apropos_tokenizer *t;
     65  1.1  abhinav 	t = calloc(1, sizeof(*t));
     66  1.1  abhinav 	if (t == NULL)
     67  1.1  abhinav 		return SQLITE_NOMEM;
     68  1.1  abhinav 	*ppTokenizer = &t->base;
     69  1.1  abhinav 	return SQLITE_OK;
     70  1.1  abhinav }
     71  1.1  abhinav 
     72  1.1  abhinav /*
     73  1.1  abhinav  * Destroy a tokenizer
     74  1.1  abhinav  */
     75  1.1  abhinav static int
     76  1.1  abhinav aproposPorterDestroy(sqlite3_tokenizer * pTokenizer)
     77  1.1  abhinav {
     78  1.1  abhinav 	free(pTokenizer);
     79  1.1  abhinav 	return SQLITE_OK;
     80  1.1  abhinav }
     81  1.1  abhinav 
     82  1.1  abhinav /*
     83  1.1  abhinav  * Prepare to begin tokenizing a particular string.  The input
     84  1.1  abhinav  * string to be tokenized is zInput[0..nInput-1].  A cursor
     85  1.1  abhinav  * used to incrementally tokenize this string is returned in
     86  1.1  abhinav  * *ppCursor.
     87  1.1  abhinav  */
     88  1.1  abhinav static int
     89  1.1  abhinav aproposPorterOpen(
     90  1.1  abhinav     sqlite3_tokenizer * pTokenizer,	/* The tokenizer */
     91  1.1  abhinav     const char *zInput, int nInput,	/* String to be tokenized */
     92  1.1  abhinav     sqlite3_tokenizer_cursor ** ppCursor	/* OUT: Tokenization cursor */
     93  1.1  abhinav )
     94  1.1  abhinav {
     95  1.1  abhinav 	custom_apropos_tokenizer_cursor *c;
     96  1.1  abhinav 
     97  1.1  abhinav 	c = calloc(1, sizeof(*c));
     98  1.1  abhinav 	if (c == NULL)
     99  1.1  abhinav 		return SQLITE_NOMEM;
    100  1.1  abhinav 
    101  1.1  abhinav 	c->zInput = zInput;
    102  1.1  abhinav 	if (zInput != 0) {
    103  1.1  abhinav 		if (nInput < 0)
    104  1.1  abhinav 			c->nInput = strlen(zInput);
    105  1.1  abhinav 		else
    106  1.1  abhinav 			c->nInput = nInput;
    107  1.1  abhinav 	}
    108  1.1  abhinav 
    109  1.1  abhinav 	*ppCursor = &c->base;
    110  1.1  abhinav 	return SQLITE_OK;
    111  1.1  abhinav }
    112  1.1  abhinav 
    113  1.1  abhinav /*
    114  1.1  abhinav  * Close a tokenization cursor previously opened by a call to
    115  1.1  abhinav  * aproposPorterOpen() above.
    116  1.1  abhinav  */
    117  1.1  abhinav static int
    118  1.1  abhinav aproposPorterClose(sqlite3_tokenizer_cursor *pCursor)
    119  1.1  abhinav {
    120  1.1  abhinav 	custom_apropos_tokenizer_cursor *c = (custom_apropos_tokenizer_cursor *) pCursor;
    121  1.1  abhinav 	free(c->zToken);
    122  1.1  abhinav 	free(c);
    123  1.1  abhinav 	return SQLITE_OK;
    124  1.1  abhinav }
    125  1.1  abhinav 
/*
 * Letter classes for 'a'..'z':
 *   0 = vowel, 1 = consonant, 2 = 'y' (depends on its neighbour).
 */
static const char cType[] = {
	/* a  b  c  d  e  f  g  h  i  j  k  l  m */
	   0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1,
	/* n  o  p  q  r  s  t  u  v  w  x  y  z */
	   1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 2, 1
};

/*
 * isConsonant() and isVowel() determine whether the first character of
 * the string they point to is a consonant or a vowel, according to the
 * Porter rules.
 *
 * A consonant is any letter other than 'a', 'e', 'i', 'o', or 'u'.
 * 'y' is a consonant unless it follows another consonant, in which
 * case it is a vowel.
 *
 * In these routines the letters are in reverse order, so the 'y' rule
 * becomes: 'y' is a consonant unless it is followed by another
 * consonant.  A 'y' that is the last character of the (reversed) string
 * counts as a consonant.
 *
 * Both routines return 0 for the empty string and expect every other
 * character to be a lower-case ASCII letter.
 */
static int isVowel(const char*);

static int
isConsonant(const char *z)
{
	const char letter = z[0];
	int kind;

	if (letter == '\0')
		return 0;
	assert(letter >= 'a' && letter <= 'z');

	kind = cType[letter - 'a'];
	if (kind >= 2) {
		/* 'y': consonant when nothing follows or a vowel follows */
		return z[1] == '\0' || isVowel(z + 1);
	}
	return kind;
}

static int
isVowel(const char *z)
{
	const char letter = z[0];
	int kind;

	if (letter == '\0')
		return 0;
	assert(letter >= 'a' && letter <= 'z');

	kind = cType[letter - 'a'];
	if (kind >= 2) {
		/* 'y': vowel only when a consonant follows */
		return isConsonant(z + 1);
	}
	return 1 - kind;
}
    176  1.1  abhinav 
/*
 * Let any sequence of one or more vowels be represented by V and let
 * C be a sequence of one or more consonants.  Then every word can be
 * represented as:
 *
 *           [C] (VC){m} [V]
 *
 * In prose:  A word is an optional consonant run followed by zero or
 * more vowel-consonant pairs followed by an optional vowel run.  "m" is
 * the number of vowel-consonant pairs.
 *
 * Return true if the m-value for z is 1 or more.  In other words,
 * return true if z contains at least one vowel that is followed
 * by a consonant.
 *
 * In this routine z[] is in reverse order.  So we are really looking
 * for an instance of a consonant followed by a vowel.
 */
static int
m_gt_0(const char *z)
{
	const char *p = z;

	/* optional trailing [V] of the forward word */
	for (; isVowel(p); p++)
		continue;
	if (*p == '\0')
		return 0;

	/* the C of a VC pair; anything left after it must start with V */
	for (; isConsonant(p); p++)
		continue;
	return *p != '\0';
}
    209  1.1  abhinav 
/*
 * Like m_gt_0() above, except that we are looking for a value of m
 * which is exactly 1: the (reversed) word must contain one
 * consonant-run/vowel-run pair and nothing beyond it except an
 * optional final consonant run.
 */
static int
m_eq_1(const char *z)
{
	const char *p = z;

	/* optional trailing [V] */
	for (; isVowel(p); p++)
		continue;
	if (*p == '\0')
		return 0;

	/* C of the first VC pair */
	for (; isConsonant(p); p++)
		continue;
	if (*p == '\0')
		return 0;

	/* V of the first VC pair */
	for (; isVowel(p); p++)
		continue;
	if (*p == '\0')
		return 1;

	/* leading [C]; anything after it would start a second pair */
	for (; isConsonant(p); p++)
		continue;
	return *p == '\0';
}
    236  1.1  abhinav 
/*
 * Like m_gt_0() above, except that we are looking for a value of m>1
 * instead of m>0: after one complete vowel-consonant pair has been
 * consumed, a second one must begin.
 */
static int
m_gt_1(const char *z)
{
	const char *p = z;

	/* optional trailing [V] */
	for (; isVowel(p); p++)
		continue;
	if (*p == '\0')
		return 0;

	/* C of the first VC pair */
	for (; isConsonant(p); p++)
		continue;
	if (*p == '\0')
		return 0;

	/* V of the first VC pair */
	for (; isVowel(p); p++)
		continue;
	if (*p == '\0')
		return 0;

	/* C of the second VC pair; a vowel must follow to complete it */
	for (; isConsonant(p); p++)
		continue;
	return *p != '\0';
}
    263  1.1  abhinav 
/*
 * Return TRUE if there is a vowel anywhere within the NUL-terminated
 * string z.
 */
static int
hasVowel(const char *z)
{
	const char *p;

	/* Stop at the first character that is not a consonant. */
	for (p = z; isConsonant(p); p++)
		continue;
	return *p != '\0';
}
    275  1.1  abhinav 
/*
 * Return TRUE if the word ends in a double consonant.
 *
 * The text is reversed here, so we are really looking at
 * the first two characters of z[].
 */
static int
doubleConsonant(const char *z)
{
	if (!isConsonant(z))
		return 0;
	return z[1] == z[0];
}
    287  1.1  abhinav 
/*
 * Return TRUE if the word ends with three letters which
 * are consonant-vowel-consonant and where the final consonant
 * is not 'w', 'x', or 'y'.
 *
 * The word is reversed here.  So we are really checking the
 * first three letters and the first one cannot be in [wxy].
 */
static int
star_oh(const char *z)
{
	if (!isConsonant(z))
		return 0;
	if (z[0] == 'w' || z[0] == 'x' || z[0] == 'y')
		return 0;
	if (!isVowel(z + 1))
		return 0;
	return isConsonant(z + 2) != 0;
}
    304  1.1  abhinav 
/*
 * If the word ends with zFrom and xCond() is true for the stem
 * of the word that precedes the zFrom ending, then change the
 * ending to zTo.  A NULL xCond means "no condition".
 *
 * The input word *pz and zFrom are both in reverse order.  zTo
 * is in normal order.  The replacement is written immediately in
 * front of the stem, so when zTo is longer than zFrom the bytes just
 * before *pz are overwritten; on substitution *pz is updated to the new
 * start of the word.
 *
 * Return TRUE if zFrom matches.  Return FALSE if zFrom does not
 * match.  Note that TRUE is returned even if xCond() fails and
 * no substitution occurs.
 */
static int
stem(
    char **pz,			/* The word being stemmed (Reversed) */
    const char *zFrom,		/* If the ending matches this... (Reversed) */
    const char *zTo,		/* ... change the ending to this (not reversed) */
    int (*xCond) (const char *)	/* Condition that must be true */
)
{
	char *cursor = *pz;
	const char *suffix;
	const char *repl;

	/* Every byte of zFrom must match the front of the reversed word. */
	for (suffix = zFrom; *suffix != '\0'; suffix++, cursor++) {
		if (*suffix != *cursor)
			return 0;
	}

	/* cursor now addresses the stem that remains once zFrom is removed. */
	if (xCond != NULL && !xCond(cursor))
		return 1;

	/* Prepend zTo; writing it front-to-back while stepping backwards
	 * stores it reversed, matching the rest of the word. */
	for (repl = zTo; *repl != '\0'; repl++) {
		cursor--;
		*cursor = *repl;
	}
	*pz = cursor;
	return 1;
}
    340  1.1  abhinav 
/*
 * This is the fallback stemmer used when the porter stemmer is
 * inappropriate.  The input word zIn[0..nIn-1] is copied into zOut with
 * US-ASCII case folding.  If the input word is too long (more
 * than 20 bytes if it contains no digits or more than 6 bytes if
 * it contains digits) then the word is truncated to 20 or 6 bytes
 * by taking 10 or 3 bytes from the beginning and end.
 *
 * zOut is NUL-terminated and the resulting length (without the
 * terminator) is stored in *pnOut.  All nIn bytes are written to zOut
 * before any truncation takes place.
 */
static void
copy_stemmer(const char *zIn, size_t nIn, char *zOut, size_t *pnOut)
{
	size_t keep, len, k;
	int sawDigit = 0;

	/* Fold case into zOut, remembering whether a digit was seen. */
	for (k = 0; k < nIn; k++) {
		char ch = zIn[k];

		if (ch >= '0' && ch <= '9')
			sawDigit = 1;
		else if (ch >= 'A' && ch <= 'Z')
			ch = ch - 'A' + 'a';
		zOut[k] = ch;
	}

	keep = sawDigit ? 3 : 10;
	len = nIn;
	if (nIn > 2 * keep) {
		/* Slide the last `keep' bytes down behind the first `keep'. */
		memmove(zOut + keep, zOut + (nIn - keep), keep);
		len = 2 * keep;
	}

	zOut[len] = 0;
	*pnOut = len;
}
    374  1.1  abhinav 
    375  1.1  abhinav 
/*
 * Stem the input word zIn[0..nIn-1].  Store the output in zOut.
 * zOut must be big enough to hold nIn bytes plus a '\0' terminator.
 * Write the actual size of the output word (exclusive of the '\0'
 * terminator) into *pnOut.
 *
 * Any upper-case characters in the US-ASCII character set ([A-Z])
 * are converted to lower case.  Upper-case UTF characters are
 * unchanged.
 *
 * Words shorter than 3 bytes or longer than 20 bytes are not stemmed;
 * they are handed to copy_stemmer(), which keeps a few bytes from the
 * beginning and the end of long words.  If the word contains digits,
 * 3 bytes are taken from the beginning and 3 bytes from the end.  For
 * long words without digits, 10 bytes are taken from each end.
 * US-ASCII case folding still applies.
 *
 * If the input word contains any character not in [a-zA-Z] then no
 * stemming is attempted and this routine just copies the input into
 * the output (via copy_stemmer()) with US-ASCII case folding.
 *
 * Stemming never increases the length of the word.  So there is
 * no chance of overflowing the zOut buffer.
 */
static void
porter_stemmer(const char *zIn, size_t nIn, char *zOut, size_t *pnOut)
{
	size_t i, j;
	/*
	 * Work buffer holding the word in REVERSE order.  The last letter
	 * of the input lands at index sizeof(zReverse)-6 and the word grows
	 * towards index 0.  Since nIn is at most 20 here, the lowest index
	 * the word itself can occupy is 3, which leaves a few bytes in
	 * front for suffix replacements that prepend characters.
	 */
	char zReverse[28];
	char *z, *z2;
	if (nIn < 3 || nIn >= sizeof(zReverse) - 7) {
		/* The word is too big or too small for the porter stemmer.
		 * Fallback to the copy stemmer
		 */
		copy_stemmer(zIn, nIn, zOut, pnOut);
		return;
	}

	/* Copy the word backwards into zReverse, folding [A-Z] to [a-z]. */
	for (i = 0, j = sizeof(zReverse) - 6; i < nIn; i++, j--) {
		char c = zIn[i];
		if (c >= 'A' && c <= 'Z') {
			zReverse[j] = c + 'a' - 'A';
		} else if (c >= 'a' && c <= 'z') {
			zReverse[j] = c;
		} else {
			/* The use of a character not in [a-zA-Z] means that
			 * we fallback to the copy stemmer
			 */
			copy_stemmer(zIn, nIn, zOut, pnOut);
			return;
		}
	}
	/* Zero the five trailing bytes: this NUL-terminates the reversed
	 * word.  The steps below read as far as z[3] without checking the
	 * word length first.
	 */
	memset(&zReverse[sizeof(zReverse) - 5], 0, 5);
	/* j was decremented once past the last byte written. */
	z = &zReverse[j + 1];


	/* Step 1a: "sses" -> "ss", "ies" -> "i", "ss" stays, otherwise a
	 * trailing "s" is dropped.
	 */
	if (z[0] == 's') {
		if (
		    !stem(&z, "sess", "ss", 0) &&
		    !stem(&z, "sei", "i", 0) &&
		    !stem(&z, "ss", "ss", 0)
		    ) {
			z++;
		}
	}
	/* Step 1b: "eed" -> "ee" when m>0; otherwise strip "ing" / "ed"
	 * when the remaining stem contains a vowel.  z2 remembers the
	 * start so that we can tell whether anything was actually removed
	 * (stem() also returns true when only its condition failed).
	 */
	z2 = z;
	if (stem(&z, "dee", "ee", m_gt_0)) {
		/* Do nothing.  The work was all in the test */
	} else if (
		    (stem(&z, "gni", "", hasVowel) || stem(&z, "de", "", hasVowel))
		    && z != z2
	    ) {
		/* Tidy up after removing the suffix: "at" -> "ate",
		 * "bl" -> "ble", "iz" -> "ize"; undouble a final double
		 * consonant other than l, s, z; or add an 'e' to a short
		 * (m == 1, consonant-vowel-consonant) stem.
		 */
		if (stem(&z, "ta", "ate", 0) ||
		    stem(&z, "lb", "ble", 0) ||
		    stem(&z, "zi", "ize", 0)) {
			/* Do nothing.  The work was all in the test */
		} else if (doubleConsonant(z) && (*z != 'l' && *z != 's' && *z != 'z')) {
			z++;
		} else if (m_eq_1(z) && star_oh(z)) {
			*(--z) = 'e';
		}
	}
	/* Step 1c: a final 'y' becomes 'i' when the stem has a vowel. */
	if (z[0] == 'y' && hasVowel(z + 1)) {
		z[0] = 'i';
	}
	/* Step 2: dispatch on the second-to-last letter of the word (z[1]
	 * in the reversed buffer) and map double suffixes to single ones
	 * when m>0.
	 */
	switch (z[1]) {
	case 'a':
		if (!stem(&z, "lanoita", "ate", m_gt_0)) {
			stem(&z, "lanoit", "tion", m_gt_0);
		}
		break;
	case 'c':
		if (!stem(&z, "icne", "ence", m_gt_0)) {
			stem(&z, "icna", "ance", m_gt_0);
		}
		break;
	case 'e':
		stem(&z, "rezi", "ize", m_gt_0);
		break;
	case 'g':
		stem(&z, "igol", "log", m_gt_0);
		break;
	case 'l':
		if (!stem(&z, "ilb", "ble", m_gt_0)
		    && !stem(&z, "illa", "al", m_gt_0)
		    && !stem(&z, "iltne", "ent", m_gt_0)
		    && !stem(&z, "ile", "e", m_gt_0)
		    ) {
			stem(&z, "ilsuo", "ous", m_gt_0);
		}
		break;
	case 'o':
		if (!stem(&z, "noitazi", "ize", m_gt_0)
		    && !stem(&z, "noita", "ate", m_gt_0)
		    ) {
			stem(&z, "rota", "ate", m_gt_0);
		}
		break;
	case 's':
		if (!stem(&z, "msila", "al", m_gt_0)
		    && !stem(&z, "ssenevi", "ive", m_gt_0)
		    && !stem(&z, "ssenluf", "ful", m_gt_0)
		    ) {
			stem(&z, "ssensuo", "ous", m_gt_0);
		}
		break;
	case 't':
		if (!stem(&z, "itila", "al", m_gt_0)
		    && !stem(&z, "itivi", "ive", m_gt_0)
		    ) {
			stem(&z, "itilib", "ble", m_gt_0);
		}
		break;
	}

	/* Step 3: dispatch on the last letter of the word (z[0]) and
	 * shorten or remove further suffixes when m>0.
	 */
	switch (z[0]) {
	case 'e':
		if (!stem(&z, "etaci", "ic", m_gt_0)
		    && !stem(&z, "evita", "", m_gt_0)
		    ) {
			stem(&z, "ezila", "al", m_gt_0);
		}
		break;
	case 'i':
		stem(&z, "itici", "ic", m_gt_0);
		break;
	case 'l':
		if (!stem(&z, "laci", "ic", m_gt_0)) {
			stem(&z, "luf", "", m_gt_0);
		}
		break;
	case 's':
		stem(&z, "ssen", "", m_gt_0);
		break;
	}

	/* Step 4: dispatch on the second-to-last letter again and strip
	 * the suffix outright when the remaining stem has m>1.  Suffixes
	 * are removed simply by advancing z past them.
	 */
	switch (z[1]) {
	case 'a':
		/* "al" */
		if (z[0] == 'l' && m_gt_1(z + 2)) {
			z += 2;
		}
		break;
	case 'c':
		/* "ance", "ence" */
		if (z[0] == 'e' && z[2] == 'n' && (z[3] == 'a' || z[3] == 'e') && m_gt_1(z + 4)) {
			z += 4;
		}
		break;
	case 'e':
		/* "er" */
		if (z[0] == 'r' && m_gt_1(z + 2)) {
			z += 2;
		}
		break;
	case 'i':
		/* "ic" */
		if (z[0] == 'c' && m_gt_1(z + 2)) {
			z += 2;
		}
		break;
	case 'l':
		/* "able", "ible" */
		if (z[0] == 'e' && z[2] == 'b' && (z[3] == 'a' || z[3] == 'i') && m_gt_1(z + 4)) {
			z += 4;
		}
		break;
	case 'n':
		/* "ant", "ement", "ment", "ent" */
		if (z[0] == 't') {
			if (z[2] == 'a') {
				if (m_gt_1(z + 3)) {
					z += 3;
				}
			} else if (z[2] == 'e') {
				if (!stem(&z, "tneme", "", m_gt_1)
				    && !stem(&z, "tnem", "", m_gt_1)
				    ) {
					stem(&z, "tne", "", m_gt_1);
				}
			}
		}
		break;
	case 'o':
		/* "ou", or "ion" preceded by 's' or 't' */
		if (z[0] == 'u') {
			if (m_gt_1(z + 2)) {
				z += 2;
			}
		} else if (z[3] == 's' || z[3] == 't') {
			stem(&z, "noi", "", m_gt_1);
		}
		break;
	case 's':
		/* "ism" */
		if (z[0] == 'm' && z[2] == 'i' && m_gt_1(z + 3)) {
			z += 3;
		}
		break;
	case 't':
		/* "ate", "iti" */
		if (!stem(&z, "eta", "", m_gt_1)) {
			stem(&z, "iti", "", m_gt_1);
		}
		break;
	case 'u':
		/* "ous" */
		if (z[0] == 's' && z[2] == 'o' && m_gt_1(z + 3)) {
			z += 3;
		}
		break;
	case 'v':
	case 'z':
		/* "ive", "ize" */
		if (z[0] == 'e' && z[2] == 'i' && m_gt_1(z + 3)) {
			z += 3;
		}
		break;
	}

	/* Step 5a: drop a final 'e' when m>1, or when m==1 and the stem
	 * does not end consonant-vowel-consonant.
	 */
	if (z[0] == 'e') {
		if (m_gt_1(z + 1)) {
			z++;
		} else if (m_eq_1(z + 1) && !star_oh(z + 1)) {
			z++;
		}
	}
	/* Step 5b: reduce a final "ll" to "l" when m>1. */
	if (m_gt_1(z) && z[0] == 'l' && z[1] == 'l') {
		z++;
	}
	/* z[] is now the stemmed word in reverse order.  Flip it back
	 * around into forward order and return.
	 */
	*pnOut = i = strlen(z);
	zOut[i] = 0;
	while (*z) {
		zOut[--i] = *(z++);
	}
}
    631  1.1  abhinav 
    632  1.1  abhinav /*
    633  1.1  abhinav  * Based on whether the input word is in the nostem list or not
    634  1.1  abhinav  * call porter stemmer to stem it, or call copy_stemmer to keep it
    635  1.1  abhinav  * as it is (copy_stemmer converts simply converts it to lower case)
    636  1.1  abhinav  * Returns  SQLITE_OK if stemming is successful, an error code for
    637  1.1  abhinav  * any errors
    638  1.1  abhinav  */
    639  1.1  abhinav static int
    640  1.1  abhinav do_stem(const char *zIn, size_t nIn, char *zOut, size_t *pnOut)
    641  1.1  abhinav {
    642  1.1  abhinav 	/* Before looking up the word in the hash table, convert it to lower-case */
    643  1.1  abhinav 	char *dupword = malloc(nIn);
    644  1.1  abhinav 	if (dupword == NULL)
    645  1.1  abhinav 		return SQLITE_NOMEM;
    646  1.1  abhinav 
    647  1.1  abhinav 	for (size_t i = 0; i < nIn; i++)
    648  1.1  abhinav 		dupword[i] = tolower((unsigned char) zIn[i]);
    649  1.1  abhinav 
    650  1.1  abhinav 	size_t idx = nostem_hash(dupword, nIn);
    651  1.1  abhinav 	if (strncmp(nostem[idx], dupword, nIn) == 0 && nostem[idx][nIn] == 0)
    652  1.1  abhinav 		copy_stemmer(zIn, nIn, zOut, pnOut);
    653  1.1  abhinav 	else
    654  1.1  abhinav 		porter_stemmer(zIn, nIn, zOut, pnOut);
    655  1.1  abhinav 
    656  1.1  abhinav 	free(dupword);
    657  1.1  abhinav 	return SQLITE_OK;
    658  1.1  abhinav }
    659  1.1  abhinav 
    660  1.1  abhinav 
/*
 * Characters that can be part of a token.  The table is indexed by
 * (character - 0x30) and covers 0x30..0x7f; a 1 marks a token
 * character.  As written, only the ASCII digits and letters are
 * marked; every other byte below 0x80 (including '_') is a delimiter.
 *
 * We assume any character whose value is 0x80 or greater (any byte of
 * a multi-byte UTF character) can be part of a token.  In other words,
 * delimiters all must have values of 0x7f or lower.
 */
static const char porterIdChar[] = {
/* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,	/* 3x */
	0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	/* 4x */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,	/* 5x */
	0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	/* 6x */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,	/* 7x */
};

/*
 * True if C is a delimiter: bit 0x80 is clear and C is either below
 * 0x30 or not marked in porterIdChar[].
 *
 * NOTE: the macro assigns C to a variable named `ch' that must already
 * be declared in the scope where the macro is expanded.
 */
#define isDelim(C) (((ch=C)&0x80)==0 && (ch<0x30 || !porterIdChar[ch-0x30]))
    677  1.1  abhinav 
    678  1.1  abhinav /*
    679  1.1  abhinav  * Extract the next token from a tokenization cursor.  The cursor must
    680  1.1  abhinav  * have been opened by a prior call to aproposPorterOpen().
    681  1.1  abhinav  */
    682  1.1  abhinav static int
    683  1.1  abhinav aproposPorterNext(
    684  1.1  abhinav     sqlite3_tokenizer_cursor *pCursor,	/* Cursor returned by aproposPorterOpen */
    685  1.1  abhinav     const char **pzToken,	/* OUT: *pzToken is the token text */
    686  1.1  abhinav     int *pnBytes,		/* OUT: Number of bytes in token */
    687  1.1  abhinav     int *piStartOffset,		/* OUT: Starting offset of token */
    688  1.1  abhinav     int *piEndOffset,		/* OUT: Ending offset of token */
    689  1.1  abhinav     int *piPosition		/* OUT: Position integer of token */
    690  1.1  abhinav )
    691  1.1  abhinav {
    692  1.1  abhinav 	custom_apropos_tokenizer_cursor *c = (custom_apropos_tokenizer_cursor *) pCursor;
    693  1.1  abhinav 	const char *z = c->zInput;
    694  1.1  abhinav 
    695  1.1  abhinav 	while (c->iOffset < c->nInput) {
    696  1.1  abhinav 		size_t iStartOffset, ch;
    697  1.1  abhinav 
    698  1.1  abhinav 		/* Scan past delimiter characters */
    699  1.1  abhinav 		while (c->iOffset < c->nInput && isDelim(z[c->iOffset])) {
    700  1.1  abhinav 			c->iOffset++;
    701  1.1  abhinav 		}
    702  1.1  abhinav 
    703  1.1  abhinav 		/* Count non-delimiter characters. */
    704  1.1  abhinav 		iStartOffset = c->iOffset;
    705  1.1  abhinav 		while (c->iOffset < c->nInput && !isDelim(z[c->iOffset])) {
    706  1.1  abhinav 			c->iOffset++;
    707  1.1  abhinav 		}
    708  1.1  abhinav 
    709  1.1  abhinav 		if (c->iOffset > iStartOffset) {
    710  1.1  abhinav 			size_t n = c->iOffset - iStartOffset;
    711  1.1  abhinav 			if (n > c->nAllocated) {
    712  1.1  abhinav 				char *pNew;
    713  1.1  abhinav 				c->nAllocated = n + 20;
    714  1.1  abhinav 				pNew = realloc(c->zToken, c->nAllocated);
    715  1.1  abhinav 				if (!pNew)
    716  1.1  abhinav 					return SQLITE_NOMEM;
    717  1.1  abhinav 				c->zToken = pNew;
    718  1.1  abhinav 			}
    719  1.2  abhinav 
    720  1.2  abhinav 			size_t temp;
    721  1.2  abhinav 			int stemStatus = do_stem(&z[iStartOffset], n, c->zToken, &temp);
    722  1.2  abhinav 			*pnBytes = temp;
    723  1.1  abhinav 			if (stemStatus != SQLITE_OK)
    724  1.1  abhinav 				return stemStatus;
    725  1.1  abhinav 
    726  1.1  abhinav 			*pzToken = c->zToken;
    727  1.1  abhinav 			*piStartOffset = iStartOffset;
    728  1.1  abhinav 			*piEndOffset = c->iOffset;
    729  1.1  abhinav 			*piPosition = c->iToken++;
    730  1.1  abhinav 			return SQLITE_OK;
    731  1.1  abhinav 		}
    732  1.1  abhinav 	}
    733  1.1  abhinav 	return SQLITE_DONE;
    734  1.1  abhinav }
    735  1.1  abhinav 
/*
 * The set of routines that implement the porter-stemmer tokenizer.
 *
 * The five function pointers are the create / destroy / open / close /
 * next callbacks defined in this file, in that order.  The leading and
 * trailing 0 entries are left unset (presumably the interface version
 * and an optional language-id hook -- confirm against the
 * sqlite3_tokenizer_module declaration in fts3_tokenizer.h).
 */
static const sqlite3_tokenizer_module aproposPorterTokenizerModule = {
	0,
	aproposPorterCreate,
	aproposPorterDestroy,
	aproposPorterOpen,
	aproposPorterClose,
	aproposPorterNext,
	0
};
    748  1.1  abhinav 
    749  1.1  abhinav /*
    750  1.1  abhinav  * Allocate a new porter tokenizer.  Return a pointer to the new
    751  1.1  abhinav  * tokenizer in *ppModule
    752  1.1  abhinav  */
    753  1.1  abhinav void
    754  1.1  abhinav get_custom_apropos_tokenizer(sqlite3_tokenizer_module const ** ppModule)
    755  1.1  abhinav {
    756  1.1  abhinav 	*ppModule = &aproposPorterTokenizerModule;
    757  1.1  abhinav }
    758