/*	$NetBSD: custom_apropos_tokenizer.c,v 1.6 2023/08/07 20:35:21 tnn Exp $	*/
/*
** 2006 September 30
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
*************************************************************************
** Implementation of the full-text-search tokenizer that implements
** a Porter stemmer.
*/

/*
** The code in this file is only compiled if:
**
**     * The FTS3 module is being built as an extension
**       (in which case SQLITE_CORE is not defined), or
**
**     * The FTS3 module is being built into the core of
**       SQLite (in which case SQLITE_ENABLE_FTS3 is defined).
*/
25 1.1 abhinav */ 26 1.1 abhinav 27 1.1 abhinav #include <assert.h> 28 1.1 abhinav #include <ctype.h> 29 1.1 abhinav #include <stdlib.h> 30 1.1 abhinav #include <stdio.h> 31 1.1 abhinav #include <string.h> 32 1.1 abhinav 33 1.1 abhinav #include "custom_apropos_tokenizer.h" 34 1.1 abhinav #include "fts3_tokenizer.h" 35 1.1 abhinav #include "nostem.c" 36 1.1 abhinav 37 1.1 abhinav /* 38 1.1 abhinav * Class derived from sqlite3_tokenizer 39 1.1 abhinav */ 40 1.1 abhinav typedef struct custom_apropos_tokenizer { 41 1.1 abhinav sqlite3_tokenizer base; /* Base class */ 42 1.1 abhinav } custom_apropos_tokenizer; 43 1.1 abhinav 44 1.1 abhinav /* 45 1.1 abhinav * Class derived from sqlite3_tokenizer_cursor 46 1.1 abhinav */ 47 1.1 abhinav typedef struct custom_apropos_tokenizer_cursor { 48 1.1 abhinav sqlite3_tokenizer_cursor base; 49 1.1 abhinav const char *zInput; /* input we are tokenizing */ 50 1.1 abhinav size_t nInput; /* size of the input */ 51 1.1 abhinav size_t iOffset; /* current position in zInput */ 52 1.1 abhinav size_t iToken; /* index of next token to be returned */ 53 1.1 abhinav char *zToken; /* storage for current token */ 54 1.1 abhinav size_t nAllocated; /* space allocated to zToken buffer */ 55 1.1 abhinav } custom_apropos_tokenizer_cursor; 56 1.1 abhinav 57 1.1 abhinav /* 58 1.1 abhinav * Create a new tokenizer instance. 
59 1.1 abhinav */ 60 1.1 abhinav static int 61 1.1 abhinav aproposPorterCreate(int argc, const char *const * argv, 62 1.1 abhinav sqlite3_tokenizer ** ppTokenizer) 63 1.1 abhinav { 64 1.1 abhinav custom_apropos_tokenizer *t; 65 1.1 abhinav t = calloc(1, sizeof(*t)); 66 1.1 abhinav if (t == NULL) 67 1.1 abhinav return SQLITE_NOMEM; 68 1.1 abhinav *ppTokenizer = &t->base; 69 1.1 abhinav return SQLITE_OK; 70 1.1 abhinav } 71 1.1 abhinav 72 1.1 abhinav /* 73 1.1 abhinav * Destroy a tokenizer 74 1.1 abhinav */ 75 1.5 rin static int 76 1.1 abhinav aproposPorterDestroy(sqlite3_tokenizer * pTokenizer) 77 1.1 abhinav { 78 1.1 abhinav free(pTokenizer); 79 1.1 abhinav return SQLITE_OK; 80 1.1 abhinav } 81 1.1 abhinav 82 1.1 abhinav /* 83 1.1 abhinav * Prepare to begin tokenizing a particular string. The input 84 1.1 abhinav * string to be tokenized is zInput[0..nInput-1]. A cursor 85 1.5 rin * used to incrementally tokenize this string is returned in 86 1.1 abhinav * *ppCursor. 87 1.1 abhinav */ 88 1.5 rin static int 89 1.1 abhinav aproposPorterOpen( 90 1.1 abhinav sqlite3_tokenizer * pTokenizer, /* The tokenizer */ 91 1.1 abhinav const char *zInput, int nInput, /* String to be tokenized */ 92 1.1 abhinav sqlite3_tokenizer_cursor ** ppCursor /* OUT: Tokenization cursor */ 93 1.1 abhinav ) 94 1.1 abhinav { 95 1.1 abhinav custom_apropos_tokenizer_cursor *c; 96 1.1 abhinav 97 1.1 abhinav c = calloc(1, sizeof(*c)); 98 1.1 abhinav if (c == NULL) 99 1.1 abhinav return SQLITE_NOMEM; 100 1.1 abhinav 101 1.1 abhinav c->zInput = zInput; 102 1.1 abhinav if (zInput != 0) { 103 1.1 abhinav if (nInput < 0) 104 1.1 abhinav c->nInput = strlen(zInput); 105 1.1 abhinav else 106 1.1 abhinav c->nInput = nInput; 107 1.1 abhinav } 108 1.1 abhinav 109 1.1 abhinav *ppCursor = &c->base; 110 1.1 abhinav return SQLITE_OK; 111 1.1 abhinav } 112 1.1 abhinav 113 1.1 abhinav /* 114 1.1 abhinav * Close a tokenization cursor previously opened by a call to 115 1.1 abhinav * aproposPorterOpen() above. 
116 1.1 abhinav */ 117 1.5 rin static int 118 1.1 abhinav aproposPorterClose(sqlite3_tokenizer_cursor *pCursor) 119 1.1 abhinav { 120 1.1 abhinav custom_apropos_tokenizer_cursor *c = (custom_apropos_tokenizer_cursor *) pCursor; 121 1.1 abhinav free(c->zToken); 122 1.1 abhinav free(c); 123 1.1 abhinav return SQLITE_OK; 124 1.1 abhinav } 125 1.1 abhinav 126 1.1 abhinav /* 127 1.1 abhinav * Vowel or consonant 128 1.1 abhinav */ 129 1.1 abhinav static const char cType[] = { 130 1.1 abhinav 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 131 1.1 abhinav 1, 1, 1, 2, 1 132 1.1 abhinav }; 133 1.1 abhinav 134 1.1 abhinav /* 135 1.1 abhinav * isConsonant() and isVowel() determine if their first character in 136 1.1 abhinav * the string they point to is a consonant or a vowel, according 137 1.5 rin * to Porter ruls. 138 1.1 abhinav * 139 1.1 abhinav * A consonate is any letter other than 'a', 'e', 'i', 'o', or 'u'. 140 1.1 abhinav * 'Y' is a consonant unless it follows another consonant, 141 1.1 abhinav * in which case it is a vowel. 142 1.1 abhinav * 143 1.1 abhinav * In these routine, the letters are in reverse order. So the 'y' rule 144 1.1 abhinav * is that 'y' is a consonant unless it is followed by another 145 1.1 abhinav * consonent. 
146 1.1 abhinav */ 147 1.1 abhinav static int isVowel(const char*); 148 1.1 abhinav 149 1.5 rin static int 150 1.1 abhinav isConsonant(const char *z) 151 1.1 abhinav { 152 1.1 abhinav int j; 153 1.1 abhinav char x = *z; 154 1.1 abhinav if (x == 0) 155 1.1 abhinav return 0; 156 1.1 abhinav assert(x >= 'a' && x <= 'z'); 157 1.1 abhinav j = cType[x - 'a']; 158 1.1 abhinav if (j < 2) 159 1.1 abhinav return j; 160 1.1 abhinav return z[1] == 0 || isVowel(z + 1); 161 1.1 abhinav } 162 1.1 abhinav 163 1.5 rin static int 164 1.1 abhinav isVowel(const char *z) 165 1.1 abhinav { 166 1.1 abhinav int j; 167 1.1 abhinav char x = *z; 168 1.1 abhinav if (x == 0) 169 1.1 abhinav return 0; 170 1.1 abhinav assert(x >= 'a' && x <= 'z'); 171 1.1 abhinav j = cType[x - 'a']; 172 1.1 abhinav if (j < 2) 173 1.1 abhinav return 1 - j; 174 1.1 abhinav return isConsonant(z + 1); 175 1.1 abhinav } 176 1.1 abhinav 177 1.1 abhinav /* 178 1.1 abhinav * Let any sequence of one or more vowels be represented by V and let 179 1.1 abhinav * C be sequence of one or more consonants. Then every word can be 180 1.1 abhinav * represented as: 181 1.1 abhinav * 182 1.1 abhinav * [C] (VC){m} [V] 183 1.1 abhinav * 184 1.1 abhinav * In prose: A word is an optional consonant followed by zero or 185 1.1 abhinav * vowel-consonant pairs followed by an optional vowel. "m" is the 186 1.1 abhinav * number of vowel consonant pairs. This routine computes the value 187 1.1 abhinav * of m for the first i bytes of a word. 188 1.1 abhinav * 189 1.1 abhinav * Return true if the m-value for z is 1 or more. In other words, 190 1.1 abhinav * return true if z contains at least one vowel that is followed 191 1.1 abhinav * by a consonant. 192 1.1 abhinav * 193 1.1 abhinav * In this routine z[] is in reverse order. So we are really looking 194 1.1 abhinav * for an instance of a consonant followed by a vowel. 
/*
 * Let any sequence of one or more vowels be represented by V and let
 * C be a sequence of one or more consonants.  Then every word can be
 * represented as:
 *
 *           [C] (VC){m} [V]
 *
 * In prose: a word is an optional consonant run followed by zero or
 * more vowel-consonant pairs followed by an optional vowel run.  "m" is
 * the number of vowel-consonant pairs.
 *
 * m_gt_0() returns true if the m-value for z is 1 or more.  In other
 * words, it returns true if z contains at least one vowel that is
 * followed by a consonant.
 *
 * In these routines z[] is in reverse order, so we are really looking
 * for an instance of a consonant followed by a vowel.
 */
static int
m_gt_0(const char *z)
{
	for (; isVowel(z); z++)
		continue;
	if (*z == '\0')
		return 0;
	for (; isConsonant(z); z++)
		continue;
	return *z != '\0';
}

/*
 * Like m_gt_0() above, except that we are looking for a value of m
 * which is exactly 1.
 */
static int
m_eq_1(const char *z)
{
	for (; isVowel(z); z++)
		continue;
	if (*z == '\0')
		return 0;
	for (; isConsonant(z); z++)
		continue;
	if (*z == '\0')
		return 0;
	/* One pair has been seen; a second consonant-vowel pair means m > 1. */
	for (; isVowel(z); z++)
		continue;
	if (*z == '\0')
		return 1;
	for (; isConsonant(z); z++)
		continue;
	return *z == '\0';
}

/*
 * Like m_gt_0() above, except that we are looking for a value of m > 1
 * instead of m > 0.
 */
static int
m_gt_1(const char *z)
{
	for (; isVowel(z); z++)
		continue;
	if (*z == '\0')
		return 0;
	for (; isConsonant(z); z++)
		continue;
	if (*z == '\0')
		return 0;
	for (; isVowel(z); z++)
		continue;
	if (*z == '\0')
		return 0;
	for (; isConsonant(z); z++)
		continue;
	return *z != '\0';
}

/*
 * Return TRUE if there is a vowel anywhere within the NUL-terminated
 * string z.
 */
static int
hasVowel(const char *z)
{
	const char *p = z;

	/* Skip the leading consonants; anything left starts with a vowel. */
	while (isConsonant(p))
		p++;
	return *p != '\0';
}

/*
 * Return TRUE if the word ends in a double consonant.
 *
 * The text is reversed here, so we are really looking at
 * the first two characters of z[].
 */
static int
doubleConsonant(const char *z)
{
	if (!isConsonant(z))
		return 0;
	return z[0] == z[1];
}

/*
 * Return TRUE if the word ends with three letters which
 * are consonant-vowel-consonant and where the final consonant
 * is not 'w', 'x', or 'y'.
 *
 * The word is reversed here, so we are really checking the
 * first three letters and the first one cannot be in [wxy].
 */
static int
star_oh(const char *z)
{
	if (!isConsonant(z))
		return 0;

	switch (z[0]) {
	case 'w':
	case 'x':
	case 'y':
		return 0;
	default:
		break;
	}
	return isVowel(z + 1) && isConsonant(z + 2);
}
/*
 * If the word ends with zFrom and xCond() is true for the stem
 * of the word that precedes the zFrom ending, then change the
 * ending to zTo.
 *
 * The input word *pz and zFrom are both in reverse order.  zTo
 * is in normal order.  The replacement is written backwards in front
 * of the stem, so the buffer holding *pz must have room for it there.
 *
 * Return TRUE if zFrom matches.  Return FALSE if zFrom does not
 * match.  Note that TRUE is returned even if xCond() fails and
 * no substitution occurs.
 */
static int
stem(
	char **pz,			/* The word being stemmed (Reversed) */
	const char *zFrom,		/* If the ending matches this... (Reversed) */
	const char *zTo,		/* ... change the ending to this (not reversed) */
	int (*xCond)(const char *)	/* Condition that must be true */
)
{
	char *cur = *pz;
	const char *suffix;

	/* Walk the reversed word and the reversed ending side by side. */
	for (suffix = zFrom; *suffix != '\0'; suffix++, cur++) {
		if (*suffix != *cur)
			return 0;
	}

	/* cur now points at the stem that precedes the ending. */
	if (xCond != NULL && !xCond(cur))
		return 1;

	for (; *zTo != '\0'; zTo++)
		*--cur = *zTo;
	*pz = cur;
	return 1;
}

/*
 * This is the fallback stemmer used when the porter stemmer is
 * inappropriate.  The input word is copied into the output with
 * US-ASCII case folding.  If the input word is too long (more
 * than 20 bytes if it contains no digits or more than 6 bytes if
 * it contains digits) then the word is truncated to 20 or 6 bytes
 * by taking 10 or 3 bytes from the beginning and end.
 *
 * zOut must have room for nIn + 1 bytes.  The length of the result,
 * not counting the terminating NUL, is stored in *pnOut.
 */
static void
copy_stemmer(const char *zIn, size_t nIn, char *zOut, size_t *pnOut)
{
	size_t len, keep, src;
	int sawDigit = 0;

	for (len = 0; len < nIn; len++) {
		char c = zIn[len];

		if (c >= 'A' && c <= 'Z')
			c = c - 'A' + 'a';
		else if (c >= '0' && c <= '9')
			sawDigit = 1;
		zOut[len] = c;
	}

	keep = sawDigit ? 3 : 10;
	if (nIn > keep * 2) {
		/* Keep the head in place and slide the tail up against it. */
		len = keep;
		for (src = nIn - keep; src < nIn; src++)
			zOut[len++] = zOut[src];
	}
	zOut[len] = '\0';
	*pnOut = len;
}
398 1.1 abhinav */ 399 1.5 rin static void 400 1.1 abhinav porter_stemmer(const char *zIn, size_t nIn, char *zOut, size_t *pnOut) 401 1.1 abhinav { 402 1.1 abhinav size_t i, j; 403 1.1 abhinav char zReverse[28]; 404 1.1 abhinav char *z, *z2; 405 1.1 abhinav if (nIn < 3 || nIn >= sizeof(zReverse) - 7) { 406 1.1 abhinav /* The word is too big or too small for the porter stemmer. 407 1.1 abhinav * Fallback to the copy stemmer 408 1.1 abhinav */ 409 1.1 abhinav copy_stemmer(zIn, nIn, zOut, pnOut); 410 1.1 abhinav return; 411 1.1 abhinav } 412 1.1 abhinav 413 1.1 abhinav for (i = 0, j = sizeof(zReverse) - 6; i < nIn; i++, j--) { 414 1.1 abhinav char c = zIn[i]; 415 1.1 abhinav if (c >= 'A' && c <= 'Z') { 416 1.1 abhinav zReverse[j] = c + 'a' - 'A'; 417 1.1 abhinav } else if (c >= 'a' && c <= 'z') { 418 1.1 abhinav zReverse[j] = c; 419 1.1 abhinav } else { 420 1.1 abhinav /* The use of a character not in [a-zA-Z] means that 421 1.1 abhinav * we fallback * to the copy stemmer 422 1.1 abhinav */ 423 1.1 abhinav copy_stemmer(zIn, nIn, zOut, pnOut); 424 1.1 abhinav return; 425 1.1 abhinav } 426 1.1 abhinav } 427 1.1 abhinav memset(&zReverse[sizeof(zReverse) - 5], 0, 5); 428 1.1 abhinav z = &zReverse[j + 1]; 429 1.1 abhinav 430 1.1 abhinav 431 1.1 abhinav /* Step 1a */ 432 1.1 abhinav if (z[0] == 's') { 433 1.1 abhinav if ( 434 1.1 abhinav !stem(&z, "sess", "ss", 0) && 435 1.1 abhinav !stem(&z, "sei", "i", 0) && 436 1.1 abhinav !stem(&z, "ss", "ss", 0) 437 1.1 abhinav ) { 438 1.1 abhinav z++; 439 1.1 abhinav } 440 1.1 abhinav } 441 1.1 abhinav /* Step 1b */ 442 1.1 abhinav z2 = z; 443 1.1 abhinav if (stem(&z, "dee", "ee", m_gt_0)) { 444 1.1 abhinav /* Do nothing. 
The work was all in the test */ 445 1.1 abhinav } else if ( 446 1.1 abhinav (stem(&z, "gni", "", hasVowel) || stem(&z, "de", "", hasVowel)) 447 1.1 abhinav && z != z2 448 1.1 abhinav ) { 449 1.1 abhinav if (stem(&z, "ta", "ate", 0) || 450 1.1 abhinav stem(&z, "lb", "ble", 0) || 451 1.1 abhinav stem(&z, "zi", "ize", 0)) { 452 1.1 abhinav /* Do nothing. The work was all in the test */ 453 1.1 abhinav } else if (doubleConsonant(z) && (*z != 'l' && *z != 's' && *z != 'z')) { 454 1.1 abhinav z++; 455 1.1 abhinav } else if (m_eq_1(z) && star_oh(z)) { 456 1.1 abhinav *(--z) = 'e'; 457 1.1 abhinav } 458 1.1 abhinav } 459 1.1 abhinav /* Step 1c */ 460 1.1 abhinav if (z[0] == 'y' && hasVowel(z + 1)) { 461 1.1 abhinav z[0] = 'i'; 462 1.1 abhinav } 463 1.1 abhinav /* Step 2 */ 464 1.1 abhinav switch (z[1]) { 465 1.1 abhinav case 'a': 466 1.1 abhinav if (!stem(&z, "lanoita", "ate", m_gt_0)) { 467 1.1 abhinav stem(&z, "lanoit", "tion", m_gt_0); 468 1.1 abhinav } 469 1.1 abhinav break; 470 1.1 abhinav case 'c': 471 1.1 abhinav if (!stem(&z, "icne", "ence", m_gt_0)) { 472 1.1 abhinav stem(&z, "icna", "ance", m_gt_0); 473 1.1 abhinav } 474 1.1 abhinav break; 475 1.1 abhinav case 'e': 476 1.1 abhinav stem(&z, "rezi", "ize", m_gt_0); 477 1.1 abhinav break; 478 1.1 abhinav case 'g': 479 1.1 abhinav stem(&z, "igol", "log", m_gt_0); 480 1.1 abhinav break; 481 1.1 abhinav case 'l': 482 1.1 abhinav if (!stem(&z, "ilb", "ble", m_gt_0) 483 1.1 abhinav && !stem(&z, "illa", "al", m_gt_0) 484 1.1 abhinav && !stem(&z, "iltne", "ent", m_gt_0) 485 1.1 abhinav && !stem(&z, "ile", "e", m_gt_0) 486 1.1 abhinav ) { 487 1.1 abhinav stem(&z, "ilsuo", "ous", m_gt_0); 488 1.1 abhinav } 489 1.1 abhinav break; 490 1.1 abhinav case 'o': 491 1.1 abhinav if (!stem(&z, "noitazi", "ize", m_gt_0) 492 1.1 abhinav && !stem(&z, "noita", "ate", m_gt_0) 493 1.1 abhinav ) { 494 1.1 abhinav stem(&z, "rota", "ate", m_gt_0); 495 1.1 abhinav } 496 1.1 abhinav break; 497 1.1 abhinav case 's': 498 1.1 abhinav if (!stem(&z, 
"msila", "al", m_gt_0) 499 1.1 abhinav && !stem(&z, "ssenevi", "ive", m_gt_0) 500 1.1 abhinav && !stem(&z, "ssenluf", "ful", m_gt_0) 501 1.1 abhinav ) { 502 1.1 abhinav stem(&z, "ssensuo", "ous", m_gt_0); 503 1.1 abhinav } 504 1.1 abhinav break; 505 1.1 abhinav case 't': 506 1.1 abhinav if (!stem(&z, "itila", "al", m_gt_0) 507 1.1 abhinav && !stem(&z, "itivi", "ive", m_gt_0) 508 1.1 abhinav ) { 509 1.1 abhinav stem(&z, "itilib", "ble", m_gt_0); 510 1.1 abhinav } 511 1.1 abhinav break; 512 1.1 abhinav } 513 1.1 abhinav 514 1.1 abhinav /* Step 3 */ 515 1.1 abhinav switch (z[0]) { 516 1.1 abhinav case 'e': 517 1.1 abhinav if (!stem(&z, "etaci", "ic", m_gt_0) 518 1.1 abhinav && !stem(&z, "evita", "", m_gt_0) 519 1.1 abhinav ) { 520 1.1 abhinav stem(&z, "ezila", "al", m_gt_0); 521 1.1 abhinav } 522 1.1 abhinav break; 523 1.1 abhinav case 'i': 524 1.1 abhinav stem(&z, "itici", "ic", m_gt_0); 525 1.1 abhinav break; 526 1.1 abhinav case 'l': 527 1.1 abhinav if (!stem(&z, "laci", "ic", m_gt_0)) { 528 1.1 abhinav stem(&z, "luf", "", m_gt_0); 529 1.1 abhinav } 530 1.1 abhinav break; 531 1.1 abhinav case 's': 532 1.1 abhinav stem(&z, "ssen", "", m_gt_0); 533 1.1 abhinav break; 534 1.1 abhinav } 535 1.1 abhinav 536 1.1 abhinav /* Step 4 */ 537 1.1 abhinav switch (z[1]) { 538 1.1 abhinav case 'a': 539 1.1 abhinav if (z[0] == 'l' && m_gt_1(z + 2)) { 540 1.1 abhinav z += 2; 541 1.1 abhinav } 542 1.1 abhinav break; 543 1.1 abhinav case 'c': 544 1.1 abhinav if (z[0] == 'e' && z[2] == 'n' && (z[3] == 'a' || z[3] == 'e') && m_gt_1(z + 4)) { 545 1.1 abhinav z += 4; 546 1.1 abhinav } 547 1.1 abhinav break; 548 1.1 abhinav case 'e': 549 1.1 abhinav if (z[0] == 'r' && m_gt_1(z + 2)) { 550 1.1 abhinav z += 2; 551 1.1 abhinav } 552 1.1 abhinav break; 553 1.1 abhinav case 'i': 554 1.1 abhinav if (z[0] == 'c' && m_gt_1(z + 2)) { 555 1.1 abhinav z += 2; 556 1.1 abhinav } 557 1.1 abhinav break; 558 1.1 abhinav case 'l': 559 1.1 abhinav if (z[0] == 'e' && z[2] == 'b' && (z[3] == 'a' || z[3] == 
'i') && m_gt_1(z + 4)) { 560 1.1 abhinav z += 4; 561 1.1 abhinav } 562 1.1 abhinav break; 563 1.1 abhinav case 'n': 564 1.1 abhinav if (z[0] == 't') { 565 1.1 abhinav if (z[2] == 'a') { 566 1.1 abhinav if (m_gt_1(z + 3)) { 567 1.1 abhinav z += 3; 568 1.1 abhinav } 569 1.1 abhinav } else if (z[2] == 'e') { 570 1.1 abhinav if (!stem(&z, "tneme", "", m_gt_1) 571 1.1 abhinav && !stem(&z, "tnem", "", m_gt_1) 572 1.1 abhinav ) { 573 1.1 abhinav stem(&z, "tne", "", m_gt_1); 574 1.1 abhinav } 575 1.1 abhinav } 576 1.1 abhinav } 577 1.1 abhinav break; 578 1.1 abhinav case 'o': 579 1.1 abhinav if (z[0] == 'u') { 580 1.1 abhinav if (m_gt_1(z + 2)) { 581 1.1 abhinav z += 2; 582 1.1 abhinav } 583 1.1 abhinav } else if (z[3] == 's' || z[3] == 't') { 584 1.1 abhinav stem(&z, "noi", "", m_gt_1); 585 1.1 abhinav } 586 1.1 abhinav break; 587 1.1 abhinav case 's': 588 1.1 abhinav if (z[0] == 'm' && z[2] == 'i' && m_gt_1(z + 3)) { 589 1.1 abhinav z += 3; 590 1.1 abhinav } 591 1.1 abhinav break; 592 1.1 abhinav case 't': 593 1.1 abhinav if (!stem(&z, "eta", "", m_gt_1)) { 594 1.1 abhinav stem(&z, "iti", "", m_gt_1); 595 1.1 abhinav } 596 1.1 abhinav break; 597 1.1 abhinav case 'u': 598 1.1 abhinav if (z[0] == 's' && z[2] == 'o' && m_gt_1(z + 3)) { 599 1.1 abhinav z += 3; 600 1.1 abhinav } 601 1.1 abhinav break; 602 1.1 abhinav case 'v': 603 1.1 abhinav case 'z': 604 1.1 abhinav if (z[0] == 'e' && z[2] == 'i' && m_gt_1(z + 3)) { 605 1.1 abhinav z += 3; 606 1.1 abhinav } 607 1.1 abhinav break; 608 1.1 abhinav } 609 1.1 abhinav 610 1.1 abhinav /* Step 5a */ 611 1.1 abhinav if (z[0] == 'e') { 612 1.1 abhinav if (m_gt_1(z + 1)) { 613 1.1 abhinav z++; 614 1.1 abhinav } else if (m_eq_1(z + 1) && !star_oh(z + 1)) { 615 1.1 abhinav z++; 616 1.1 abhinav } 617 1.1 abhinav } 618 1.1 abhinav /* Step 5b */ 619 1.1 abhinav if (m_gt_1(z) && z[0] == 'l' && z[1] == 'l') { 620 1.1 abhinav z++; 621 1.1 abhinav } 622 1.1 abhinav /* z[] is now the stemmed word in reverse order. 
Flip it back 623 1.1 abhinav * around into forward order and return. 624 1.1 abhinav */ 625 1.1 abhinav *pnOut = i = strlen(z); 626 1.1 abhinav zOut[i] = 0; 627 1.1 abhinav while (*z) { 628 1.1 abhinav zOut[--i] = *(z++); 629 1.1 abhinav } 630 1.1 abhinav } 631 1.1 abhinav 632 1.1 abhinav /* 633 1.1 abhinav * Based on whether the input word is in the nostem list or not 634 1.1 abhinav * call porter stemmer to stem it, or call copy_stemmer to keep it 635 1.5 rin * as it is (copy_stemmer converts simply converts it to lower case) 636 1.1 abhinav * Returns SQLITE_OK if stemming is successful, an error code for 637 1.1 abhinav * any errors 638 1.1 abhinav */ 639 1.1 abhinav static int 640 1.1 abhinav do_stem(const char *zIn, size_t nIn, char *zOut, size_t *pnOut) 641 1.1 abhinav { 642 1.1 abhinav /* Before looking up the word in the hash table, convert it to lower-case */ 643 1.1 abhinav char *dupword = malloc(nIn); 644 1.1 abhinav if (dupword == NULL) 645 1.1 abhinav return SQLITE_NOMEM; 646 1.1 abhinav 647 1.1 abhinav for (size_t i = 0; i < nIn; i++) 648 1.1 abhinav dupword[i] = tolower((unsigned char) zIn[i]); 649 1.1 abhinav 650 1.1 abhinav size_t idx = nostem_hash(dupword, nIn); 651 1.1 abhinav if (strncmp(nostem[idx], dupword, nIn) == 0 && nostem[idx][nIn] == 0) 652 1.1 abhinav copy_stemmer(zIn, nIn, zOut, pnOut); 653 1.1 abhinav else 654 1.1 abhinav porter_stemmer(zIn, nIn, zOut, pnOut); 655 1.1 abhinav 656 1.1 abhinav free(dupword); 657 1.1 abhinav return SQLITE_OK; 658 1.1 abhinav } 659 1.1 abhinav 660 1.1 abhinav 661 1.1 abhinav /* 662 1.1 abhinav * Characters that can be part of a token. We assume any character 663 1.1 abhinav * whose value is greater than 0x80 (any UTF character) can be 664 1.1 abhinav * part of a token. In other words, delimiters all must have 665 1.1 abhinav * values of 0x7f or lower. 
666 1.1 abhinav */ 667 1.1 abhinav static const char porterIdChar[] = { 668 1.1 abhinav /* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */ 669 1.1 abhinav 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* 3x */ 670 1.1 abhinav 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 4x */ 671 1.1 abhinav 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 5x */ 672 1.1 abhinav 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 6x */ 673 1.1 abhinav 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 7x */ 674 1.1 abhinav }; 675 1.1 abhinav 676 1.1 abhinav #define isDelim(C) (((ch=C)&0x80)==0 && (ch<0x30 || !porterIdChar[ch-0x30])) 677 1.1 abhinav 678 1.1 abhinav /* 679 1.1 abhinav * Extract the next token from a tokenization cursor. The cursor must 680 1.1 abhinav * have been opened by a prior call to aproposPorterOpen(). 681 1.1 abhinav */ 682 1.5 rin static int 683 1.1 abhinav aproposPorterNext( 684 1.1 abhinav sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by aproposPorterOpen */ 685 1.1 abhinav const char **pzToken, /* OUT: *pzToken is the token text */ 686 1.1 abhinav int *pnBytes, /* OUT: Number of bytes in token */ 687 1.1 abhinav int *piStartOffset, /* OUT: Starting offset of token */ 688 1.1 abhinav int *piEndOffset, /* OUT: Ending offset of token */ 689 1.1 abhinav int *piPosition /* OUT: Position integer of token */ 690 1.1 abhinav ) 691 1.1 abhinav { 692 1.1 abhinav custom_apropos_tokenizer_cursor *c = (custom_apropos_tokenizer_cursor *) pCursor; 693 1.1 abhinav const char *z = c->zInput; 694 1.1 abhinav 695 1.1 abhinav while (c->iOffset < c->nInput) { 696 1.1 abhinav size_t iStartOffset, ch; 697 1.1 abhinav 698 1.1 abhinav /* Scan past delimiter characters */ 699 1.1 abhinav while (c->iOffset < c->nInput && isDelim(z[c->iOffset])) { 700 1.1 abhinav c->iOffset++; 701 1.1 abhinav } 702 1.1 abhinav 703 1.1 abhinav /* Count non-delimiter characters. 
*/ 704 1.1 abhinav iStartOffset = c->iOffset; 705 1.1 abhinav while (c->iOffset < c->nInput && !isDelim(z[c->iOffset])) { 706 1.1 abhinav c->iOffset++; 707 1.1 abhinav } 708 1.1 abhinav 709 1.1 abhinav if (c->iOffset > iStartOffset) { 710 1.1 abhinav size_t n = c->iOffset - iStartOffset; 711 1.1 abhinav if (n > c->nAllocated) { 712 1.1 abhinav char *pNew; 713 1.1 abhinav c->nAllocated = n + 20; 714 1.1 abhinav pNew = realloc(c->zToken, c->nAllocated); 715 1.1 abhinav if (!pNew) 716 1.1 abhinav return SQLITE_NOMEM; 717 1.1 abhinav c->zToken = pNew; 718 1.1 abhinav } 719 1.5 rin 720 1.5 rin size_t temp; 721 1.2 abhinav int stemStatus = do_stem(&z[iStartOffset], n, c->zToken, &temp); 722 1.1 abhinav if (stemStatus != SQLITE_OK) 723 1.1 abhinav return stemStatus; 724 1.6 tnn *pnBytes = temp; 725 1.1 abhinav 726 1.1 abhinav *pzToken = c->zToken; 727 1.1 abhinav *piStartOffset = iStartOffset; 728 1.1 abhinav *piEndOffset = c->iOffset; 729 1.1 abhinav *piPosition = c->iToken++; 730 1.1 abhinav return SQLITE_OK; 731 1.1 abhinav } 732 1.1 abhinav } 733 1.1 abhinav return SQLITE_DONE; 734 1.1 abhinav } 735 1.1 abhinav 736 1.1 abhinav /* 737 1.1 abhinav * The set of routines that implement the porter-stemmer tokenizer 738 1.1 abhinav */ 739 1.1 abhinav static const sqlite3_tokenizer_module aproposPorterTokenizerModule = { 740 1.1 abhinav 0, 741 1.1 abhinav aproposPorterCreate, 742 1.1 abhinav aproposPorterDestroy, 743 1.1 abhinav aproposPorterOpen, 744 1.1 abhinav aproposPorterClose, 745 1.1 abhinav aproposPorterNext, 746 1.1 abhinav 0 747 1.1 abhinav }; 748 1.1 abhinav 749 1.1 abhinav /* 750 1.1 abhinav * Allocate a new porter tokenizer. Return a pointer to the new 751 1.1 abhinav * tokenizer in *ppModule 752 1.1 abhinav */ 753 1.5 rin void 754 1.1 abhinav get_custom_apropos_tokenizer(sqlite3_tokenizer_module const ** ppModule) 755 1.1 abhinav { 756 1.1 abhinav *ppModule = &aproposPorterTokenizerModule; 757 1.1 abhinav } 758