/*	$NetBSD: custom_apropos_tokenizer.c,v 1.2 2017/10/31 10:14:27 abhinav Exp $	*/
/*
** 2006 September 30
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
*************************************************************************
** Implementation of the full-text-search tokenizer that implements
** a Porter stemmer.
*/
16 1.1 abhinav
/*
** The code in this file is only compiled if:
**
**     * The FTS3 module is being built as an extension
**       (in which case SQLITE_CORE is not defined), or
**
**     * The FTS3 module is being built into the core of
**       SQLite (in which case SQLITE_ENABLE_FTS3 is defined).
*/
26 1.1 abhinav
27 1.1 abhinav #include <assert.h>
28 1.1 abhinav #include <ctype.h>
29 1.1 abhinav #include <stdlib.h>
30 1.1 abhinav #include <stdio.h>
31 1.1 abhinav #include <string.h>
32 1.1 abhinav
33 1.1 abhinav #include "custom_apropos_tokenizer.h"
34 1.1 abhinav #include "fts3_tokenizer.h"
35 1.1 abhinav #include "nostem.c"
36 1.1 abhinav
37 1.1 abhinav /*
38 1.1 abhinav * Class derived from sqlite3_tokenizer
39 1.1 abhinav */
40 1.1 abhinav typedef struct custom_apropos_tokenizer {
41 1.1 abhinav sqlite3_tokenizer base; /* Base class */
42 1.1 abhinav } custom_apropos_tokenizer;
43 1.1 abhinav
44 1.1 abhinav /*
45 1.1 abhinav * Class derived from sqlite3_tokenizer_cursor
46 1.1 abhinav */
47 1.1 abhinav typedef struct custom_apropos_tokenizer_cursor {
48 1.1 abhinav sqlite3_tokenizer_cursor base;
49 1.1 abhinav const char *zInput; /* input we are tokenizing */
50 1.1 abhinav size_t nInput; /* size of the input */
51 1.1 abhinav size_t iOffset; /* current position in zInput */
52 1.1 abhinav size_t iToken; /* index of next token to be returned */
53 1.1 abhinav char *zToken; /* storage for current token */
54 1.1 abhinav size_t nAllocated; /* space allocated to zToken buffer */
55 1.1 abhinav } custom_apropos_tokenizer_cursor;
56 1.1 abhinav
57 1.1 abhinav /*
58 1.1 abhinav * Create a new tokenizer instance.
59 1.1 abhinav */
60 1.1 abhinav static int
61 1.1 abhinav aproposPorterCreate(int argc, const char *const * argv,
62 1.1 abhinav sqlite3_tokenizer ** ppTokenizer)
63 1.1 abhinav {
64 1.1 abhinav custom_apropos_tokenizer *t;
65 1.1 abhinav t = calloc(1, sizeof(*t));
66 1.1 abhinav if (t == NULL)
67 1.1 abhinav return SQLITE_NOMEM;
68 1.1 abhinav *ppTokenizer = &t->base;
69 1.1 abhinav return SQLITE_OK;
70 1.1 abhinav }
71 1.1 abhinav
72 1.1 abhinav /*
73 1.1 abhinav * Destroy a tokenizer
74 1.1 abhinav */
75 1.1 abhinav static int
76 1.1 abhinav aproposPorterDestroy(sqlite3_tokenizer * pTokenizer)
77 1.1 abhinav {
78 1.1 abhinav free(pTokenizer);
79 1.1 abhinav return SQLITE_OK;
80 1.1 abhinav }
81 1.1 abhinav
82 1.1 abhinav /*
83 1.1 abhinav * Prepare to begin tokenizing a particular string. The input
84 1.1 abhinav * string to be tokenized is zInput[0..nInput-1]. A cursor
85 1.1 abhinav * used to incrementally tokenize this string is returned in
86 1.1 abhinav * *ppCursor.
87 1.1 abhinav */
88 1.1 abhinav static int
89 1.1 abhinav aproposPorterOpen(
90 1.1 abhinav sqlite3_tokenizer * pTokenizer, /* The tokenizer */
91 1.1 abhinav const char *zInput, int nInput, /* String to be tokenized */
92 1.1 abhinav sqlite3_tokenizer_cursor ** ppCursor /* OUT: Tokenization cursor */
93 1.1 abhinav )
94 1.1 abhinav {
95 1.1 abhinav custom_apropos_tokenizer_cursor *c;
96 1.1 abhinav
97 1.1 abhinav c = calloc(1, sizeof(*c));
98 1.1 abhinav if (c == NULL)
99 1.1 abhinav return SQLITE_NOMEM;
100 1.1 abhinav
101 1.1 abhinav c->zInput = zInput;
102 1.1 abhinav if (zInput != 0) {
103 1.1 abhinav if (nInput < 0)
104 1.1 abhinav c->nInput = strlen(zInput);
105 1.1 abhinav else
106 1.1 abhinav c->nInput = nInput;
107 1.1 abhinav }
108 1.1 abhinav
109 1.1 abhinav *ppCursor = &c->base;
110 1.1 abhinav return SQLITE_OK;
111 1.1 abhinav }
112 1.1 abhinav
113 1.1 abhinav /*
114 1.1 abhinav * Close a tokenization cursor previously opened by a call to
115 1.1 abhinav * aproposPorterOpen() above.
116 1.1 abhinav */
117 1.1 abhinav static int
118 1.1 abhinav aproposPorterClose(sqlite3_tokenizer_cursor *pCursor)
119 1.1 abhinav {
120 1.1 abhinav custom_apropos_tokenizer_cursor *c = (custom_apropos_tokenizer_cursor *) pCursor;
121 1.1 abhinav free(c->zToken);
122 1.1 abhinav free(c);
123 1.1 abhinav return SQLITE_OK;
124 1.1 abhinav }
125 1.1 abhinav
/*
 * Letter classes for 'a'..'z', indexed by (letter - 'a'):
 *   0 = vowel, 1 = consonant, 2 = 'y' (class depends on its neighbour).
 */
static const char cType[] = {
	/* a  b  c  d  e  f  g  h  i  j  k  l  m */
	   0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1,
	/* n  o  p  q  r  s  t  u  v  w  x  y  z */
	   1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 2, 1
};

/*
 * isConsonant() and isVowel() classify the first character of the string
 * they are given, following the Porter rules.  Both return 0 for an empty
 * string and expect a lower-case ASCII letter otherwise (asserted).
 *
 * A consonant is any letter other than 'a', 'e', 'i', 'o' or 'u'.
 * 'y' is a consonant unless it comes right after another consonant, in
 * which case it is a vowel.
 *
 * The words handled here are stored in reverse order, so "the letter
 * before 'y' in the word" is z[1].  In these routines 'y' is therefore a
 * consonant when nothing follows it in z[] or when z[1] is a vowel.
 */
static int isVowel(const char *);

static int
isConsonant(const char *z)
{
	const char letter = z[0];
	int kind;

	if (letter == '\0')
		return 0;
	assert(letter >= 'a' && letter <= 'z');

	kind = cType[letter - 'a'];
	if (kind == 2)
		return z[1] == '\0' || isVowel(&z[1]);
	return kind;
}

static int
isVowel(const char *z)
{
	const char letter = z[0];
	int kind;

	if (letter == '\0')
		return 0;
	assert(letter >= 'a' && letter <= 'z');

	kind = cType[letter - 'a'];
	if (kind == 2)
		return isConsonant(&z[1]);
	return 1 - kind;
}
176 1.1 abhinav
/*
 * Let V stand for a run of one or more vowels and C for a run of one or
 * more consonants.  Every word then has the shape
 *
 *           [C] (VC){m} [V]
 *
 * i.e. an optional leading consonant run, m vowel-consonant pairs and an
 * optional trailing vowel run.  "m" is the measure of the word.
 *
 * m_gt_0() returns true when m >= 1, that is, when the word contains a
 * vowel that is followed by a consonant.
 *
 * z[] holds the word in reverse order, so the scan skips the (trailing)
 * vowels first, then a consonant run, and reports whether anything is
 * left after that consonant run.
 */
static int
m_gt_0(const char *z)
{
	const char *p = z;

	for (; isVowel(p); p++)
		continue;
	if (*p == '\0')
		return 0;

	for (; isConsonant(p); p++)
		continue;
	return *p != '\0';
}
209 1.1 abhinav
/*
 * Like m_gt_0(), but returns true only when the measure m of the
 * (reversed) word z is exactly 1: after the optional trailing vowels
 * there must be one consonant run and one vowel run, optionally followed
 * by a final consonant run and nothing more.
 */
static int
m_eq_1(const char *z)
{
	const char *p = z;

	for (; isVowel(p); p++)
		continue;
	if (*p == '\0')
		return 0;

	for (; isConsonant(p); p++)
		continue;
	if (*p == '\0')
		return 0;

	for (; isVowel(p); p++)
		continue;
	if (*p == '\0')
		return 1;

	for (; isConsonant(p); p++)
		continue;
	return *p == '\0';
}
236 1.1 abhinav
/*
 * Like m_gt_0(), but returns true only when the measure m of the
 * (reversed) word z is greater than 1: something must remain after
 * skipping vowels, consonants, vowels and consonants in that order.
 */
static int
m_gt_1(const char *z)
{
	const char *p = z;

	for (; isVowel(p); p++)
		continue;
	if (*p == '\0')
		return 0;

	for (; isConsonant(p); p++)
		continue;
	if (*p == '\0')
		return 0;

	for (; isVowel(p); p++)
		continue;
	if (*p == '\0')
		return 0;

	for (; isConsonant(p); p++)
		continue;
	return *p != '\0';
}
263 1.1 abhinav
/*
 * Return TRUE if the NUL-terminated (reversed) word z contains a vowel.
 * The scan stops at the first character that is not a consonant; that
 * character is either a vowel or the terminator.
 */
static int
hasVowel(const char *z)
{
	const char *p;

	for (p = z; isConsonant(p); p++)
		continue;
	return *p != '\0';
}
275 1.1 abhinav
/*
 * Return TRUE if the word ends in a doubled consonant.
 *
 * The word is stored reversed, so its last two letters are z[0] and z[1].
 */
static int
doubleConsonant(const char *z)
{
	if (!isConsonant(z))
		return 0;
	return z[0] == z[1];
}
287 1.1 abhinav
/*
 * Return TRUE if the word ends with consonant-vowel-consonant and the
 * final consonant is not 'w', 'x' or 'y' (condition *o of the Porter
 * algorithm).
 *
 * The word is stored reversed, so the three letters examined are
 * z[0], z[1] and z[2], and it is z[0] that must not be in [wxy].
 */
static int
star_oh(const char *z)
{
	if (!isConsonant(z))
		return 0;
	if (z[0] == 'w' || z[0] == 'x' || z[0] == 'y')
		return 0;
	if (!isVowel(z + 1))
		return 0;
	return isConsonant(z + 2) != 0;
}
304 1.1 abhinav
/*
 * Try one suffix-rewrite rule on the word *pz.
 *
 * The word and the suffix zFrom are both stored in reverse order; the
 * replacement zTo is in normal order.  If the word ends with zFrom and
 * xCond (when not NULL) accepts the part of the word that precedes the
 * suffix, the suffix is replaced by zTo and *pz is moved to the new
 * start of the reversed word.  The replacement is written into the
 * bytes just before the remaining stem, so it may reach below the old
 * *pz when zTo is longer than zFrom.
 *
 * Returns TRUE if zFrom matched and FALSE otherwise.  Note that TRUE is
 * returned even when xCond() rejects the stem and nothing is rewritten.
 */
static int
stem(
    char **pz,			/* The word being stemmed (reversed) */
    const char *zFrom,		/* Suffix to look for (reversed) */
    const char *zTo,		/* Replacement suffix (not reversed) */
    int (*xCond)(const char *))	/* Condition on the stem, or NULL */
{
	char *cur = *pz;
	const char *suffix;
	const char *repl;

	for (suffix = zFrom; *suffix != '\0'; suffix++, cur++) {
		if (*cur != *suffix)
			return 0;
	}

	if (xCond != NULL && xCond(cur) == 0)
		return 1;

	/* Lay the replacement down back to front, ahead of the stem. */
	for (repl = zTo; *repl != '\0'; repl++)
		*--cur = *repl;

	*pz = cur;
	return 1;
}
340 1.1 abhinav
/*
 * Fallback "stemmer" used when the Porter stemmer is not appropriate.
 * The input word zIn[0..nIn-1] is copied to zOut with US-ASCII upper-case
 * letters folded to lower case, then NUL-terminated; the resulting
 * length is stored in *pnOut.
 *
 * Over-long words are shortened by keeping only their head and tail:
 * a word containing a digit and longer than 6 bytes keeps its first and
 * last 3 bytes; a word without digits and longer than 20 bytes keeps its
 * first and last 10 bytes.
 *
 * All nIn bytes are written to zOut before any shortening happens, so
 * zOut needs room for nIn bytes, plus one for the terminator when the
 * word is not shortened.
 */
static void
copy_stemmer(const char *zIn, size_t nIn, char *zOut, size_t *pnOut)
{
	size_t keep = 10;	/* bytes kept at each end of a long word */
	size_t len = nIn;
	size_t k;

	for (k = 0; k < nIn; k++) {
		char ch = zIn[k];

		if (ch >= '0' && ch <= '9')
			keep = 3;
		else if (ch >= 'A' && ch <= 'Z')
			ch = ch - 'A' + 'a';
		zOut[k] = ch;
	}

	if (nIn > 2 * keep) {
		/* Slide the tail down so that it follows the head. */
		memmove(zOut + keep, zOut + (nIn - keep), keep);
		len = 2 * keep;
	}

	zOut[len] = 0;
	*pnOut = len;
}
374 1.1 abhinav
375 1.1 abhinav
376 1.1 abhinav /*
377 1.1 abhinav * Stem the input word zIn[0..nIn-1]. Store the output in zOut.
378 1.1 abhinav * zOut is at least big enough to hold nIn bytes. Write the actual
379 1.1 abhinav * size of the output word (exclusive of the '\0' terminator) into *pnOut.
380 1.1 abhinav *
381 1.1 abhinav * Any upper-case characters in the US-ASCII character set ([A-Z])
382 1.1 abhinav * are converted to lower case. Upper-case UTF characters are
383 1.1 abhinav * unchanged.
384 1.1 abhinav *
385 1.1 abhinav * Words that are longer than about 20 bytes are stemmed by retaining
386 1.1 abhinav * a few bytes from the beginning and the end of the word. If the
387 1.1 abhinav * word contains digits, 3 bytes are taken from the beginning and
388 1.1 abhinav * 3 bytes from the end. For long words without digits, 10 bytes
389 1.1 abhinav * are taken from each end. US-ASCII case folding still applies.
390 1.1 abhinav *
391 1.1 abhinav * If the input word contains not digits but does characters not
392 1.1 abhinav * in [a-zA-Z] then no stemming is attempted and this routine just
393 1.1 abhinav * copies the input into the input into the output with US-ASCII
394 1.1 abhinav * case folding.
395 1.1 abhinav *
396 1.1 abhinav * Stemming never increases the length of the word. So there is
397 1.1 abhinav * no chance of overflowing the zOut buffer.
398 1.1 abhinav */
399 1.1 abhinav static void
400 1.1 abhinav porter_stemmer(const char *zIn, size_t nIn, char *zOut, size_t *pnOut)
401 1.1 abhinav {
402 1.1 abhinav size_t i, j;
403 1.1 abhinav char zReverse[28];
404 1.1 abhinav char *z, *z2;
405 1.1 abhinav if (nIn < 3 || nIn >= sizeof(zReverse) - 7) {
406 1.1 abhinav /* The word is too big or too small for the porter stemmer.
407 1.1 abhinav * Fallback to the copy stemmer
408 1.1 abhinav */
409 1.1 abhinav copy_stemmer(zIn, nIn, zOut, pnOut);
410 1.1 abhinav return;
411 1.1 abhinav }
412 1.1 abhinav
413 1.1 abhinav for (i = 0, j = sizeof(zReverse) - 6; i < nIn; i++, j--) {
414 1.1 abhinav char c = zIn[i];
415 1.1 abhinav if (c >= 'A' && c <= 'Z') {
416 1.1 abhinav zReverse[j] = c + 'a' - 'A';
417 1.1 abhinav } else if (c >= 'a' && c <= 'z') {
418 1.1 abhinav zReverse[j] = c;
419 1.1 abhinav } else {
420 1.1 abhinav /* The use of a character not in [a-zA-Z] means that
421 1.1 abhinav * we fallback * to the copy stemmer
422 1.1 abhinav */
423 1.1 abhinav copy_stemmer(zIn, nIn, zOut, pnOut);
424 1.1 abhinav return;
425 1.1 abhinav }
426 1.1 abhinav }
427 1.1 abhinav memset(&zReverse[sizeof(zReverse) - 5], 0, 5);
428 1.1 abhinav z = &zReverse[j + 1];
429 1.1 abhinav
430 1.1 abhinav
431 1.1 abhinav /* Step 1a */
432 1.1 abhinav if (z[0] == 's') {
433 1.1 abhinav if (
434 1.1 abhinav !stem(&z, "sess", "ss", 0) &&
435 1.1 abhinav !stem(&z, "sei", "i", 0) &&
436 1.1 abhinav !stem(&z, "ss", "ss", 0)
437 1.1 abhinav ) {
438 1.1 abhinav z++;
439 1.1 abhinav }
440 1.1 abhinav }
441 1.1 abhinav /* Step 1b */
442 1.1 abhinav z2 = z;
443 1.1 abhinav if (stem(&z, "dee", "ee", m_gt_0)) {
444 1.1 abhinav /* Do nothing. The work was all in the test */
445 1.1 abhinav } else if (
446 1.1 abhinav (stem(&z, "gni", "", hasVowel) || stem(&z, "de", "", hasVowel))
447 1.1 abhinav && z != z2
448 1.1 abhinav ) {
449 1.1 abhinav if (stem(&z, "ta", "ate", 0) ||
450 1.1 abhinav stem(&z, "lb", "ble", 0) ||
451 1.1 abhinav stem(&z, "zi", "ize", 0)) {
452 1.1 abhinav /* Do nothing. The work was all in the test */
453 1.1 abhinav } else if (doubleConsonant(z) && (*z != 'l' && *z != 's' && *z != 'z')) {
454 1.1 abhinav z++;
455 1.1 abhinav } else if (m_eq_1(z) && star_oh(z)) {
456 1.1 abhinav *(--z) = 'e';
457 1.1 abhinav }
458 1.1 abhinav }
459 1.1 abhinav /* Step 1c */
460 1.1 abhinav if (z[0] == 'y' && hasVowel(z + 1)) {
461 1.1 abhinav z[0] = 'i';
462 1.1 abhinav }
463 1.1 abhinav /* Step 2 */
464 1.1 abhinav switch (z[1]) {
465 1.1 abhinav case 'a':
466 1.1 abhinav if (!stem(&z, "lanoita", "ate", m_gt_0)) {
467 1.1 abhinav stem(&z, "lanoit", "tion", m_gt_0);
468 1.1 abhinav }
469 1.1 abhinav break;
470 1.1 abhinav case 'c':
471 1.1 abhinav if (!stem(&z, "icne", "ence", m_gt_0)) {
472 1.1 abhinav stem(&z, "icna", "ance", m_gt_0);
473 1.1 abhinav }
474 1.1 abhinav break;
475 1.1 abhinav case 'e':
476 1.1 abhinav stem(&z, "rezi", "ize", m_gt_0);
477 1.1 abhinav break;
478 1.1 abhinav case 'g':
479 1.1 abhinav stem(&z, "igol", "log", m_gt_0);
480 1.1 abhinav break;
481 1.1 abhinav case 'l':
482 1.1 abhinav if (!stem(&z, "ilb", "ble", m_gt_0)
483 1.1 abhinav && !stem(&z, "illa", "al", m_gt_0)
484 1.1 abhinav && !stem(&z, "iltne", "ent", m_gt_0)
485 1.1 abhinav && !stem(&z, "ile", "e", m_gt_0)
486 1.1 abhinav ) {
487 1.1 abhinav stem(&z, "ilsuo", "ous", m_gt_0);
488 1.1 abhinav }
489 1.1 abhinav break;
490 1.1 abhinav case 'o':
491 1.1 abhinav if (!stem(&z, "noitazi", "ize", m_gt_0)
492 1.1 abhinav && !stem(&z, "noita", "ate", m_gt_0)
493 1.1 abhinav ) {
494 1.1 abhinav stem(&z, "rota", "ate", m_gt_0);
495 1.1 abhinav }
496 1.1 abhinav break;
497 1.1 abhinav case 's':
498 1.1 abhinav if (!stem(&z, "msila", "al", m_gt_0)
499 1.1 abhinav && !stem(&z, "ssenevi", "ive", m_gt_0)
500 1.1 abhinav && !stem(&z, "ssenluf", "ful", m_gt_0)
501 1.1 abhinav ) {
502 1.1 abhinav stem(&z, "ssensuo", "ous", m_gt_0);
503 1.1 abhinav }
504 1.1 abhinav break;
505 1.1 abhinav case 't':
506 1.1 abhinav if (!stem(&z, "itila", "al", m_gt_0)
507 1.1 abhinav && !stem(&z, "itivi", "ive", m_gt_0)
508 1.1 abhinav ) {
509 1.1 abhinav stem(&z, "itilib", "ble", m_gt_0);
510 1.1 abhinav }
511 1.1 abhinav break;
512 1.1 abhinav }
513 1.1 abhinav
514 1.1 abhinav /* Step 3 */
515 1.1 abhinav switch (z[0]) {
516 1.1 abhinav case 'e':
517 1.1 abhinav if (!stem(&z, "etaci", "ic", m_gt_0)
518 1.1 abhinav && !stem(&z, "evita", "", m_gt_0)
519 1.1 abhinav ) {
520 1.1 abhinav stem(&z, "ezila", "al", m_gt_0);
521 1.1 abhinav }
522 1.1 abhinav break;
523 1.1 abhinav case 'i':
524 1.1 abhinav stem(&z, "itici", "ic", m_gt_0);
525 1.1 abhinav break;
526 1.1 abhinav case 'l':
527 1.1 abhinav if (!stem(&z, "laci", "ic", m_gt_0)) {
528 1.1 abhinav stem(&z, "luf", "", m_gt_0);
529 1.1 abhinav }
530 1.1 abhinav break;
531 1.1 abhinav case 's':
532 1.1 abhinav stem(&z, "ssen", "", m_gt_0);
533 1.1 abhinav break;
534 1.1 abhinav }
535 1.1 abhinav
536 1.1 abhinav /* Step 4 */
537 1.1 abhinav switch (z[1]) {
538 1.1 abhinav case 'a':
539 1.1 abhinav if (z[0] == 'l' && m_gt_1(z + 2)) {
540 1.1 abhinav z += 2;
541 1.1 abhinav }
542 1.1 abhinav break;
543 1.1 abhinav case 'c':
544 1.1 abhinav if (z[0] == 'e' && z[2] == 'n' && (z[3] == 'a' || z[3] == 'e') && m_gt_1(z + 4)) {
545 1.1 abhinav z += 4;
546 1.1 abhinav }
547 1.1 abhinav break;
548 1.1 abhinav case 'e':
549 1.1 abhinav if (z[0] == 'r' && m_gt_1(z + 2)) {
550 1.1 abhinav z += 2;
551 1.1 abhinav }
552 1.1 abhinav break;
553 1.1 abhinav case 'i':
554 1.1 abhinav if (z[0] == 'c' && m_gt_1(z + 2)) {
555 1.1 abhinav z += 2;
556 1.1 abhinav }
557 1.1 abhinav break;
558 1.1 abhinav case 'l':
559 1.1 abhinav if (z[0] == 'e' && z[2] == 'b' && (z[3] == 'a' || z[3] == 'i') && m_gt_1(z + 4)) {
560 1.1 abhinav z += 4;
561 1.1 abhinav }
562 1.1 abhinav break;
563 1.1 abhinav case 'n':
564 1.1 abhinav if (z[0] == 't') {
565 1.1 abhinav if (z[2] == 'a') {
566 1.1 abhinav if (m_gt_1(z + 3)) {
567 1.1 abhinav z += 3;
568 1.1 abhinav }
569 1.1 abhinav } else if (z[2] == 'e') {
570 1.1 abhinav if (!stem(&z, "tneme", "", m_gt_1)
571 1.1 abhinav && !stem(&z, "tnem", "", m_gt_1)
572 1.1 abhinav ) {
573 1.1 abhinav stem(&z, "tne", "", m_gt_1);
574 1.1 abhinav }
575 1.1 abhinav }
576 1.1 abhinav }
577 1.1 abhinav break;
578 1.1 abhinav case 'o':
579 1.1 abhinav if (z[0] == 'u') {
580 1.1 abhinav if (m_gt_1(z + 2)) {
581 1.1 abhinav z += 2;
582 1.1 abhinav }
583 1.1 abhinav } else if (z[3] == 's' || z[3] == 't') {
584 1.1 abhinav stem(&z, "noi", "", m_gt_1);
585 1.1 abhinav }
586 1.1 abhinav break;
587 1.1 abhinav case 's':
588 1.1 abhinav if (z[0] == 'm' && z[2] == 'i' && m_gt_1(z + 3)) {
589 1.1 abhinav z += 3;
590 1.1 abhinav }
591 1.1 abhinav break;
592 1.1 abhinav case 't':
593 1.1 abhinav if (!stem(&z, "eta", "", m_gt_1)) {
594 1.1 abhinav stem(&z, "iti", "", m_gt_1);
595 1.1 abhinav }
596 1.1 abhinav break;
597 1.1 abhinav case 'u':
598 1.1 abhinav if (z[0] == 's' && z[2] == 'o' && m_gt_1(z + 3)) {
599 1.1 abhinav z += 3;
600 1.1 abhinav }
601 1.1 abhinav break;
602 1.1 abhinav case 'v':
603 1.1 abhinav case 'z':
604 1.1 abhinav if (z[0] == 'e' && z[2] == 'i' && m_gt_1(z + 3)) {
605 1.1 abhinav z += 3;
606 1.1 abhinav }
607 1.1 abhinav break;
608 1.1 abhinav }
609 1.1 abhinav
610 1.1 abhinav /* Step 5a */
611 1.1 abhinav if (z[0] == 'e') {
612 1.1 abhinav if (m_gt_1(z + 1)) {
613 1.1 abhinav z++;
614 1.1 abhinav } else if (m_eq_1(z + 1) && !star_oh(z + 1)) {
615 1.1 abhinav z++;
616 1.1 abhinav }
617 1.1 abhinav }
618 1.1 abhinav /* Step 5b */
619 1.1 abhinav if (m_gt_1(z) && z[0] == 'l' && z[1] == 'l') {
620 1.1 abhinav z++;
621 1.1 abhinav }
622 1.1 abhinav /* z[] is now the stemmed word in reverse order. Flip it back
623 1.1 abhinav * around into forward order and return.
624 1.1 abhinav */
625 1.1 abhinav *pnOut = i = strlen(z);
626 1.1 abhinav zOut[i] = 0;
627 1.1 abhinav while (*z) {
628 1.1 abhinav zOut[--i] = *(z++);
629 1.1 abhinav }
630 1.1 abhinav }
631 1.1 abhinav
632 1.1 abhinav /*
633 1.1 abhinav * Based on whether the input word is in the nostem list or not
634 1.1 abhinav * call porter stemmer to stem it, or call copy_stemmer to keep it
635 1.1 abhinav * as it is (copy_stemmer converts simply converts it to lower case)
636 1.1 abhinav * Returns SQLITE_OK if stemming is successful, an error code for
637 1.1 abhinav * any errors
638 1.1 abhinav */
639 1.1 abhinav static int
640 1.1 abhinav do_stem(const char *zIn, size_t nIn, char *zOut, size_t *pnOut)
641 1.1 abhinav {
642 1.1 abhinav /* Before looking up the word in the hash table, convert it to lower-case */
643 1.1 abhinav char *dupword = malloc(nIn);
644 1.1 abhinav if (dupword == NULL)
645 1.1 abhinav return SQLITE_NOMEM;
646 1.1 abhinav
647 1.1 abhinav for (size_t i = 0; i < nIn; i++)
648 1.1 abhinav dupword[i] = tolower((unsigned char) zIn[i]);
649 1.1 abhinav
650 1.1 abhinav size_t idx = nostem_hash(dupword, nIn);
651 1.1 abhinav if (strncmp(nostem[idx], dupword, nIn) == 0 && nostem[idx][nIn] == 0)
652 1.1 abhinav copy_stemmer(zIn, nIn, zOut, pnOut);
653 1.1 abhinav else
654 1.1 abhinav porter_stemmer(zIn, nIn, zOut, pnOut);
655 1.1 abhinav
656 1.1 abhinav free(dupword);
657 1.1 abhinav return SQLITE_OK;
658 1.1 abhinav }
659 1.1 abhinav
660 1.1 abhinav
/*
 * Token-character map for the ASCII range 0x30..0x7F, indexed by
 * (byte - 0x30): 1 means the byte can be part of a token, 0 means it is
 * a delimiter.  Digits and ASCII letters are token characters; every
 * other byte in this range is a delimiter.
 *
 * Bytes below 0x30 are always delimiters, and bytes with the high bit
 * set (0x80 and above, i.e. parts of multi-byte UTF-8 sequences) are
 * always token characters; neither group needs a table entry.
 */
static const char porterIdChar[] = {
	/* 0x30 - 0x3F: '0'..'9', then ':' .. '?' */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
	/* 0x40 - 0x4F: '@', then 'A'..'O' */
	0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x50 - 0x5F: 'P'..'Z', then '[' .. '_' */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
	/* 0x60 - 0x6F: '`', then 'a'..'o' */
	0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x70 - 0x7F: 'p'..'z', then '{' .. DEL */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
};

/*
 * True if byte C is a token delimiter.  The macro stores C in a variable
 * named "ch", which must be declared in the scope where it is used.
 */
#define isDelim(C) \
	(((ch = C) & 0x80) == 0 && (ch < 0x30 || !porterIdChar[ch - 0x30]))
677 1.1 abhinav
678 1.1 abhinav /*
679 1.1 abhinav * Extract the next token from a tokenization cursor. The cursor must
680 1.1 abhinav * have been opened by a prior call to aproposPorterOpen().
681 1.1 abhinav */
682 1.1 abhinav static int
683 1.1 abhinav aproposPorterNext(
684 1.1 abhinav sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by aproposPorterOpen */
685 1.1 abhinav const char **pzToken, /* OUT: *pzToken is the token text */
686 1.1 abhinav int *pnBytes, /* OUT: Number of bytes in token */
687 1.1 abhinav int *piStartOffset, /* OUT: Starting offset of token */
688 1.1 abhinav int *piEndOffset, /* OUT: Ending offset of token */
689 1.1 abhinav int *piPosition /* OUT: Position integer of token */
690 1.1 abhinav )
691 1.1 abhinav {
692 1.1 abhinav custom_apropos_tokenizer_cursor *c = (custom_apropos_tokenizer_cursor *) pCursor;
693 1.1 abhinav const char *z = c->zInput;
694 1.1 abhinav
695 1.1 abhinav while (c->iOffset < c->nInput) {
696 1.1 abhinav size_t iStartOffset, ch;
697 1.1 abhinav
698 1.1 abhinav /* Scan past delimiter characters */
699 1.1 abhinav while (c->iOffset < c->nInput && isDelim(z[c->iOffset])) {
700 1.1 abhinav c->iOffset++;
701 1.1 abhinav }
702 1.1 abhinav
703 1.1 abhinav /* Count non-delimiter characters. */
704 1.1 abhinav iStartOffset = c->iOffset;
705 1.1 abhinav while (c->iOffset < c->nInput && !isDelim(z[c->iOffset])) {
706 1.1 abhinav c->iOffset++;
707 1.1 abhinav }
708 1.1 abhinav
709 1.1 abhinav if (c->iOffset > iStartOffset) {
710 1.1 abhinav size_t n = c->iOffset - iStartOffset;
711 1.1 abhinav if (n > c->nAllocated) {
712 1.1 abhinav char *pNew;
713 1.1 abhinav c->nAllocated = n + 20;
714 1.1 abhinav pNew = realloc(c->zToken, c->nAllocated);
715 1.1 abhinav if (!pNew)
716 1.1 abhinav return SQLITE_NOMEM;
717 1.1 abhinav c->zToken = pNew;
718 1.1 abhinav }
719 1.2 abhinav
720 1.2 abhinav size_t temp;
721 1.2 abhinav int stemStatus = do_stem(&z[iStartOffset], n, c->zToken, &temp);
722 1.2 abhinav *pnBytes = temp;
723 1.1 abhinav if (stemStatus != SQLITE_OK)
724 1.1 abhinav return stemStatus;
725 1.1 abhinav
726 1.1 abhinav *pzToken = c->zToken;
727 1.1 abhinav *piStartOffset = iStartOffset;
728 1.1 abhinav *piEndOffset = c->iOffset;
729 1.1 abhinav *piPosition = c->iToken++;
730 1.1 abhinav return SQLITE_OK;
731 1.1 abhinav }
732 1.1 abhinav }
733 1.1 abhinav return SQLITE_DONE;
734 1.1 abhinav }
735 1.1 abhinav
736 1.1 abhinav /*
737 1.1 abhinav * The set of routines that implement the porter-stemmer tokenizer
738 1.1 abhinav */
739 1.1 abhinav static const sqlite3_tokenizer_module aproposPorterTokenizerModule = {
740 1.1 abhinav 0,
741 1.1 abhinav aproposPorterCreate,
742 1.1 abhinav aproposPorterDestroy,
743 1.1 abhinav aproposPorterOpen,
744 1.1 abhinav aproposPorterClose,
745 1.1 abhinav aproposPorterNext,
746 1.1 abhinav 0
747 1.1 abhinav };
748 1.1 abhinav
749 1.1 abhinav /*
750 1.1 abhinav * Allocate a new porter tokenizer. Return a pointer to the new
751 1.1 abhinav * tokenizer in *ppModule
752 1.1 abhinav */
753 1.1 abhinav void
754 1.1 abhinav get_custom_apropos_tokenizer(sqlite3_tokenizer_module const ** ppModule)
755 1.1 abhinav {
756 1.1 abhinav *ppModule = &aproposPorterTokenizerModule;
757 1.1 abhinav }
758