Home | History | Annotate | Line # | Download | only in makemandb
/*
** 2006 July 10
**
** The author disclaims copyright to this source code.
**
*************************************************************************
** Defines the interface to tokenizers used by fulltext-search.  There
** are three basic components:
**
** sqlite3_tokenizer_module is a singleton defining the tokenizer
** interface functions.  This is essentially the class structure for
** tokenizers.
**
** sqlite3_tokenizer is used to define a particular tokenizer, perhaps
** including customization information defined at creation time.
**
** sqlite3_tokenizer_cursor is created by a tokenizer to produce
** tokens from a particular input.
*/
     20  1.1  abhinav #ifndef _FTS3_TOKENIZER_H_
     21  1.1  abhinav #define _FTS3_TOKENIZER_H_
     22  1.1  abhinav 
/* TODO(shess) sqlite3.h is only used for SQLITE_OK and SQLITE_DONE at
** this time. If tokenizers are to be allowed to call sqlite3_*() functions,
** then we will need a way to register the API consistently.
*/
     27  1.1  abhinav #include "sqlite3.h"
     28  1.1  abhinav 
/*
** Structures used by the tokenizer interface. When a new tokenizer
** implementation is registered, the caller provides a pointer to
** an sqlite3_tokenizer_module containing pointers to the callback
** functions that make up an implementation.
**
** When an fts3 table is created, any arguments given in the tokenizer
** clause of the CREATE VIRTUAL TABLE statement are passed to the
** sqlite3_tokenizer_module.xCreate() function of the requested tokenizer
** implementation. The xCreate() function in turn returns an
** sqlite3_tokenizer structure representing the specific tokenizer to
** be used for the fts3 table (customized by the tokenizer clause arguments).
**
** To tokenize an input buffer, the sqlite3_tokenizer_module.xOpen()
** method is called. It returns an sqlite3_tokenizer_cursor object
** that may be used to tokenize a specific input buffer based on
** the tokenization rules supplied by a specific sqlite3_tokenizer
** object.
*/
     48  1.1  abhinav typedef struct sqlite3_tokenizer_module sqlite3_tokenizer_module;
     49  1.1  abhinav typedef struct sqlite3_tokenizer sqlite3_tokenizer;
     50  1.1  abhinav typedef struct sqlite3_tokenizer_cursor sqlite3_tokenizer_cursor;
     51  1.1  abhinav 
     52  1.1  abhinav struct sqlite3_tokenizer_module {
     53  1.1  abhinav 
     54  1.1  abhinav   /*
     55  1.1  abhinav   ** Structure version. Should always be set to 0 or 1.
     56  1.1  abhinav   */
     57  1.1  abhinav   int iVersion;
     58  1.1  abhinav 
     59  1.1  abhinav   /*
     60  1.1  abhinav   ** Create a new tokenizer. The values in the argv[] array are the
     61  1.1  abhinav   ** arguments passed to the "tokenizer" clause of the CREATE VIRTUAL
     62  1.1  abhinav   ** TABLE statement that created the fts3 table. For example, if
     63  1.1  abhinav   ** the following SQL is executed:
     64  1.1  abhinav   **
     65  1.1  abhinav   **   CREATE .. USING fts3( ... , tokenizer <tokenizer-name> arg1 arg2)
     66  1.1  abhinav   **
     67  1.1  abhinav   ** then argc is set to 2, and the argv[] array contains pointers
     68  1.1  abhinav   ** to the strings "arg1" and "arg2".
     69  1.1  abhinav   **
     70  1.2      rin   ** This method should return either SQLITE_OK (0), or an SQLite error
     71  1.1  abhinav   ** code. If SQLITE_OK is returned, then *ppTokenizer should be set
     72  1.1  abhinav   ** to point at the newly created tokenizer structure. The generic
     73  1.1  abhinav   ** sqlite3_tokenizer.pModule variable should not be initialized by
     74  1.1  abhinav   ** this callback. The caller will do so.
     75  1.1  abhinav   */
     76  1.1  abhinav   int (*xCreate)(
     77  1.1  abhinav     int argc,                           /* Size of argv array */
     78  1.1  abhinav     const char *const*argv,             /* Tokenizer argument strings */
     79  1.1  abhinav     sqlite3_tokenizer **ppTokenizer     /* OUT: Created tokenizer */
     80  1.1  abhinav   );
     81  1.1  abhinav 
     82  1.1  abhinav   /*
     83  1.1  abhinav   ** Destroy an existing tokenizer. The fts3 module calls this method
     84  1.1  abhinav   ** exactly once for each successful call to xCreate().
     85  1.1  abhinav   */
     86  1.1  abhinav   int (*xDestroy)(sqlite3_tokenizer *pTokenizer);
     87  1.1  abhinav 
     88  1.1  abhinav   /*
     89  1.1  abhinav   ** Create a tokenizer cursor to tokenize an input buffer. The caller
     90  1.1  abhinav   ** is responsible for ensuring that the input buffer remains valid
     91  1.2      rin   ** until the cursor is closed (using the xClose() method).
     92  1.1  abhinav   */
     93  1.1  abhinav   int (*xOpen)(
     94  1.1  abhinav     sqlite3_tokenizer *pTokenizer,       /* Tokenizer object */
     95  1.1  abhinav     const char *pInput, int nBytes,      /* Input buffer */
     96  1.1  abhinav     sqlite3_tokenizer_cursor **ppCursor  /* OUT: Created tokenizer cursor */
     97  1.1  abhinav   );
     98  1.1  abhinav 
     99  1.1  abhinav   /*
    100  1.2      rin   ** Destroy an existing tokenizer cursor. The fts3 module calls this
    101  1.1  abhinav   ** method exactly once for each successful call to xOpen().
    102  1.1  abhinav   */
    103  1.1  abhinav   int (*xClose)(sqlite3_tokenizer_cursor *pCursor);
    104  1.1  abhinav 
    105  1.1  abhinav   /*
    106  1.1  abhinav   ** Retrieve the next token from the tokenizer cursor pCursor. This
    107  1.1  abhinav   ** method should either return SQLITE_OK and set the values of the
    108  1.1  abhinav   ** "OUT" variables identified below, or SQLITE_DONE to indicate that
    109  1.1  abhinav   ** the end of the buffer has been reached, or an SQLite error code.
    110  1.1  abhinav   **
    111  1.2      rin   ** *ppToken should be set to point at a buffer containing the
    112  1.1  abhinav   ** normalized version of the token (i.e. after any case-folding and/or
    113  1.1  abhinav   ** stemming has been performed). *pnBytes should be set to the length
    114  1.1  abhinav   ** of this buffer in bytes. The input text that generated the token is
    115  1.1  abhinav   ** identified by the byte offsets returned in *piStartOffset and
    116  1.1  abhinav   ** *piEndOffset. *piStartOffset should be set to the index of the first
    117  1.1  abhinav   ** byte of the token in the input buffer. *piEndOffset should be set
    118  1.1  abhinav   ** to the index of the first byte just past the end of the token in
    119  1.1  abhinav   ** the input buffer.
    120  1.1  abhinav   **
    121  1.1  abhinav   ** The buffer *ppToken is set to point at is managed by the tokenizer
    122  1.1  abhinav   ** implementation. It is only required to be valid until the next call
    123  1.2      rin   ** to xNext() or xClose().
    124  1.1  abhinav   */
    125  1.1  abhinav   /* TODO(shess) current implementation requires pInput to be
    126  1.1  abhinav   ** nul-terminated.  This should either be fixed, or pInput/nBytes
    127  1.1  abhinav   ** should be converted to zInput.
    128  1.1  abhinav   */
    129  1.1  abhinav   int (*xNext)(
    130  1.1  abhinav     sqlite3_tokenizer_cursor *pCursor,   /* Tokenizer cursor */
    131  1.1  abhinav     const char **ppToken, int *pnBytes,  /* OUT: Normalized text for token */
    132  1.1  abhinav     int *piStartOffset,  /* OUT: Byte offset of token in input buffer */
    133  1.1  abhinav     int *piEndOffset,    /* OUT: Byte offset of end of token in input buffer */
    134  1.1  abhinav     int *piPosition      /* OUT: Number of tokens returned before this one */
    135  1.1  abhinav   );
    136  1.1  abhinav 
    137  1.1  abhinav   /***********************************************************************
    138  1.1  abhinav   ** Methods below this point are only available if iVersion>=1.
    139  1.1  abhinav   */
    140  1.1  abhinav 
    141  1.2      rin   /*
    142  1.1  abhinav   ** Configure the language id of a tokenizer cursor.
    143  1.1  abhinav   */
    144  1.1  abhinav   int (*xLanguageid)(sqlite3_tokenizer_cursor *pCsr, int iLangid);
    145  1.1  abhinav };
    146  1.1  abhinav 
    147  1.1  abhinav struct sqlite3_tokenizer {
    148  1.1  abhinav   const sqlite3_tokenizer_module *pModule;  /* The module for this tokenizer */
    149  1.1  abhinav   /* Tokenizer implementations will typically add additional fields */
    150  1.1  abhinav };
    151  1.1  abhinav 
    152  1.1  abhinav struct sqlite3_tokenizer_cursor {
    153  1.1  abhinav   sqlite3_tokenizer *pTokenizer;       /* Tokenizer for this cursor. */
    154  1.1  abhinav   /* Tokenizer implementations will typically add additional fields */
    155  1.1  abhinav };
    156  1.1  abhinav 
    157  1.1  abhinav int fts3_global_term_cnt(int iTerm, int iCol);
    158  1.1  abhinav int fts3_term_cnt(int iTerm, int iCol);
    159  1.1  abhinav 
    160  1.1  abhinav 
    161  1.1  abhinav #endif /* _FTS3_TOKENIZER_H_ */
    162