Home | History | Annotate | Line # | Download | only in ucdata
      1 #
      2 # Id: api.txt,v 1.3 2001/01/02 18:46:20 mleisher Exp 
      3 #
      4 
      5                              The MUTT UCData API
      6                              -------------------
      7 
      8 
      9 ####
     10 NOTE: This library has been customized for use with OpenLDAP. The character
     11 data tables are hardcoded into the library and the load/unload/reload
     12 functions are no-ops. Also, the MUTT API claimed to be compatible with
     13 John Cowan's library but its ucnumber behavior was broken. This has been
     14 fixed in the OpenLDAP release.
     15 
     16 By default, the implementation specific properties in MUTTUCData.txt are
     17 not incorporated into the OpenLDAP build. You can supply them to ucgendat
     18 and recreate uctable.h if you need them.
     19   -- hyc@openldap.org
     20 ####
     21 
     22 
     23 -----------------------------------------------------------------------------
     24 
     25 Macros that combine to select data tables for ucdata_load(), ucdata_unload(),
     26 and ucdata_reload().
     27 
     28 #define UCDATA_CASE   0x01
     29 #define UCDATA_CTYPE  0x02
     30 #define UCDATA_DECOMP 0x04
     31 #define UCDATA_CMBCL  0x08
     32 #define UCDATA_NUM    0x10
     33 #define UCDATA_COMP   0x20
     34 #define UCDATA_ALL (UCDATA_CASE|UCDATA_CTYPE|UCDATA_DECOMP|\
     35                     UCDATA_CMBCL|UCDATA_NUM|UCDATA_COMP)
     36 -----------------------------------------------------------------------------
     37 
     38 void ucdata_load(char *paths, int masks)
     39 
     40   This function initializes the UCData library by locating the data files in
     41   one of the colon-separated directories in the `paths' parameter.  The data
     42   files to be loaded are specified in the `masks' parameter as a bitwise
     43   combination of the macros listed above.
     44 
     45   This should be called before using any of the other functions.
     46 
     47   NOTE: the ucdata_setup(char *paths) function is now a macro that expands
     48         into this function at compile time.
     49 
     50 -----------------------------------------------------------------------------
     51 
     52 void ucdata_unload(int masks)
     53 
     54   This function unloads the data tables specified in the `masks' parameter.
     55 
     56   This function should be called when the application is done using the UCData
     57   package.
     58 
     59   NOTE: the ucdata_cleanup() function is now a macro that expands into this
     60         function at compile time.
     61 
     62 -----------------------------------------------------------------------------
     63 
     64 void ucdata_reload(char *paths, int masks)
     65 
     66   This function reloads the data files from one of the colon-separated
     67   directories in the `paths' parameter.  The data files to be reloaded are
     68   specified in the `masks' parameter as a bitwise combination of the macros
     69   listed above.
     70 
     71   If the data files have already been loaded, they are unloaded before the
     72   data files are loaded again.
     73 
     74 -----------------------------------------------------------------------------
     75 
     76 int ucdecomp(unsigned long code, unsigned long *num, unsigned long **decomp)
     77 
     78   This function determines if a character has a decomposition and returns the
     79   decomposition information if it exists.
     80 
     81   If a zero is returned, there is no decomposition.  If a non-zero is
     82   returned, then the `num' and `decomp' variables are filled in with the
     83   appropriate values.
     84 
     85   Example call:
     86 
     87     unsigned long i, num, *decomp;
     88 
     89     if (ucdecomp(0x1d5, &num, &decomp) != 0) {
     90        for (i = 0; i < num; i++)
     91          printf("0x%08lX,", decomp[i]);
     92        putchar('\n');
     93     }
     94 
     95 int uccanondecomp(const unsigned long *in, int inlen, unsigned long **out,
     96                   int *outlen)
     97 
     98   This function decomposes an input string and does canonical reordering of
     99   the characters at the same time.
    100 
    101   If a -1 is returned, memory allocation was not successful.  If a zero is
    102   returned, no decomposition occurred.  Any other value means the output string
    103   contains the fully decomposed string in canonical order.
    104 
    105   If the "outlen" parameter comes back with a value > 0, then the string
    106   returned in the "out" parameter needs to be deallocated by the caller. 
    107 
    108 -----------------------------------------------------------------------------
    109 
    110 int ucdecomp_hangul(unsigned long code, unsigned long *num,
    111                     unsigned long decomp[])
    112 
    113   This function determines if a Hangul syllable has a decomposition and
    114   returns the decomposition information.
    115 
    116   An array of at least size 3 should be passed to the function for the
    117   decomposition of the syllable.
    118 
    119   If a zero is returned, the character is not a Hangul syllable.  If a
    120   non-zero is returned, the `num' field will be 2 or 3 and the syllable will
    121   be decomposed into the `decomp' array arithmetically.
    122 
    123   Example call:
    124 
    125     unsigned long i, num, decomp[3];
    126 
    127     if (ucdecomp_hangul(0xb1ba, &num, decomp) != 0) {
    128        for (i = 0; i < num; i++)
    129          printf("0x%08lX,", decomp[i]);
    130        putchar('\n');
    131     }
    132 
    133 -----------------------------------------------------------------------------
    134 
    135 int uccomp(unsigned long ch1, unsigned long ch2, unsigned long *comp)
    136 
    137   This function takes a pair of characters and determines if they combine to
    138   form another character.
    139 
    140   If a zero is returned, no composition is formed by the character pair.  Any
    141   other value indicates the "comp" parameter has a value.
    142 
    143 int uccomp_hangul(unsigned long *str, int len)
    144 
    145   This function composes the Hangul Jamo in the string.  The composition is
    146   done in-place.
    147 
    148   The return value provides the new length of the string.  This will be
    149   smaller than "len" if compositions occurred.
    150 
    151 int uccanoncomp(unsigned long *str, int len)
    152 
    153   This function does a canonical composition of characters in the string.
    154 
    155   The return value is the new length of the string.
    156 
    157 -----------------------------------------------------------------------------
    158 
    159 struct ucnumber {
    160   int numerator;
    161   int denominator;
    162 };
    163 
    164 int ucnumber_lookup(unsigned long code, struct ucnumber *num)
    165 
    166   This function determines if the code is a number and fills in the `num'
    167   field with the numerator and denominator.  If the code happens to be a
    168   single digit, the denominator field will be 1.
    169 
    170 ####
    171 The original code would set numerator = denominator for regular digits.
    172 However, the Readme also claimed to be compatible with John Cowan's uctype
    173 library, but this behavior is both nonsensical and incompatible with the
    174 Cowan library. As such, it has been fixed here as described above.
    175   -- hyc@openldap.org
    176 ####
    177 
    178   If the function returns 0, the code is not a number.  Any other return
    179   value means the code is a number.
    180 
    181 int ucdigit_lookup(unsigned long code, int *digit)
    182 
    183   This function determines if the code is a digit and fills in the `digit'
    184   field with the digit value.
    185 
    186   If the function returns 0, the code is not a digit.  Any other return
    187   value means the code is a digit.
    188 
    189 struct ucnumber ucgetnumber(unsigned long code)
    190 
    191   This is a compatibility function with John Cowan's "uctype" package.  It
    192   uses ucnumber_lookup().
    193 
    194 int ucgetdigit(unsigned long code)
    195 
    196   This is a compatibility function with John Cowan's "uctype" package.  It
    197   uses ucdigit_lookup().
    198 
    199 -----------------------------------------------------------------------------
    200 
    201 unsigned long uctoupper(unsigned long code)
    202 
    203   This function returns the code unchanged if it is already upper case or has
    204   no upper case equivalent.  Otherwise the upper case equivalent is returned.
    205 
    206 -----------------------------------------------------------------------------
    207 
    208 unsigned long uctolower(unsigned long code)
    209 
    210   This function returns the code unchanged if it is already lower case or has
    211   no lower case equivalent.  Otherwise the lower case equivalent is returned.
    212 
    213 -----------------------------------------------------------------------------
    214 
    215 unsigned long uctotitle(unsigned long code)
    216 
    217   This function returns the code unchanged if it is already title case or has
    218   no title case equivalent.  Otherwise the title case equivalent is returned.
    219 
    220 -----------------------------------------------------------------------------
    221 
    222 int ucisalpha(unsigned long code)
    223 int ucisalnum(unsigned long code)
    224 int ucisdigit(unsigned long code)
    225 int uciscntrl(unsigned long code)
    226 int ucisspace(unsigned long code)
    227 int ucisblank(unsigned long code)
    228 int ucispunct(unsigned long code)
    229 int ucisgraph(unsigned long code)
    230 int ucisprint(unsigned long code)
    231 int ucisxdigit(unsigned long code)
    232 
    233 int ucisupper(unsigned long code)
    234 int ucislower(unsigned long code)
    235 int ucistitle(unsigned long code)
    236 
    237   These functions (actually macros) determine if a character has these
    238   properties.  These behave in a fashion very similar to the venerable ctype
    239   package.
    240 
    241 -----------------------------------------------------------------------------
    242 
    243 int ucisisocntrl(unsigned long code)
    244 
    245   Is the character a C0 control character (< 32) ?
    246 
    247 int ucisfmtcntrl(unsigned long code)
    248 
    249   Is the character a format control character?
    250 
    251 int ucissymbol(unsigned long code)
    252 
    253   Is the character a symbol?
    254 
    255 int ucisnumber(unsigned long code)
    256 
    257   Is the character a number or digit?
    258 
    259 int ucisnonspacing(unsigned long code)
    260 
    261   Is the character non-spacing?
    262 
    263 int ucisopenpunct(unsigned long code)
    264 
    265   Is the character an open/left punctuation (e.g. '[')?
    266 
    267 int ucisclosepunct(unsigned long code)
    268 
    269   Is the character a close/right punctuation (e.g. ']')?
    270 
    271 int ucisinitialpunct(unsigned long code)
    272 
    273   Is the character an initial punctuation (e.g. U+2018 LEFT SINGLE QUOTATION
    274   MARK)?
    275 
    276 int ucisfinalpunct(unsigned long code)
    277 
    278   Is the character a final punctuation (e.g. U+2019 RIGHT SINGLE QUOTATION
    279   MARK)?
    280 
    281 int uciscomposite(unsigned long code)
    282 
    283   Can the character be decomposed into a set of other characters?
    284 
    285 int ucisquote(unsigned long code)
    286 
    287   Is the character one of the many quotation marks?
    288 
    289 int ucissymmetric(unsigned long code)
    290 
    291   Is the character one that has an opposite form (e.g. <>)?
    292 
    293 int ucismirroring(unsigned long code)
    294 
    295   Is the character mirroring (superset of symmetric)?
    296 
    297 int ucisnonbreaking(unsigned long code)
    298 
    299   Is the character non-breaking (e.g. non-breaking space)?
    300 
    301 int ucisrtl(unsigned long code)
    302 
    303   Does the character have strong right-to-left directionality (e.g. Arabic
    304   letters)?
    305 
    306 int ucisltr(unsigned long code)
    307 
    308   Does the character have strong left-to-right directionality (e.g. Latin
    309   letters)?
    310 
    311 int ucisstrong(unsigned long code)
    312 
    313   Does the character have strong directionality?
    314 
    315 int ucisweak(unsigned long code)
    316 
    317   Does the character have weak directionality (e.g. numbers)?
    318 
    319 int ucisneutral(unsigned long code)
    320 
    321   Does the character have neutral directionality (e.g. whitespace)?
    322 
    323 int ucisseparator(unsigned long code)
    324 
    325   Is the character a block or segment separator?
    326 
    327 int ucislsep(unsigned long code)
    328 
    329   Is the character a line separator?
    330 
    331 int ucispsep(unsigned long code)
    332 
    333   Is the character a paragraph separator?
    334 
    335 int ucismark(unsigned long code)
    336 
    337   Is the character a mark of some kind?
    338 
    339 int ucisnsmark(unsigned long code)
    340 
    341   Is the character a non-spacing mark?
    342 
    343 int ucisspmark(unsigned long code)
    344 
    345   Is the character a spacing mark?
    346 
    347 int ucismodif(unsigned long code)
    348 
    349   Is the character a modifier letter?
    350 
    351 int ucismodifsymbol(unsigned long code)
    352 
    353   Is the character a modifier symbol?
    354 
    355 int ucisletnum(unsigned long code)
    356 
    357   Is the character a number represented by a letter?
    358 
    359 int ucisconnect(unsigned long code)
    360 
    361   Is the character connecting punctuation?
    362 
    363 int ucisdash(unsigned long code)
    364 
    365   Is the character dash punctuation?
    366 
    367 int ucismath(unsigned long code)
    368 
    369   Is the character a math character?
    370 
    371 int uciscurrency(unsigned long code)
    372 
    373   Is the character a currency character?
    374 
    375 int ucisenclosing(unsigned long code)
    376 
    377   Is the character enclosing (e.g. enclosing box)?
    378 
    379 int ucisprivate(unsigned long code)
    380 
    381   Is the character from the Private Use Area?
    382 
    383 int ucissurrogate(unsigned long code)
    384 
    385   Is the character one of the surrogate codes?
    386 
    387 int ucisdefined(unsigned long code)
    388 
    389   Is the character defined (appeared in one of the data files)?
    390 
    391 int ucisundefined(unsigned long code)
    392 
    393   Is the character not defined (non-Unicode)?
    394 
    395 int ucishan(unsigned long code)
    396 
    397   Is the character a Han ideograph?
    398 
    399 int ucishangul(unsigned long code)
    400 
    401   Is the character a pre-composed Hangul syllable?
    402