utf8.h revision 1ab64890
11ab64890Smrg/* $XFree86: xc/lib/X11/lcUniConv/utf8.h,v 1.2 2000/11/28 16:10:32 dawes Exp $ */ 21ab64890Smrg 31ab64890Smrg/* 41ab64890Smrg * UTF-8 51ab64890Smrg */ 61ab64890Smrg 71ab64890Smrg/* Specification: RFC 2279 */ 81ab64890Smrg 91ab64890Smrgstatic int 101ab64890Smrgutf8_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n) 111ab64890Smrg{ 121ab64890Smrg unsigned char c = s[0]; 131ab64890Smrg 141ab64890Smrg if (c < 0x80) { 151ab64890Smrg *pwc = c; 161ab64890Smrg return 1; 171ab64890Smrg } else if (c < 0xc2) { 181ab64890Smrg return RET_ILSEQ; 191ab64890Smrg } else if (c < 0xe0) { 201ab64890Smrg if (n < 2) 211ab64890Smrg return RET_TOOFEW(0); 221ab64890Smrg if (!((s[1] ^ 0x80) < 0x40)) 231ab64890Smrg return RET_ILSEQ; 241ab64890Smrg *pwc = ((ucs4_t) (c & 0x1f) << 6) 251ab64890Smrg | (ucs4_t) (s[1] ^ 0x80); 261ab64890Smrg return 2; 271ab64890Smrg } else if (c < 0xf0) { 281ab64890Smrg if (n < 3) 291ab64890Smrg return RET_TOOFEW(0); 301ab64890Smrg if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 311ab64890Smrg && (c >= 0xe1 || s[1] >= 0xa0))) 321ab64890Smrg return RET_ILSEQ; 331ab64890Smrg *pwc = ((ucs4_t) (c & 0x0f) << 12) 341ab64890Smrg | ((ucs4_t) (s[1] ^ 0x80) << 6) 351ab64890Smrg | (ucs4_t) (s[2] ^ 0x80); 361ab64890Smrg return 3; 371ab64890Smrg } else if (c < 0xf8) { 381ab64890Smrg if (n < 4) 391ab64890Smrg return RET_TOOFEW(0); 401ab64890Smrg if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 411ab64890Smrg && (s[3] ^ 0x80) < 0x40 421ab64890Smrg && (c >= 0xf1 || s[1] >= 0x90))) 431ab64890Smrg return RET_ILSEQ; 441ab64890Smrg *pwc = ((ucs4_t) (c & 0x07) << 18) 451ab64890Smrg | ((ucs4_t) (s[1] ^ 0x80) << 12) 461ab64890Smrg | ((ucs4_t) (s[2] ^ 0x80) << 6) 471ab64890Smrg | (ucs4_t) (s[3] ^ 0x80); 481ab64890Smrg return 4; 491ab64890Smrg } else if (c < 0xfc) { 501ab64890Smrg if (n < 5) 511ab64890Smrg return RET_TOOFEW(0); 521ab64890Smrg if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 531ab64890Smrg && (s[3] ^ 0x80) < 0x40 && (s[4] ^ 0x80) < 0x40 541ab64890Smrg && (c >= 0xf9 || s[1] >= 0x88))) 551ab64890Smrg return RET_ILSEQ; 561ab64890Smrg *pwc = ((ucs4_t) (c & 0x03) << 24) 571ab64890Smrg | ((ucs4_t) (s[1] ^ 0x80) << 18) 581ab64890Smrg | ((ucs4_t) (s[2] ^ 0x80) << 12) 591ab64890Smrg | ((ucs4_t) (s[3] ^ 0x80) << 6) 601ab64890Smrg | (ucs4_t) (s[4] ^ 0x80); 611ab64890Smrg return 5; 621ab64890Smrg } else if (c < 0xfe) { 631ab64890Smrg if (n < 6) 641ab64890Smrg return RET_TOOFEW(0); 651ab64890Smrg if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 661ab64890Smrg && (s[3] ^ 0x80) < 0x40 && (s[4] ^ 0x80) < 0x40 671ab64890Smrg && (s[5] ^ 0x80) < 0x40 681ab64890Smrg && (c >= 0xfd || s[1] >= 0x84))) 691ab64890Smrg return RET_ILSEQ; 701ab64890Smrg *pwc = ((ucs4_t) (c & 0x01) << 30) 711ab64890Smrg | ((ucs4_t) (s[1] ^ 0x80) << 24) 721ab64890Smrg | ((ucs4_t) (s[2] ^ 0x80) << 18) 731ab64890Smrg | ((ucs4_t) (s[3] ^ 0x80) << 12) 741ab64890Smrg | ((ucs4_t) (s[4] ^ 0x80) << 6) 751ab64890Smrg | (ucs4_t) (s[5] ^ 0x80); 761ab64890Smrg return 6; 771ab64890Smrg } else 781ab64890Smrg return RET_ILSEQ; 791ab64890Smrg} 801ab64890Smrg 811ab64890Smrgstatic int 821ab64890Smrgutf8_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n) /* n == 0 is acceptable */ 831ab64890Smrg{ 841ab64890Smrg int count; 851ab64890Smrg if (wc < 0x80) 861ab64890Smrg count = 1; 871ab64890Smrg else if (wc < 0x800) 881ab64890Smrg count = 2; 891ab64890Smrg else if (wc < 0x10000) 901ab64890Smrg count = 3; 911ab64890Smrg else if (wc < 0x200000) 921ab64890Smrg count = 4; 931ab64890Smrg else if (wc < 0x4000000) 941ab64890Smrg count = 5; 951ab64890Smrg else if (wc <= 0x7fffffff) 961ab64890Smrg count = 6; 971ab64890Smrg else 981ab64890Smrg return RET_ILSEQ; 991ab64890Smrg if (n < count) 1001ab64890Smrg return RET_TOOSMALL; 1011ab64890Smrg switch (count) { /* note: code falls through cases! */ 1021ab64890Smrg case 6: r[5] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x4000000; 1031ab64890Smrg case 5: r[4] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x200000; 1041ab64890Smrg case 4: r[3] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x10000; 1051ab64890Smrg case 3: r[2] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x800; 1061ab64890Smrg case 2: r[1] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0xc0; 1071ab64890Smrg case 1: r[0] = wc; 1081ab64890Smrg } 1091ab64890Smrg return count; 1101ab64890Smrg} 111