utf8.h revision 1ab64890
1/* $XFree86: xc/lib/X11/lcUniConv/utf8.h,v 1.2 2000/11/28 16:10:32 dawes Exp $ */ 2 3/* 4 * UTF-8 5 */ 6 7/* Specification: RFC 2279 */ 8 9static int 10utf8_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n) 11{ 12 unsigned char c = s[0]; 13 14 if (c < 0x80) { 15 *pwc = c; 16 return 1; 17 } else if (c < 0xc2) { 18 return RET_ILSEQ; 19 } else if (c < 0xe0) { 20 if (n < 2) 21 return RET_TOOFEW(0); 22 if (!((s[1] ^ 0x80) < 0x40)) 23 return RET_ILSEQ; 24 *pwc = ((ucs4_t) (c & 0x1f) << 6) 25 | (ucs4_t) (s[1] ^ 0x80); 26 return 2; 27 } else if (c < 0xf0) { 28 if (n < 3) 29 return RET_TOOFEW(0); 30 if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 31 && (c >= 0xe1 || s[1] >= 0xa0))) 32 return RET_ILSEQ; 33 *pwc = ((ucs4_t) (c & 0x0f) << 12) 34 | ((ucs4_t) (s[1] ^ 0x80) << 6) 35 | (ucs4_t) (s[2] ^ 0x80); 36 return 3; 37 } else if (c < 0xf8) { 38 if (n < 4) 39 return RET_TOOFEW(0); 40 if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 41 && (s[3] ^ 0x80) < 0x40 42 && (c >= 0xf1 || s[1] >= 0x90))) 43 return RET_ILSEQ; 44 *pwc = ((ucs4_t) (c & 0x07) << 18) 45 | ((ucs4_t) (s[1] ^ 0x80) << 12) 46 | ((ucs4_t) (s[2] ^ 0x80) << 6) 47 | (ucs4_t) (s[3] ^ 0x80); 48 return 4; 49 } else if (c < 0xfc) { 50 if (n < 5) 51 return RET_TOOFEW(0); 52 if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 53 && (s[3] ^ 0x80) < 0x40 && (s[4] ^ 0x80) < 0x40 54 && (c >= 0xf9 || s[1] >= 0x88))) 55 return RET_ILSEQ; 56 *pwc = ((ucs4_t) (c & 0x03) << 24) 57 | ((ucs4_t) (s[1] ^ 0x80) << 18) 58 | ((ucs4_t) (s[2] ^ 0x80) << 12) 59 | ((ucs4_t) (s[3] ^ 0x80) << 6) 60 | (ucs4_t) (s[4] ^ 0x80); 61 return 5; 62 } else if (c < 0xfe) { 63 if (n < 6) 64 return RET_TOOFEW(0); 65 if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 66 && (s[3] ^ 0x80) < 0x40 && (s[4] ^ 0x80) < 0x40 67 && (s[5] ^ 0x80) < 0x40 68 && (c >= 0xfd || s[1] >= 0x84))) 69 return RET_ILSEQ; 70 *pwc = ((ucs4_t) (c & 0x01) << 30) 71 | ((ucs4_t) (s[1] ^ 0x80) << 24) 72 | ((ucs4_t) (s[2] ^ 0x80) << 18) 73 | ((ucs4_t) (s[3] ^ 0x80) << 12) 74 | ((ucs4_t) (s[4] ^ 0x80) << 6) 75 | (ucs4_t) (s[5] ^ 0x80); 76 return 6; 77 } else 78 return RET_ILSEQ; 79} 80 81static int 82utf8_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n) /* n == 0 is acceptable */ 83{ 84 int count; 85 if (wc < 0x80) 86 count = 1; 87 else if (wc < 0x800) 88 count = 2; 89 else if (wc < 0x10000) 90 count = 3; 91 else if (wc < 0x200000) 92 count = 4; 93 else if (wc < 0x4000000) 94 count = 5; 95 else if (wc <= 0x7fffffff) 96 count = 6; 97 else 98 return RET_ILSEQ; 99 if (n < count) 100 return RET_TOOSMALL; 101 switch (count) { /* note: code falls through cases! */ 102 case 6: r[5] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x4000000; 103 case 5: r[4] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x200000; 104 case 4: r[3] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x10000; 105 case 3: r[2] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x800; 106 case 2: r[1] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0xc0; 107 case 1: r[0] = wc; 108 } 109 return count; 110} 111