1 2/* 3 * UTF-8 4 */ 5 6/* Specification: RFC 2279 */ 7 8static int 9utf8_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n) 10{ 11 unsigned char c = s[0]; 12 13 if (c < 0x80) { 14 *pwc = c; 15 return 1; 16 } else if (c < 0xc2) { 17 return RET_ILSEQ; 18 } else if (c < 0xe0) { 19 if (n < 2) 20 return RET_TOOFEW(0); 21 if (!((s[1] ^ 0x80) < 0x40)) 22 return RET_ILSEQ; 23 *pwc = ((ucs4_t) (c & 0x1f) << 6) 24 | (ucs4_t) (s[1] ^ 0x80); 25 return 2; 26 } else if (c < 0xf0) { 27 if (n < 3) 28 return RET_TOOFEW(0); 29 if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 30 && (c >= 0xe1 || s[1] >= 0xa0))) 31 return RET_ILSEQ; 32 *pwc = ((ucs4_t) (c & 0x0f) << 12) 33 | ((ucs4_t) (s[1] ^ 0x80) << 6) 34 | (ucs4_t) (s[2] ^ 0x80); 35 return 3; 36 } else if (c < 0xf8) { 37 if (n < 4) 38 return RET_TOOFEW(0); 39 if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 40 && (s[3] ^ 0x80) < 0x40 41 && (c >= 0xf1 || s[1] >= 0x90))) 42 return RET_ILSEQ; 43 *pwc = ((ucs4_t) (c & 0x07) << 18) 44 | ((ucs4_t) (s[1] ^ 0x80) << 12) 45 | ((ucs4_t) (s[2] ^ 0x80) << 6) 46 | (ucs4_t) (s[3] ^ 0x80); 47 return 4; 48 } else if (c < 0xfc) { 49 if (n < 5) 50 return RET_TOOFEW(0); 51 if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 52 && (s[3] ^ 0x80) < 0x40 && (s[4] ^ 0x80) < 0x40 53 && (c >= 0xf9 || s[1] >= 0x88))) 54 return RET_ILSEQ; 55 *pwc = ((ucs4_t) (c & 0x03) << 24) 56 | ((ucs4_t) (s[1] ^ 0x80) << 18) 57 | ((ucs4_t) (s[2] ^ 0x80) << 12) 58 | ((ucs4_t) (s[3] ^ 0x80) << 6) 59 | (ucs4_t) (s[4] ^ 0x80); 60 return 5; 61 } else if (c < 0xfe) { 62 if (n < 6) 63 return RET_TOOFEW(0); 64 if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 65 && (s[3] ^ 0x80) < 0x40 && (s[4] ^ 0x80) < 0x40 66 && (s[5] ^ 0x80) < 0x40 67 && (c >= 0xfd || s[1] >= 0x84))) 68 return RET_ILSEQ; 69 *pwc = ((ucs4_t) (c & 0x01) << 30) 70 | ((ucs4_t) (s[1] ^ 0x80) << 24) 71 | ((ucs4_t) (s[2] ^ 0x80) << 18) 72 | ((ucs4_t) (s[3] ^ 0x80) << 12) 73 | ((ucs4_t) (s[4] ^ 0x80) << 6) 74 | (ucs4_t) (s[5] ^ 0x80); 75 return 6; 76 } else 77 return RET_ILSEQ; 78} 79 80static int 81utf8_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n) /* n == 0 is acceptable */ 82{ 83 int count; 84 if (wc < 0x80) 85 count = 1; 86 else if (wc < 0x800) 87 count = 2; 88 else if (wc < 0x10000) 89 count = 3; 90 else if (wc < 0x200000) 91 count = 4; 92 else if (wc < 0x4000000) 93 count = 5; 94 else if (wc <= 0x7fffffff) 95 count = 6; 96 else 97 return RET_ILSEQ; 98 if (n < count) 99 return RET_TOOSMALL; 100 switch (count) { /* note: code falls through cases! */ 101 case 6: r[5] = (unsigned char) (0x80 | (wc & 0x3f)); wc = wc >> 6; wc |= 0x4000000; 102 case 5: r[4] = (unsigned char) (0x80 | (wc & 0x3f)); wc = wc >> 6; wc |= 0x200000; 103 case 4: r[3] = (unsigned char) (0x80 | (wc & 0x3f)); wc = wc >> 6; wc |= 0x10000; 104 case 3: r[2] = (unsigned char) (0x80 | (wc & 0x3f)); wc = wc >> 6; wc |= 0x800; 105 case 2: r[1] = (unsigned char) (0x80 | (wc & 0x3f)); wc = wc >> 6; wc |= 0xc0; 106 case 1: r[0] = (unsigned char) wc; 107 } 108 return count; 109} 110