1
2/*
3 * UTF-8
4 */
5
6/* Specification: RFC 2279 */
7
8static int
9utf8_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)
10{
11  unsigned char c = s[0];
12
13  if (c < 0x80) {
14    *pwc = c;
15    return 1;
16  } else if (c < 0xc2) {
17    return RET_ILSEQ;
18  } else if (c < 0xe0) {
19    if (n < 2)
20      return RET_TOOFEW(0);
21    if (!((s[1] ^ 0x80) < 0x40))
22      return RET_ILSEQ;
23    *pwc = ((ucs4_t) (c & 0x1f) << 6)
24           | (ucs4_t) (s[1] ^ 0x80);
25    return 2;
26  } else if (c < 0xf0) {
27    if (n < 3)
28      return RET_TOOFEW(0);
29    if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
30          && (c >= 0xe1 || s[1] >= 0xa0)))
31      return RET_ILSEQ;
32    *pwc = ((ucs4_t) (c & 0x0f) << 12)
33           | ((ucs4_t) (s[1] ^ 0x80) << 6)
34           | (ucs4_t) (s[2] ^ 0x80);
35    return 3;
36  } else if (c < 0xf8) {
37    if (n < 4)
38      return RET_TOOFEW(0);
39    if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
40          && (s[3] ^ 0x80) < 0x40
41          && (c >= 0xf1 || s[1] >= 0x90)))
42      return RET_ILSEQ;
43    *pwc = ((ucs4_t) (c & 0x07) << 18)
44           | ((ucs4_t) (s[1] ^ 0x80) << 12)
45           | ((ucs4_t) (s[2] ^ 0x80) << 6)
46           | (ucs4_t) (s[3] ^ 0x80);
47    return 4;
48  } else if (c < 0xfc) {
49    if (n < 5)
50      return RET_TOOFEW(0);
51    if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
52          && (s[3] ^ 0x80) < 0x40 && (s[4] ^ 0x80) < 0x40
53          && (c >= 0xf9 || s[1] >= 0x88)))
54      return RET_ILSEQ;
55    *pwc = ((ucs4_t) (c & 0x03) << 24)
56           | ((ucs4_t) (s[1] ^ 0x80) << 18)
57           | ((ucs4_t) (s[2] ^ 0x80) << 12)
58           | ((ucs4_t) (s[3] ^ 0x80) << 6)
59           | (ucs4_t) (s[4] ^ 0x80);
60    return 5;
61  } else if (c < 0xfe) {
62    if (n < 6)
63      return RET_TOOFEW(0);
64    if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
65          && (s[3] ^ 0x80) < 0x40 && (s[4] ^ 0x80) < 0x40
66          && (s[5] ^ 0x80) < 0x40
67          && (c >= 0xfd || s[1] >= 0x84)))
68      return RET_ILSEQ;
69    *pwc = ((ucs4_t) (c & 0x01) << 30)
70           | ((ucs4_t) (s[1] ^ 0x80) << 24)
71           | ((ucs4_t) (s[2] ^ 0x80) << 18)
72           | ((ucs4_t) (s[3] ^ 0x80) << 12)
73           | ((ucs4_t) (s[4] ^ 0x80) << 6)
74           | (ucs4_t) (s[5] ^ 0x80);
75    return 6;
76  } else
77    return RET_ILSEQ;
78}
79
80static int
81utf8_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n) /* n == 0 is acceptable */
82{
83  int count;
84  if (wc < 0x80)
85    count = 1;
86  else if (wc < 0x800)
87    count = 2;
88  else if (wc < 0x10000)
89    count = 3;
90  else if (wc < 0x200000)
91    count = 4;
92  else if (wc < 0x4000000)
93    count = 5;
94  else if (wc <= 0x7fffffff)
95    count = 6;
96  else
97    return RET_ILSEQ;
98  if (n < count)
99    return RET_TOOSMALL;
100  switch (count) { /* note: code falls through cases! */
101    case 6: r[5] = (unsigned char) (0x80 | (wc & 0x3f)); wc = wc >> 6; wc |= 0x4000000;
102    case 5: r[4] = (unsigned char) (0x80 | (wc & 0x3f)); wc = wc >> 6; wc |= 0x200000;
103    case 4: r[3] = (unsigned char) (0x80 | (wc & 0x3f)); wc = wc >> 6; wc |= 0x10000;
104    case 3: r[2] = (unsigned char) (0x80 | (wc & 0x3f)); wc = wc >> 6; wc |= 0x800;
105    case 2: r[1] = (unsigned char) (0x80 | (wc & 0x3f)); wc = wc >> 6; wc |= 0xc0;
106    case 1: r[0] = (unsigned char) wc;
107  }
108  return count;
109}
110