utf8.h revision 1ab64890
1/* $XFree86: xc/lib/X11/lcUniConv/utf8.h,v 1.2 2000/11/28 16:10:32 dawes Exp $ */
2
3/*
4 * UTF-8
5 */
6
7/* Specification: RFC 2279 */
8
9static int
10utf8_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)
11{
12  unsigned char c = s[0];
13
14  if (c < 0x80) {
15    *pwc = c;
16    return 1;
17  } else if (c < 0xc2) {
18    return RET_ILSEQ;
19  } else if (c < 0xe0) {
20    if (n < 2)
21      return RET_TOOFEW(0);
22    if (!((s[1] ^ 0x80) < 0x40))
23      return RET_ILSEQ;
24    *pwc = ((ucs4_t) (c & 0x1f) << 6)
25           | (ucs4_t) (s[1] ^ 0x80);
26    return 2;
27  } else if (c < 0xf0) {
28    if (n < 3)
29      return RET_TOOFEW(0);
30    if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
31          && (c >= 0xe1 || s[1] >= 0xa0)))
32      return RET_ILSEQ;
33    *pwc = ((ucs4_t) (c & 0x0f) << 12)
34           | ((ucs4_t) (s[1] ^ 0x80) << 6)
35           | (ucs4_t) (s[2] ^ 0x80);
36    return 3;
37  } else if (c < 0xf8) {
38    if (n < 4)
39      return RET_TOOFEW(0);
40    if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
41          && (s[3] ^ 0x80) < 0x40
42          && (c >= 0xf1 || s[1] >= 0x90)))
43      return RET_ILSEQ;
44    *pwc = ((ucs4_t) (c & 0x07) << 18)
45           | ((ucs4_t) (s[1] ^ 0x80) << 12)
46           | ((ucs4_t) (s[2] ^ 0x80) << 6)
47           | (ucs4_t) (s[3] ^ 0x80);
48    return 4;
49  } else if (c < 0xfc) {
50    if (n < 5)
51      return RET_TOOFEW(0);
52    if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
53          && (s[3] ^ 0x80) < 0x40 && (s[4] ^ 0x80) < 0x40
54          && (c >= 0xf9 || s[1] >= 0x88)))
55      return RET_ILSEQ;
56    *pwc = ((ucs4_t) (c & 0x03) << 24)
57           | ((ucs4_t) (s[1] ^ 0x80) << 18)
58           | ((ucs4_t) (s[2] ^ 0x80) << 12)
59           | ((ucs4_t) (s[3] ^ 0x80) << 6)
60           | (ucs4_t) (s[4] ^ 0x80);
61    return 5;
62  } else if (c < 0xfe) {
63    if (n < 6)
64      return RET_TOOFEW(0);
65    if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
66          && (s[3] ^ 0x80) < 0x40 && (s[4] ^ 0x80) < 0x40
67          && (s[5] ^ 0x80) < 0x40
68          && (c >= 0xfd || s[1] >= 0x84)))
69      return RET_ILSEQ;
70    *pwc = ((ucs4_t) (c & 0x01) << 30)
71           | ((ucs4_t) (s[1] ^ 0x80) << 24)
72           | ((ucs4_t) (s[2] ^ 0x80) << 18)
73           | ((ucs4_t) (s[3] ^ 0x80) << 12)
74           | ((ucs4_t) (s[4] ^ 0x80) << 6)
75           | (ucs4_t) (s[5] ^ 0x80);
76    return 6;
77  } else
78    return RET_ILSEQ;
79}
80
81static int
82utf8_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n) /* n == 0 is acceptable */
83{
84  int count;
85  if (wc < 0x80)
86    count = 1;
87  else if (wc < 0x800)
88    count = 2;
89  else if (wc < 0x10000)
90    count = 3;
91  else if (wc < 0x200000)
92    count = 4;
93  else if (wc < 0x4000000)
94    count = 5;
95  else if (wc <= 0x7fffffff)
96    count = 6;
97  else
98    return RET_ILSEQ;
99  if (n < count)
100    return RET_TOOSMALL;
101  switch (count) { /* note: code falls through cases! */
102    case 6: r[5] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x4000000;
103    case 5: r[4] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x200000;
104    case 4: r[3] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x10000;
105    case 3: r[2] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x800;
106    case 2: r[1] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0xc0;
107    case 1: r[0] = wc;
108  }
109  return count;
110}
111