utf8.h revision 1ab64890
/* $XFree86: xc/lib/X11/lcUniConv/utf8.h,v 1.2 2000/11/28 16:10:32 dawes Exp $ */

/*
 * UTF-8
 */

/* Specification: RFC 2279 */
81ab64890Smrg
91ab64890Smrgstatic int
101ab64890Smrgutf8_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)
111ab64890Smrg{
121ab64890Smrg  unsigned char c = s[0];
131ab64890Smrg
141ab64890Smrg  if (c < 0x80) {
151ab64890Smrg    *pwc = c;
161ab64890Smrg    return 1;
171ab64890Smrg  } else if (c < 0xc2) {
181ab64890Smrg    return RET_ILSEQ;
191ab64890Smrg  } else if (c < 0xe0) {
201ab64890Smrg    if (n < 2)
211ab64890Smrg      return RET_TOOFEW(0);
221ab64890Smrg    if (!((s[1] ^ 0x80) < 0x40))
231ab64890Smrg      return RET_ILSEQ;
241ab64890Smrg    *pwc = ((ucs4_t) (c & 0x1f) << 6)
251ab64890Smrg           | (ucs4_t) (s[1] ^ 0x80);
261ab64890Smrg    return 2;
271ab64890Smrg  } else if (c < 0xf0) {
281ab64890Smrg    if (n < 3)
291ab64890Smrg      return RET_TOOFEW(0);
301ab64890Smrg    if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
311ab64890Smrg          && (c >= 0xe1 || s[1] >= 0xa0)))
321ab64890Smrg      return RET_ILSEQ;
331ab64890Smrg    *pwc = ((ucs4_t) (c & 0x0f) << 12)
341ab64890Smrg           | ((ucs4_t) (s[1] ^ 0x80) << 6)
351ab64890Smrg           | (ucs4_t) (s[2] ^ 0x80);
361ab64890Smrg    return 3;
371ab64890Smrg  } else if (c < 0xf8) {
381ab64890Smrg    if (n < 4)
391ab64890Smrg      return RET_TOOFEW(0);
401ab64890Smrg    if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
411ab64890Smrg          && (s[3] ^ 0x80) < 0x40
421ab64890Smrg          && (c >= 0xf1 || s[1] >= 0x90)))
431ab64890Smrg      return RET_ILSEQ;
441ab64890Smrg    *pwc = ((ucs4_t) (c & 0x07) << 18)
451ab64890Smrg           | ((ucs4_t) (s[1] ^ 0x80) << 12)
461ab64890Smrg           | ((ucs4_t) (s[2] ^ 0x80) << 6)
471ab64890Smrg           | (ucs4_t) (s[3] ^ 0x80);
481ab64890Smrg    return 4;
491ab64890Smrg  } else if (c < 0xfc) {
501ab64890Smrg    if (n < 5)
511ab64890Smrg      return RET_TOOFEW(0);
521ab64890Smrg    if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
531ab64890Smrg          && (s[3] ^ 0x80) < 0x40 && (s[4] ^ 0x80) < 0x40
541ab64890Smrg          && (c >= 0xf9 || s[1] >= 0x88)))
551ab64890Smrg      return RET_ILSEQ;
561ab64890Smrg    *pwc = ((ucs4_t) (c & 0x03) << 24)
571ab64890Smrg           | ((ucs4_t) (s[1] ^ 0x80) << 18)
581ab64890Smrg           | ((ucs4_t) (s[2] ^ 0x80) << 12)
591ab64890Smrg           | ((ucs4_t) (s[3] ^ 0x80) << 6)
601ab64890Smrg           | (ucs4_t) (s[4] ^ 0x80);
611ab64890Smrg    return 5;
621ab64890Smrg  } else if (c < 0xfe) {
631ab64890Smrg    if (n < 6)
641ab64890Smrg      return RET_TOOFEW(0);
651ab64890Smrg    if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
661ab64890Smrg          && (s[3] ^ 0x80) < 0x40 && (s[4] ^ 0x80) < 0x40
671ab64890Smrg          && (s[5] ^ 0x80) < 0x40
681ab64890Smrg          && (c >= 0xfd || s[1] >= 0x84)))
691ab64890Smrg      return RET_ILSEQ;
701ab64890Smrg    *pwc = ((ucs4_t) (c & 0x01) << 30)
711ab64890Smrg           | ((ucs4_t) (s[1] ^ 0x80) << 24)
721ab64890Smrg           | ((ucs4_t) (s[2] ^ 0x80) << 18)
731ab64890Smrg           | ((ucs4_t) (s[3] ^ 0x80) << 12)
741ab64890Smrg           | ((ucs4_t) (s[4] ^ 0x80) << 6)
751ab64890Smrg           | (ucs4_t) (s[5] ^ 0x80);
761ab64890Smrg    return 6;
771ab64890Smrg  } else
781ab64890Smrg    return RET_ILSEQ;
791ab64890Smrg}
801ab64890Smrg
811ab64890Smrgstatic int
821ab64890Smrgutf8_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n) /* n == 0 is acceptable */
831ab64890Smrg{
841ab64890Smrg  int count;
851ab64890Smrg  if (wc < 0x80)
861ab64890Smrg    count = 1;
871ab64890Smrg  else if (wc < 0x800)
881ab64890Smrg    count = 2;
891ab64890Smrg  else if (wc < 0x10000)
901ab64890Smrg    count = 3;
911ab64890Smrg  else if (wc < 0x200000)
921ab64890Smrg    count = 4;
931ab64890Smrg  else if (wc < 0x4000000)
941ab64890Smrg    count = 5;
951ab64890Smrg  else if (wc <= 0x7fffffff)
961ab64890Smrg    count = 6;
971ab64890Smrg  else
981ab64890Smrg    return RET_ILSEQ;
991ab64890Smrg  if (n < count)
1001ab64890Smrg    return RET_TOOSMALL;
1011ab64890Smrg  switch (count) { /* note: code falls through cases! */
1021ab64890Smrg    case 6: r[5] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x4000000;
1031ab64890Smrg    case 5: r[4] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x200000;
1041ab64890Smrg    case 4: r[3] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x10000;
1051ab64890Smrg    case 3: r[2] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x800;
1061ab64890Smrg    case 2: r[1] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0xc0;
1071ab64890Smrg    case 1: r[0] = wc;
1081ab64890Smrg  }
1091ab64890Smrg  return count;
1101ab64890Smrg}
111