11ab64890Smrg 21ab64890Smrg/* 31ab64890Smrg * UTF-8 41ab64890Smrg */ 51ab64890Smrg 61ab64890Smrg/* Specification: RFC 2279 */ 71ab64890Smrg 81ab64890Smrgstatic int 91ab64890Smrgutf8_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n) 101ab64890Smrg{ 111ab64890Smrg unsigned char c = s[0]; 121ab64890Smrg 131ab64890Smrg if (c < 0x80) { 141ab64890Smrg *pwc = c; 151ab64890Smrg return 1; 161ab64890Smrg } else if (c < 0xc2) { 171ab64890Smrg return RET_ILSEQ; 181ab64890Smrg } else if (c < 0xe0) { 191ab64890Smrg if (n < 2) 201ab64890Smrg return RET_TOOFEW(0); 211ab64890Smrg if (!((s[1] ^ 0x80) < 0x40)) 221ab64890Smrg return RET_ILSEQ; 231ab64890Smrg *pwc = ((ucs4_t) (c & 0x1f) << 6) 241ab64890Smrg | (ucs4_t) (s[1] ^ 0x80); 251ab64890Smrg return 2; 261ab64890Smrg } else if (c < 0xf0) { 271ab64890Smrg if (n < 3) 281ab64890Smrg return RET_TOOFEW(0); 291ab64890Smrg if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 301ab64890Smrg && (c >= 0xe1 || s[1] >= 0xa0))) 311ab64890Smrg return RET_ILSEQ; 321ab64890Smrg *pwc = ((ucs4_t) (c & 0x0f) << 12) 331ab64890Smrg | ((ucs4_t) (s[1] ^ 0x80) << 6) 341ab64890Smrg | (ucs4_t) (s[2] ^ 0x80); 351ab64890Smrg return 3; 361ab64890Smrg } else if (c < 0xf8) { 371ab64890Smrg if (n < 4) 381ab64890Smrg return RET_TOOFEW(0); 391ab64890Smrg if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 401ab64890Smrg && (s[3] ^ 0x80) < 0x40 411ab64890Smrg && (c >= 0xf1 || s[1] >= 0x90))) 421ab64890Smrg return RET_ILSEQ; 431ab64890Smrg *pwc = ((ucs4_t) (c & 0x07) << 18) 441ab64890Smrg | ((ucs4_t) (s[1] ^ 0x80) << 12) 451ab64890Smrg | ((ucs4_t) (s[2] ^ 0x80) << 6) 461ab64890Smrg | (ucs4_t) (s[3] ^ 0x80); 471ab64890Smrg return 4; 481ab64890Smrg } else if (c < 0xfc) { 491ab64890Smrg if (n < 5) 501ab64890Smrg return RET_TOOFEW(0); 511ab64890Smrg if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 521ab64890Smrg && (s[3] ^ 0x80) < 0x40 && (s[4] ^ 0x80) < 0x40 531ab64890Smrg && (c >= 0xf9 || s[1] >= 0x88))) 541ab64890Smrg return RET_ILSEQ; 551ab64890Smrg *pwc = ((ucs4_t) (c & 0x03) << 24) 561ab64890Smrg | ((ucs4_t) (s[1] ^ 0x80) << 18) 571ab64890Smrg | ((ucs4_t) (s[2] ^ 0x80) << 12) 581ab64890Smrg | ((ucs4_t) (s[3] ^ 0x80) << 6) 591ab64890Smrg | (ucs4_t) (s[4] ^ 0x80); 601ab64890Smrg return 5; 611ab64890Smrg } else if (c < 0xfe) { 621ab64890Smrg if (n < 6) 631ab64890Smrg return RET_TOOFEW(0); 641ab64890Smrg if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 651ab64890Smrg && (s[3] ^ 0x80) < 0x40 && (s[4] ^ 0x80) < 0x40 661ab64890Smrg && (s[5] ^ 0x80) < 0x40 671ab64890Smrg && (c >= 0xfd || s[1] >= 0x84))) 681ab64890Smrg return RET_ILSEQ; 691ab64890Smrg *pwc = ((ucs4_t) (c & 0x01) << 30) 701ab64890Smrg | ((ucs4_t) (s[1] ^ 0x80) << 24) 711ab64890Smrg | ((ucs4_t) (s[2] ^ 0x80) << 18) 721ab64890Smrg | ((ucs4_t) (s[3] ^ 0x80) << 12) 731ab64890Smrg | ((ucs4_t) (s[4] ^ 0x80) << 6) 741ab64890Smrg | (ucs4_t) (s[5] ^ 0x80); 751ab64890Smrg return 6; 761ab64890Smrg } else 771ab64890Smrg return RET_ILSEQ; 781ab64890Smrg} 791ab64890Smrg 801ab64890Smrgstatic int 811ab64890Smrgutf8_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n) /* n == 0 is acceptable */ 821ab64890Smrg{ 831ab64890Smrg int count; 841ab64890Smrg if (wc < 0x80) 851ab64890Smrg count = 1; 861ab64890Smrg else if (wc < 0x800) 871ab64890Smrg count = 2; 881ab64890Smrg else if (wc < 0x10000) 891ab64890Smrg count = 3; 901ab64890Smrg else if (wc < 0x200000) 911ab64890Smrg count = 4; 921ab64890Smrg else if (wc < 0x4000000) 931ab64890Smrg count = 5; 941ab64890Smrg else if (wc <= 0x7fffffff) 951ab64890Smrg count = 6; 961ab64890Smrg else 971ab64890Smrg return RET_ILSEQ; 981ab64890Smrg if (n < count) 991ab64890Smrg return RET_TOOSMALL; 1001ab64890Smrg switch (count) { /* note: code falls through cases! */ 1019c019ec5Smaya case 6: r[5] = (unsigned char) (0x80 | (wc & 0x3f)); wc = wc >> 6; wc |= 0x4000000; 1029c019ec5Smaya case 5: r[4] = (unsigned char) (0x80 | (wc & 0x3f)); wc = wc >> 6; wc |= 0x200000; 1039c019ec5Smaya case 4: r[3] = (unsigned char) (0x80 | (wc & 0x3f)); wc = wc >> 6; wc |= 0x10000; 1049c019ec5Smaya case 3: r[2] = (unsigned char) (0x80 | (wc & 0x3f)); wc = wc >> 6; wc |= 0x800; 1059c019ec5Smaya case 2: r[1] = (unsigned char) (0x80 | (wc & 0x3f)); wc = wc >> 6; wc |= 0xc0; 1069c019ec5Smaya case 1: r[0] = (unsigned char) wc; 1071ab64890Smrg } 1081ab64890Smrg return count; 1091ab64890Smrg} 110