1 /* Copyright libuv contributors. All rights reserved. 2 * 3 * Permission to use, copy, modify, and/or distribute this software for any 4 * purpose with or without fee is hereby granted, provided that the above 5 * copyright notice and this permission notice appear in all copies. 6 * 7 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 8 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 9 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 10 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 11 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 12 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 13 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 14 */ 15 16 /* Derived from https://github.com/bnoordhuis/punycode 17 * but updated to support IDNA 2008. 18 */ 19 20 #include "uv.h" 21 #include "uv-common.h" 22 #include "idna.h" 23 #include <assert.h> 24 #include <string.h> 25 #include <limits.h> /* UINT_MAX */ 26 27 28 static int32_t uv__wtf8_decode1(const char** input) { 29 uint32_t code_point; 30 uint8_t b1; 31 uint8_t b2; 32 uint8_t b3; 33 uint8_t b4; 34 35 b1 = **input; 36 if (b1 <= 0x7F) 37 return b1; /* ASCII code point */ 38 if (b1 < 0xC2) 39 return -1; /* invalid: continuation byte */ 40 code_point = b1; 41 42 b2 = *++*input; 43 if ((b2 & 0xC0) != 0x80) 44 return -1; /* invalid: not a continuation byte */ 45 code_point = (code_point << 6) | (b2 & 0x3F); 46 if (b1 <= 0xDF) 47 return 0x7FF & code_point; /* two-byte character */ 48 49 b3 = *++*input; 50 if ((b3 & 0xC0) != 0x80) 51 return -1; /* invalid: not a continuation byte */ 52 code_point = (code_point << 6) | (b3 & 0x3F); 53 if (b1 <= 0xEF) 54 return 0xFFFF & code_point; /* three-byte character */ 55 56 b4 = *++*input; 57 if ((b4 & 0xC0) != 0x80) 58 return -1; /* invalid: not a continuation byte */ 59 code_point = (code_point << 6) | (b4 & 0x3F); 60 if (b1 <= 0xF4) { 61 code_point &= 0x1FFFFF; 62 if (code_point <= 0x10FFFF) 63 return code_point; /* four-byte character */ 64 } 65 66 /* code point too large */ 67 return -1; 68 } 69 70 71 static unsigned uv__utf8_decode1_slow(const char** p, 72 const char* pe, 73 unsigned a) { 74 unsigned b; 75 unsigned c; 76 unsigned d; 77 unsigned min; 78 79 if (a > 0xF7) 80 return -1; 81 82 switch (pe - *p) { 83 default: 84 if (a > 0xEF) { 85 min = 0x10000; 86 a = a & 7; 87 b = (unsigned char) *(*p)++; 88 c = (unsigned char) *(*p)++; 89 d = (unsigned char) *(*p)++; 90 break; 91 } 92 /* Fall through. */ 93 case 2: 94 if (a > 0xDF) { 95 min = 0x800; 96 b = 0x80 | (a & 15); 97 c = (unsigned char) *(*p)++; 98 d = (unsigned char) *(*p)++; 99 a = 0; 100 break; 101 } 102 /* Fall through. */ 103 case 1: 104 if (a > 0xBF) { 105 min = 0x80; 106 b = 0x80; 107 c = 0x80 | (a & 31); 108 d = (unsigned char) *(*p)++; 109 a = 0; 110 break; 111 } 112 /* Fall through. */ 113 case 0: 114 return -1; /* Invalid continuation byte. */ 115 } 116 117 if (0x80 != (0xC0 & (b ^ c ^ d))) 118 return -1; /* Invalid sequence. */ 119 120 b &= 63; 121 c &= 63; 122 d &= 63; 123 a = (a << 18) | (b << 12) | (c << 6) | d; 124 125 if (a < min) 126 return -1; /* Overlong sequence. */ 127 128 if (a > 0x10FFFF) 129 return -1; /* Four-byte sequence > U+10FFFF. */ 130 131 if (a >= 0xD800 && a <= 0xDFFF) 132 return -1; /* Surrogate pair. */ 133 134 return a; 135 } 136 137 138 unsigned uv__utf8_decode1(const char** p, const char* pe) { 139 unsigned a; 140 141 assert(*p < pe); 142 143 a = (unsigned char) *(*p)++; 144 145 if (a < 128) 146 return a; /* ASCII, common case. */ 147 148 return uv__utf8_decode1_slow(p, pe, a); 149 } 150 151 152 static int uv__idna_toascii_label(const char* s, const char* se, 153 char** d, char* de) { 154 static const char alphabet[] = "abcdefghijklmnopqrstuvwxyz0123456789"; 155 const char* ss; 156 unsigned c; 157 unsigned h; 158 unsigned k; 159 unsigned n; 160 unsigned m; 161 unsigned q; 162 unsigned t; 163 unsigned x; 164 unsigned y; 165 unsigned bias; 166 unsigned delta; 167 unsigned todo; 168 int first; 169 170 h = 0; 171 ss = s; 172 todo = 0; 173 174 /* Note: after this loop we've visited all UTF-8 characters and know 175 * they're legal so we no longer need to check for decode errors. 176 */ 177 while (s < se) { 178 c = uv__utf8_decode1(&s, se); 179 180 if (c == UINT_MAX) 181 return UV_EINVAL; 182 183 if (c < 128) 184 h++; 185 else 186 todo++; 187 } 188 189 /* Only write "xn--" when there are non-ASCII characters. */ 190 if (todo > 0) { 191 if (*d < de) *(*d)++ = 'x'; 192 if (*d < de) *(*d)++ = 'n'; 193 if (*d < de) *(*d)++ = '-'; 194 if (*d < de) *(*d)++ = '-'; 195 } 196 197 /* Write ASCII characters. */ 198 x = 0; 199 s = ss; 200 while (s < se) { 201 c = uv__utf8_decode1(&s, se); 202 assert(c != UINT_MAX); 203 204 if (c > 127) 205 continue; 206 207 if (*d < de) 208 *(*d)++ = c; 209 210 if (++x == h) 211 break; /* Visited all ASCII characters. */ 212 } 213 214 if (todo == 0) 215 return h; 216 217 /* Only write separator when we've written ASCII characters first. */ 218 if (h > 0) 219 if (*d < de) 220 *(*d)++ = '-'; 221 222 n = 128; 223 bias = 72; 224 delta = 0; 225 first = 1; 226 227 while (todo > 0) { 228 m = -1; 229 s = ss; 230 231 while (s < se) { 232 c = uv__utf8_decode1(&s, se); 233 assert(c != UINT_MAX); 234 235 if (c >= n) 236 if (c < m) 237 m = c; 238 } 239 240 x = m - n; 241 y = h + 1; 242 243 if (x > ~delta / y) 244 return UV_E2BIG; /* Overflow. */ 245 246 delta += x * y; 247 n = m; 248 249 s = ss; 250 while (s < se) { 251 c = uv__utf8_decode1(&s, se); 252 assert(c != UINT_MAX); 253 254 if (c < n) 255 if (++delta == 0) 256 return UV_E2BIG; /* Overflow. */ 257 258 if (c != n) 259 continue; 260 261 for (k = 36, q = delta; /* empty */; k += 36) { 262 t = 1; 263 264 if (k > bias) 265 t = k - bias; 266 267 if (t > 26) 268 t = 26; 269 270 if (q < t) 271 break; 272 273 /* TODO(bnoordhuis) Since 1 <= t <= 26 and therefore 274 * 10 <= y <= 35, we can optimize the long division 275 * into a table-based reciprocal multiplication. 276 */ 277 x = q - t; 278 y = 36 - t; /* 10 <= y <= 35 since 1 <= t <= 26. */ 279 q = x / y; 280 t = t + x % y; /* 1 <= t <= 35 because of y. */ 281 282 if (*d < de) 283 *(*d)++ = alphabet[t]; 284 } 285 286 if (*d < de) 287 *(*d)++ = alphabet[q]; 288 289 delta /= 2; 290 291 if (first) { 292 delta /= 350; 293 first = 0; 294 } 295 296 /* No overflow check is needed because |delta| was just 297 * divided by 2 and |delta+delta >= delta + delta/h|. 298 */ 299 h++; 300 delta += delta / h; 301 302 for (bias = 0; delta > 35 * 26 / 2; bias += 36) 303 delta /= 35; 304 305 bias += 36 * delta / (delta + 38); 306 delta = 0; 307 todo--; 308 } 309 310 delta++; 311 n++; 312 } 313 314 return 0; 315 } 316 317 318 ssize_t uv__idna_toascii(const char* s, const char* se, char* d, char* de) { 319 const char* si; 320 const char* st; 321 unsigned c; 322 char* ds; 323 int rc; 324 325 if (s == se) 326 return UV_EINVAL; 327 328 ds = d; 329 330 si = s; 331 while (si < se) { 332 st = si; 333 c = uv__utf8_decode1(&si, se); 334 335 if (c == UINT_MAX) 336 return UV_EINVAL; 337 338 if (c != '.') 339 if (c != 0x3002) /* */ 340 if (c != 0xFF0E) /* */ 341 if (c != 0xFF61) /* */ 342 continue; 343 344 rc = uv__idna_toascii_label(s, st, &d, de); 345 346 if (rc < 0) 347 return rc; 348 349 if (d < de) 350 *d++ = '.'; 351 352 s = si; 353 } 354 355 if (s < se) { 356 rc = uv__idna_toascii_label(s, se, &d, de); 357 358 if (rc < 0) 359 return rc; 360 } 361 362 if (d >= de) 363 return UV_EINVAL; 364 365 *d++ = '\0'; 366 return d - ds; /* Number of bytes written. */ 367 } 368 369 370 ssize_t uv_wtf8_length_as_utf16(const char* source_ptr) { 371 size_t w_target_len = 0; 372 int32_t code_point; 373 374 do { 375 code_point = uv__wtf8_decode1(&source_ptr); 376 if (code_point < 0) 377 return -1; 378 if (code_point > 0xFFFF) 379 w_target_len++; 380 w_target_len++; 381 } while (*source_ptr++); 382 383 return w_target_len; 384 } 385 386 387 void uv_wtf8_to_utf16(const char* source_ptr, 388 uint16_t* w_target, 389 size_t w_target_len) { 390 int32_t code_point; 391 392 do { 393 code_point = uv__wtf8_decode1(&source_ptr); 394 /* uv_wtf8_length_as_utf16 should have been called and checked first. */ 395 assert(code_point >= 0); 396 if (code_point > 0xFFFF) { 397 assert(code_point < 0x10FFFF); 398 *w_target++ = (((code_point - 0x10000) >> 10) + 0xD800); 399 *w_target++ = ((code_point - 0x10000) & 0x3FF) + 0xDC00; 400 w_target_len -= 2; 401 } else { 402 *w_target++ = code_point; 403 w_target_len -= 1; 404 } 405 } while (*source_ptr++); 406 407 (void)w_target_len; 408 assert(w_target_len == 0); 409 } 410 411 412 static int32_t uv__get_surrogate_value(const uint16_t* w_source_ptr, 413 ssize_t w_source_len) { 414 uint16_t u; 415 uint16_t next; 416 417 u = w_source_ptr[0]; 418 if (u >= 0xD800 && u <= 0xDBFF && w_source_len != 1) { 419 next = w_source_ptr[1]; 420 if (next >= 0xDC00 && next <= 0xDFFF) 421 return 0x10000 + ((u - 0xD800) << 10) + (next - 0xDC00); 422 } 423 return u; 424 } 425 426 427 size_t uv_utf16_length_as_wtf8(const uint16_t* w_source_ptr, 428 ssize_t w_source_len) { 429 size_t target_len; 430 int32_t code_point; 431 432 target_len = 0; 433 while (w_source_len) { 434 code_point = uv__get_surrogate_value(w_source_ptr, w_source_len); 435 /* Can be invalid UTF-8 but must be valid WTF-8. */ 436 assert(code_point >= 0); 437 if (w_source_len < 0 && code_point == 0) 438 break; 439 if (code_point < 0x80) 440 target_len += 1; 441 else if (code_point < 0x800) 442 target_len += 2; 443 else if (code_point < 0x10000) 444 target_len += 3; 445 else { 446 target_len += 4; 447 w_source_ptr++; 448 if (w_source_len > 0) 449 w_source_len--; 450 } 451 w_source_ptr++; 452 if (w_source_len > 0) 453 w_source_len--; 454 } 455 456 return target_len; 457 } 458 459 460 int uv_utf16_to_wtf8(const uint16_t* w_source_ptr, 461 ssize_t w_source_len, 462 char** target_ptr, 463 size_t* target_len_ptr) { 464 size_t target_len; 465 char* target; 466 char* target_end; 467 int32_t code_point; 468 469 /* If *target_ptr is provided, then *target_len_ptr must be its length 470 * (excluding space for NUL), otherwise we will compute the target_len_ptr 471 * length and may return a new allocation in *target_ptr if target_ptr is 472 * provided. */ 473 if (target_ptr == NULL || *target_ptr == NULL) { 474 target_len = uv_utf16_length_as_wtf8(w_source_ptr, w_source_len); 475 if (target_len_ptr != NULL) 476 *target_len_ptr = target_len; 477 } else { 478 target_len = *target_len_ptr; 479 } 480 481 if (target_ptr == NULL) 482 return 0; 483 484 if (*target_ptr == NULL) { 485 target = uv__malloc(target_len + 1); 486 if (target == NULL) { 487 return UV_ENOMEM; 488 } 489 *target_ptr = target; 490 } else { 491 target = *target_ptr; 492 } 493 494 target_end = target + target_len; 495 496 while (target != target_end && w_source_len) { 497 code_point = uv__get_surrogate_value(w_source_ptr, w_source_len); 498 /* Can be invalid UTF-8 but must be valid WTF-8. */ 499 assert(code_point >= 0); 500 if (w_source_len < 0 && code_point == 0) { 501 w_source_len = 0; 502 break; 503 } 504 if (code_point < 0x80) { 505 *target++ = code_point; 506 } else if (code_point < 0x800) { 507 *target++ = 0xC0 | (code_point >> 6); 508 if (target == target_end) 509 break; 510 *target++ = 0x80 | (code_point & 0x3F); 511 } else if (code_point < 0x10000) { 512 *target++ = 0xE0 | (code_point >> 12); 513 if (target == target_end) 514 break; 515 *target++ = 0x80 | ((code_point >> 6) & 0x3F); 516 if (target == target_end) 517 break; 518 *target++ = 0x80 | (code_point & 0x3F); 519 } else { 520 *target++ = 0xF0 | (code_point >> 18); 521 if (target == target_end) 522 break; 523 *target++ = 0x80 | ((code_point >> 12) & 0x3F); 524 if (target == target_end) 525 break; 526 *target++ = 0x80 | ((code_point >> 6) & 0x3F); 527 if (target == target_end) 528 break; 529 *target++ = 0x80 | (code_point & 0x3F); 530 /* uv__get_surrogate_value consumed 2 input characters */ 531 w_source_ptr++; 532 if (w_source_len > 0) 533 w_source_len--; 534 } 535 target_len = target - *target_ptr; 536 w_source_ptr++; 537 if (w_source_len > 0) 538 w_source_len--; 539 } 540 541 if (target != target_end && target_len_ptr != NULL) 542 /* Did not fill all of the provided buffer, so update the target_len_ptr 543 * output with the space used. */ 544 *target_len_ptr = target - *target_ptr; 545 546 /* Check if input fit into target exactly. */ 547 if (w_source_len < 0 && target == target_end && w_source_ptr[0] == 0) 548 w_source_len = 0; 549 550 *target++ = '\0'; 551 552 /* Characters remained after filling the buffer, compute the remaining length now. */ 553 if (w_source_len) { 554 if (target_len_ptr != NULL) 555 *target_len_ptr = target_len + uv_utf16_length_as_wtf8(w_source_ptr, w_source_len); 556 return UV_ENOBUFS; 557 } 558 559 return 0; 560 } 561