1 1.1 christos /* $NetBSD: utf8.c,v 1.1 2024/02/18 20:57:51 christos Exp $ */ 2 1.1 christos 3 1.1 christos /* 4 1.1 christos * Copyright (C) Internet Systems Consortium, Inc. ("ISC") 5 1.1 christos * 6 1.1 christos * SPDX-License-Identifier: MPL-2.0 7 1.1 christos * 8 1.1 christos * This Source Code Form is subject to the terms of the Mozilla Public 9 1.1 christos * License, v. 2.0. If a copy of the MPL was not distributed with this 10 1.1 christos * file, you can obtain one at https://mozilla.org/MPL/2.0/. 11 1.1 christos * 12 1.1 christos * See the COPYRIGHT file distributed with this work for additional 13 1.1 christos * information regarding copyright ownership. 14 1.1 christos */ 15 1.1 christos 16 1.1 christos #include <string.h> 17 1.1 christos 18 1.1 christos #include <isc/utf8.h> 19 1.1 christos #include <isc/util.h> 20 1.1 christos 21 1.1 christos /* 22 1.1 christos * UTF-8 is defined in "The Unicode Standard -- Version 4.0" 23 1.1 christos * Also see RFC 3629. 24 1.1 christos * 25 1.1 christos * Char. number range | UTF-8 octet sequence 26 1.1 christos * (hexadecimal) | (binary) 27 1.1 christos * --------------------+--------------------------------------------- 28 1.1 christos * 0000 0000-0000 007F | 0xxxxxxx 29 1.1 christos * 0000 0080-0000 07FF | 110xxxxx 10xxxxxx 30 1.1 christos * 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx 31 1.1 christos * 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 32 1.1 christos */ 33 1.1 christos bool 34 1.1 christos isc_utf8_valid(const unsigned char *buf, size_t len) { 35 1.1 christos REQUIRE(buf != NULL); 36 1.1 christos 37 1.1 christos for (size_t i = 0; i < len; i++) { 38 1.1 christos if (buf[i] <= 0x7f) { 39 1.1 christos continue; 40 1.1 christos } 41 1.1 christos if ((i + 1) < len && (buf[i] & 0xe0) == 0xc0 && 42 1.1 christos (buf[i + 1] & 0xc0) == 0x80) 43 1.1 christos { 44 1.1 christos unsigned int w; 45 1.1 christos w = (buf[i] & 0x1f) << 6; 46 1.1 christos w |= (buf[++i] & 0x3f); 47 1.1 christos if (w < 0x80) { 48 1.1 christos return (false); 49 1.1 christos } 50 1.1 christos continue; 51 1.1 christos } 52 1.1 christos if ((i + 2) < len && (buf[i] & 0xf0) == 0xe0 && 53 1.1 christos (buf[i + 1] & 0xc0) == 0x80 && (buf[i + 2] & 0xc0) == 0x80) 54 1.1 christos { 55 1.1 christos unsigned int w; 56 1.1 christos w = (buf[i] & 0x0f) << 12; 57 1.1 christos w |= (buf[++i] & 0x3f) << 6; 58 1.1 christos w |= (buf[++i] & 0x3f); 59 1.1 christos if (w < 0x0800) { 60 1.1 christos return (false); 61 1.1 christos } 62 1.1 christos continue; 63 1.1 christos } 64 1.1 christos if ((i + 3) < len && (buf[i] & 0xf8) == 0xf0 && 65 1.1 christos (buf[i + 1] & 0xc0) == 0x80 && 66 1.1 christos (buf[i + 2] & 0xc0) == 0x80 && (buf[i + 3] & 0xc0) == 0x80) 67 1.1 christos { 68 1.1 christos unsigned int w; 69 1.1 christos w = (buf[i] & 0x07) << 18; 70 1.1 christos w |= (buf[++i] & 0x3f) << 12; 71 1.1 christos w |= (buf[++i] & 0x3f) << 6; 72 1.1 christos w |= (buf[++i] & 0x3f); 73 1.1 christos if (w < 0x10000 || w > 0x10FFFF) { 74 1.1 christos return (false); 75 1.1 christos } 76 1.1 christos continue; 77 1.1 christos } 78 1.1 christos return (false); 79 1.1 christos } 80 1.1 christos return (true); 81 1.1 christos } 82 1.1 christos 83 1.1 christos bool 84 1.1 christos isc_utf8_bom(const unsigned char *buf, size_t len) { 85 1.1 christos REQUIRE(buf != NULL); 86 1.1 christos 87 1.1 christos if (len >= 3U && !memcmp(buf, "\xef\xbb\xbf", 3)) { 88 1.1 christos return (true); 89 1.1 christos } 90 1.1 christos return (false); 91 1.1 christos } 92