Home | History | Annotate | Line # | Download | only in isc
      1  1.1  christos /*	$NetBSD: utf8.c,v 1.1 2024/02/18 20:57:51 christos Exp $	*/
      2  1.1  christos 
      3  1.1  christos /*
      4  1.1  christos  * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
      5  1.1  christos  *
      6  1.1  christos  * SPDX-License-Identifier: MPL-2.0
      7  1.1  christos  *
      8  1.1  christos  * This Source Code Form is subject to the terms of the Mozilla Public
      9  1.1  christos  * License, v. 2.0. If a copy of the MPL was not distributed with this
     10  1.1  christos  * file, you can obtain one at https://mozilla.org/MPL/2.0/.
     11  1.1  christos  *
     12  1.1  christos  * See the COPYRIGHT file distributed with this work for additional
     13  1.1  christos  * information regarding copyright ownership.
     14  1.1  christos  */
     15  1.1  christos 
     16  1.1  christos #include <string.h>
     17  1.1  christos 
     18  1.1  christos #include <isc/utf8.h>
     19  1.1  christos #include <isc/util.h>
     20  1.1  christos 
/*
 * UTF-8 is defined in "The Unicode Standard -- Version 4.0"
 * Also see RFC 3629.
 *
 * Char. number range  |        UTF-8 octet sequence
 *    (hexadecimal)    |              (binary)
 *  --------------------+---------------------------------------------
 * 0000 0000-0000 007F | 0xxxxxxx
 * 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
 * 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
 * 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 * isc_utf8_valid() checks whether the 'len' octets starting at 'buf'
 * consist solely of well-formed UTF-8 sequences.
 *
 * Requires:
 *	'buf' is not NULL (checked with REQUIRE).
 *
 * Returns:
 *	true	every octet belongs to a well-formed sequence; this is
 *		also the result when 'len' is 0.
 *	false	the buffer contains a stray continuation octet, an invalid
 *		lead octet, a sequence truncated by the end of the buffer,
 *		an overlong (non-shortest form) encoding, a UTF-16
 *		surrogate (U+D800..U+DFFF), or a code point above U+10FFFF.
 */
bool
isc_utf8_valid(const unsigned char *buf, size_t len) {
	REQUIRE(buf != NULL);

	for (size_t i = 0; i < len; i++) {
		/* Single octet: plain ASCII. */
		if (buf[i] <= 0x7f) {
			continue;
		}

		/* Two-octet sequence: 110xxxxx 10xxxxxx. */
		if ((i + 1) < len && (buf[i] & 0xe0) == 0xc0 &&
		    (buf[i + 1] & 0xc0) == 0x80)
		{
			unsigned int w;
			w = (buf[i] & 0x1f) << 6;
			w |= (buf[++i] & 0x3f);
			/* Overlong: the value fits in a single octet. */
			if (w < 0x80) {
				return (false);
			}
			continue;
		}

		/* Three-octet sequence: 1110xxxx 10xxxxxx 10xxxxxx. */
		if ((i + 2) < len && (buf[i] & 0xf0) == 0xe0 &&
		    (buf[i + 1] & 0xc0) == 0x80 && (buf[i + 2] & 0xc0) == 0x80)
		{
			unsigned int w;
			w = (buf[i] & 0x0f) << 12;
			w |= (buf[++i] & 0x3f) << 6;
			w |= (buf[++i] & 0x3f);
			/* Overlong: the value fits in two octets. */
			if (w < 0x0800) {
				return (false);
			}
			/*
			 * RFC 3629 prohibits encoding the UTF-16 surrogate
			 * range U+D800..U+DFFF in UTF-8.
			 */
			if (w >= 0xd800 && w <= 0xdfff) {
				return (false);
			}
			continue;
		}

		/* Four-octet sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx. */
		if ((i + 3) < len && (buf[i] & 0xf8) == 0xf0 &&
		    (buf[i + 1] & 0xc0) == 0x80 &&
		    (buf[i + 2] & 0xc0) == 0x80 && (buf[i + 3] & 0xc0) == 0x80)
		{
			unsigned int w;
			w = (buf[i] & 0x07) << 18;
			w |= (buf[++i] & 0x3f) << 12;
			w |= (buf[++i] & 0x3f) << 6;
			w |= (buf[++i] & 0x3f);
			/* Overlong, or beyond the last Unicode code point. */
			if (w < 0x10000 || w > 0x10FFFF) {
				return (false);
			}
			continue;
		}

		/*
		 * Stray continuation octet, invalid lead octet, or a
		 * sequence cut short by the end of the buffer.
		 */
		return (false);
	}
	return (true);
}
     82  1.1  christos 
/*
 * Report whether the buffer starts with the three-octet UTF-8 byte
 * order mark (EF BB BF).
 *
 * Requires:
 *	'buf' is not NULL (checked with REQUIRE).
 *
 * Returns:
 *	true	'len' is at least 3 and the first three octets of 'buf'
 *		are EF BB BF.
 *	false	otherwise.
 */
bool
isc_utf8_bom(const unsigned char *buf, size_t len) {
	REQUIRE(buf != NULL);

	/* Too short to hold the mark; do not read past the buffer. */
	if (len < 3U) {
		return (false);
	}

	return (memcmp(buf, "\xef\xbb\xbf", 3) == 0);
}
     92