Home | History | Annotate | Line # | Download | only in isc
      1 /*	$NetBSD: utf8.c,v 1.5 2025/01/26 16:25:39 christos Exp $	*/
      2 
      3 /*
      4  * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
      5  *
      6  * SPDX-License-Identifier: MPL-2.0
      7  *
      8  * This Source Code Form is subject to the terms of the Mozilla Public
      9  * License, v. 2.0. If a copy of the MPL was not distributed with this
     10  * file, you can obtain one at https://mozilla.org/MPL/2.0/.
     11  *
     12  * See the COPYRIGHT file distributed with this work for additional
     13  * information regarding copyright ownership.
     14  */
     15 
     16 #include <string.h>
     17 
     18 #include <isc/utf8.h>
     19 #include <isc/util.h>
     20 
/*
 * UTF-8 is defined in "The Unicode Standard -- Version 4.0"
 * Also see RFC 3629.
 *
 * Char. number range  |        UTF-8 octet sequence
 *    (hexadecimal)    |              (binary)
 *  --------------------+---------------------------------------------
 * 0000 0000-0000 007F | 0xxxxxxx
 * 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
 * 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
 * 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 * Check whether the 'len' octets starting at 'buf' are well-formed UTF-8.
 *
 * Returns true when every octet belongs to a complete sequence from the
 * table above, and false when any of the following is found:
 *   - a lead octet that matches none of the four patterns (this includes
 *     a stray continuation octet and 0xF8..0xFF);
 *   - a sequence truncated by the end of the buffer;
 *   - a trailing octet that is not of the form 10xxxxxx;
 *   - an overlong encoding (a value that fits in a shorter sequence);
 *   - a value in the UTF-16 surrogate range U+D800..U+DFFF, which
 *     RFC 3629 prohibits in UTF-8;
 *   - a value above U+10FFFF.
 *
 * An empty buffer (len == 0) is valid.  'buf' must not be NULL; this is
 * enforced with REQUIRE.
 */
bool
isc_utf8_valid(const unsigned char *buf, size_t len) {
	REQUIRE(buf != NULL);

	size_t i = 0;
	while (i < len) {
		const unsigned char lead = buf[i];
		size_t need;	  /* number of continuation octets */
		unsigned int w;	  /* decoded code point */
		unsigned int min; /* smallest value allowed at this length */

		if (lead <= 0x7f) {
			i++;
			continue;
		}

		if ((lead & 0xe0) == 0xc0) {
			need = 1;
			w = lead & 0x1f;
			min = 0x80;
		} else if ((lead & 0xf0) == 0xe0) {
			need = 2;
			w = lead & 0x0f;
			min = 0x0800;
		} else if ((lead & 0xf8) == 0xf0) {
			need = 3;
			w = lead & 0x07;
			min = 0x10000;
		} else {
			return false;
		}

		/*
		 * i < len holds here, so 'len - i' cannot wrap; the whole
		 * sequence occupies need + 1 octets.
		 */
		if (len - i <= need) {
			return false;
		}

		for (size_t k = 1; k <= need; k++) {
			const unsigned char c = buf[i + k];
			if ((c & 0xc0) != 0x80) {
				return false;
			}
			w = (w << 6) | (c & 0x3f);
		}

		/* Overlong encoding or beyond the Unicode code space. */
		if (w < min || w > 0x10FFFF) {
			return false;
		}
		/* Surrogates are reserved for UTF-16 and are not characters. */
		if (w >= 0xD800 && w <= 0xDFFF) {
			return false;
		}

		i += need + 1;
	}
	return true;
}
     82 
/*
 * Report whether the buffer starts with the UTF-8 encoded byte order
 * mark, i.e. the three octets EF BB BF.
 *
 * Returns true only when at least three octets are available and they
 * match the mark exactly; shorter buffers yield false.  'buf' must not
 * be NULL; this is enforced with REQUIRE.
 */
bool
isc_utf8_bom(const unsigned char *buf, size_t len) {
	static const unsigned char bom[] = { 0xef, 0xbb, 0xbf };

	REQUIRE(buf != NULL);

	if (len < sizeof(bom)) {
		return false;
	}
	return memcmp(buf, bom, sizeof(bom)) == 0;
}
     92