Home | History | Annotate | Line # | Download | only in libldap
      1 /*	$NetBSD: utf-8.c,v 1.4 2025/09/05 21:16:22 christos Exp $	*/
      2 
      3 /* utf-8.c -- Basic UTF-8 routines */
      4 /* $OpenLDAP$ */
      5 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
      6  *
      7  * Copyright 1998-2024 The OpenLDAP Foundation.
      8  * All rights reserved.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted only as authorized by the OpenLDAP
     12  * Public License.
     13  *
     14  * A copy of this license is available in the file LICENSE in the
     15  * top-level directory of the distribution or, alternatively, at
     16  * <http://www.OpenLDAP.org/license.html>.
     17  */
     18 /* Basic UTF-8 routines
     19  *
     20  * These routines are "dumb".  Though they understand UTF-8,
     21  * they don't grok Unicode.  That is, they can push bits,
     22  * but don't have a clue what the bits represent.  That's
     23  * good enough for use with the LDAP Client SDK.
     24  *
     25  * These routines are not optimized.
     26  */
     27 
     28 #include <sys/cdefs.h>
     29 __RCSID("$NetBSD: utf-8.c,v 1.4 2025/09/05 21:16:22 christos Exp $");
     30 
     31 #include "portable.h"
     32 
     33 #include <stdio.h>
     34 
     35 #include <ac/stdlib.h>
     36 
     37 #include <ac/socket.h>
     38 #include <ac/string.h>
     39 #include <ac/time.h>
     40 
     41 #include "ldap_utf8.h"
     42 
     43 #include "ldap-int.h"
     44 #include "ldap_defaults.h"
     45 
     46 /*
     47  * return the number of bytes required to hold the
     48  * NULL-terminated UTF-8 string NOT INCLUDING the
     49  * termination.
     50  */
     51 ber_len_t ldap_utf8_bytes( const char * p )
     52 {
     53 	ber_len_t bytes;
     54 
     55 	for( bytes=0; p[bytes]; bytes++ ) {
     56 		/* EMPTY */ ;
     57 	}
     58 
     59 	return bytes;
     60 }
     61 
     62 ber_len_t ldap_utf8_chars( const char * p )
     63 {
     64 	/* could be optimized and could check for invalid sequences */
     65 	ber_len_t chars=0;
     66 
     67 	for( ; *p ; LDAP_UTF8_INCR(p) ) {
     68 		chars++;
     69 	}
     70 
     71 	return chars;
     72 }
     73 
/*
 * Return the distance, in octets, from p to the position
 * that LDAP_UTF8_NEXT reports for p.
 */
int ldap_utf8_offset( const char * p )
{
	const char *following = LDAP_UTF8_NEXT(p);

	return following - p;
}
     79 
/*
 * Sequence length implied by a lead octet that has its high bit set.
 * The table is indexed by (octet ^ 0x80), so octets 0x80..0xFF map to
 * entries 0..127.  A zero entry means the octet cannot start a sequence.
 */
const char ldap_utf8_lentab[] = {
	/* 0x80-0x8F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x90-0x9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xA0-0xAF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xB0-0xBF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xC0-0xCF */ 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xD0-0xDF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xE0-0xEF */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
	/* 0xF0-0xFF */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 };

/*
 * Return the length of the character starting at p as indicated by
 * its first octet alone: 1 for any octet below 0x80, otherwise the
 * ldap_utf8_lentab[] entry (0 if the octet cannot begin a sequence).
 */
int ldap_utf8_charlen( const char * p )
{
	const unsigned char lead = *(const unsigned char *)p;

	if ( lead < 0x80 )
		return 1;

	/* clearing the high bit is the same as the table's (octet ^ 0x80) index */
	return ldap_utf8_lentab[lead & 0x7f];
}
    100 
    101 /*
    102  * Make sure the UTF-8 char used the shortest possible encoding
    103  * returns charlen if valid, 0 if not.
    104  *
    105  * Here are the valid UTF-8 encodings, taken from RFC 2279 page 4.
    106  * The table is slightly modified from that of the RFC.
    107  *
    108  * UCS-4 range (hex)      UTF-8 sequence (binary)
    109  * 0000 0000-0000 007F   0.......
    110  * 0000 0080-0000 07FF   110++++. 10......
    111  * 0000 0800-0000 FFFF   1110++++ 10+..... 10......
    112  * 0001 0000-001F FFFF   11110+++ 10++.... 10...... 10......
    113  * 0020 0000-03FF FFFF   111110++ 10+++... 10...... 10...... 10......
    114  * 0400 0000-7FFF FFFF   1111110+ 10++++.. 10...... 10...... 10...... 10......
    115  *
    116  * The '.' bits are "don't cares". When validating a UTF-8 sequence,
    117  * at least one of the '+' bits must be set, otherwise the character
    118  * should have been encoded in fewer octets. Note that in the two-octet
    119  * case, only the first octet needs to be validated, and this is done
    120  * in the ldap_utf8_lentab[] above.
    121  */
    122 
/*
 * Mask of bits in the second octet of which at least one must be set
 * for the encoding to be the shortest possible one.  Indexed by the
 * low five bits of the lead octet (see ldap_utf8_charlen2()).
 */
#undef c	/* keep the identifier "c" free of any macro definition */
const char ldap_utf8_mintab[] = {
	/* 0x00-0x03 */ (const char)0x20, (const char)0x80, (const char)0x80, (const char)0x80,
	/* 0x04-0x07 */ (const char)0x80, (const char)0x80, (const char)0x80, (const char)0x80,
	/* 0x08-0x0b */ (const char)0x80, (const char)0x80, (const char)0x80, (const char)0x80,
	/* 0x0c-0x0f */ (const char)0x80, (const char)0x80, (const char)0x80, (const char)0x80,
	/* 0x10-0x13 */ (const char)0x30, (const char)0x80, (const char)0x80, (const char)0x80,
	/* 0x14-0x17 */ (const char)0x80, (const char)0x80, (const char)0x80, (const char)0x80,
	/* 0x18-0x1b */ (const char)0x38, (const char)0x80, (const char)0x80, (const char)0x80,
	/* 0x1c-0x1f */ (const char)0x3c, (const char)0x80, (const char)0x00, (const char)0x00 };
    132 
    133 int ldap_utf8_charlen2( const char * p )
    134 {
    135 	int i = LDAP_UTF8_CHARLEN( p );
    136 
    137 	if ( i > 2 ) {
    138 		if ( !( ldap_utf8_mintab[*p & 0x1f] & p[1] ) )
    139 			i = 0;
    140 	}
    141 	return i;
    142 }
    143 
    144 /* conv UTF-8 to UCS-4, useful for comparisons */
    145 ldap_ucs4_t ldap_x_utf8_to_ucs4( const char * p )
    146 {
    147     const unsigned char *c = (const unsigned char *) p;
    148     ldap_ucs4_t ch;
    149 	int len, i;
    150 	static unsigned char mask[] = {
    151 		0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
    152 
    153 	len = LDAP_UTF8_CHARLEN2(p, len);
    154 
    155 	if( len == 0 ) return LDAP_UCS4_INVALID;
    156 
    157 	ch = c[0] & mask[len];
    158 
    159 	for(i=1; i < len; i++) {
    160 		if ((c[i] & 0xc0) != 0x80) {
    161 			return LDAP_UCS4_INVALID;
    162 		}
    163 
    164 		ch <<= 6;
    165 		ch |= c[i] & 0x3f;
    166 	}
    167 
    168 	return ch;
    169 }
    170 
    171 /* conv UCS-4 to UTF-8, not used */
    172 int ldap_x_ucs4_to_utf8( ldap_ucs4_t c, char *buf )
    173 {
    174 	int len=0;
    175 	unsigned char* p = (unsigned char *) buf;
    176 
    177 	/* not a valid Unicode character */
    178 	if ( c < 0 ) return 0;
    179 
    180 	/* Just return length, don't convert */
    181 	if(buf == NULL) {
    182 		if( c < 0x80 ) return 1;
    183 		else if( c < 0x800 ) return 2;
    184 		else if( c < 0x10000 ) return 3;
    185 		else if( c < 0x200000 ) return 4;
    186 		else if( c < 0x4000000 ) return 5;
    187 		else return 6;
    188 	}
    189 
    190 	if( c < 0x80 ) {
    191 		p[len++] = c;
    192 
    193 	} else if( c < 0x800 ) {
    194 		p[len++] = 0xc0 | ( c >> 6 );
    195 		p[len++] = 0x80 | ( c & 0x3f );
    196 
    197 	} else if( c < 0x10000 ) {
    198 		p[len++] = 0xe0 | ( c >> 12 );
    199 		p[len++] = 0x80 | ( (c >> 6) & 0x3f );
    200 		p[len++] = 0x80 | ( c & 0x3f );
    201 
    202 	} else if( c < 0x200000 ) {
    203 		p[len++] = 0xf0 | ( c >> 18 );
    204 		p[len++] = 0x80 | ( (c >> 12) & 0x3f );
    205 		p[len++] = 0x80 | ( (c >> 6) & 0x3f );
    206 		p[len++] = 0x80 | ( c & 0x3f );
    207 
    208 	} else if( c < 0x4000000 ) {
    209 		p[len++] = 0xf8 | ( c >> 24 );
    210 		p[len++] = 0x80 | ( (c >> 18) & 0x3f );
    211 		p[len++] = 0x80 | ( (c >> 12) & 0x3f );
    212 		p[len++] = 0x80 | ( (c >> 6) & 0x3f );
    213 		p[len++] = 0x80 | ( c & 0x3f );
    214 
    215 	} else /* if( c < 0x80000000 ) */ {
    216 		p[len++] = 0xfc | ( c >> 30 );
    217 		p[len++] = 0x80 | ( (c >> 24) & 0x3f );
    218 		p[len++] = 0x80 | ( (c >> 18) & 0x3f );
    219 		p[len++] = 0x80 | ( (c >> 12) & 0x3f );
    220 		p[len++] = 0x80 | ( (c >> 6) & 0x3f );
    221 		p[len++] = 0x80 | ( c & 0x3f );
    222 	}
    223 
    224 	return len;
    225 }
    226 
/*
 * Number of octets needed to encode the UCS-4 value c in UTF-8
 * (1..6), or 0 if c is negative.  The argument and the whole
 * expansion are parenthesized so the macro can be used inside larger
 * expressions and with non-trivial arguments.  Note that c is
 * evaluated more than once, so it must not have side effects.
 */
#define LDAP_UCS_UTF8LEN(c)	\
	( (c) < 0 ? 0 : ( (c) < 0x80 ? 1 : ( (c) < 0x800 ? 2 : \
	( (c) < 0x10000 ? 3 : ( (c) < 0x200000 ? 4 : \
	( (c) < 0x4000000 ? 5 : 6 ) ) ) ) ) )
    230 
    231 /* Convert a string to UTF-8 format. The input string is expected to
    232  * have characters of 1, 2, or 4 octets (in network byte order)
    233  * corresponding to the ASN.1 T61STRING, BMPSTRING, and UNIVERSALSTRING
    234  * types respectively. (Here T61STRING just means that there is one
    235  * octet per character and characters may use the high bit of the octet.
    236  * The characters are assumed to use ISO mappings, no provision is made
    237  * for converting from T.61 coding rules to Unicode.)
    238  */
    239 
    240 int
    241 ldap_ucs_to_utf8s( struct berval *ucs, int csize, struct berval *utf8s )
    242 {
    243 	unsigned char *in, *end;
    244 	char *ptr;
    245 	ldap_ucs4_t u;
    246 	int i, l = 0;
    247 
    248 	utf8s->bv_val = NULL;
    249 	utf8s->bv_len = 0;
    250 
    251 	in = (unsigned char *)ucs->bv_val;
    252 
    253 	/* Make sure we stop at an even multiple of csize */
    254 	end = in + ( ucs->bv_len & ~(csize-1) );
    255 
    256 	for (; in < end; ) {
    257 		u = *in++;
    258 		if (csize > 1) {
    259 			u <<= 8;
    260 			u |= *in++;
    261 		}
    262 		if (csize > 2) {
    263 			u <<= 8;
    264 			u |= *in++;
    265 			u <<= 8;
    266 			u |= *in++;
    267 		}
    268 		i = LDAP_UCS_UTF8LEN(u);
    269 		if (i == 0)
    270 			return LDAP_INVALID_SYNTAX;
    271 		l += i;
    272 	}
    273 
    274 	utf8s->bv_val = LDAP_MALLOC( l+1 );
    275 	if (utf8s->bv_val == NULL)
    276 		return LDAP_NO_MEMORY;
    277 	utf8s->bv_len = l;
    278 
    279 	ptr = utf8s->bv_val;
    280 	for (in = (unsigned char *)ucs->bv_val; in < end; ) {
    281 		u = *in++;
    282 		if (csize > 1) {
    283 			u <<= 8;
    284 			u |= *in++;
    285 		}
    286 		if (csize > 2) {
    287 			u <<= 8;
    288 			u |= *in++;
    289 			u <<= 8;
    290 			u |= *in++;
    291 		}
    292 		ptr += ldap_x_ucs4_to_utf8(u, ptr);
    293 	}
    294 	*ptr = '\0';
    295 	return LDAP_SUCCESS;
    296 }
    297 
/*
 * Advance to the next UTF-8 character
 *
 * The length implied by the lead octet is ignored; instead the
 * continuation markers (10xxxxxx) are used to find the start of the
 * next character.  This allows resyncing after an invalid character,
 * provided the start of the next character appears within the
 * 6 bytes examined.
 */
char* ldap_utf8_next( const char * p )
{
	const unsigned char *cur = (const unsigned char *) p;
	int n = 1;

	if( !LDAP_UTF8_ISASCII(cur) ) {
		/* skip continuation octets, but never look at more than 5 of them */
		while( n < 6 && ( cur[n] & 0xc0 ) == 0x80 ) {
			n++;
		}
	}

	return (char *) &p[n];
}
    324 
/*
 * Step back to the previous UTF-8 character
 *
 * Scans backwards from p over continuation markers (10xxxxxx) and
 * returns the first octet found that is not one.  At most five
 * octets before p are examined; if all of them are continuation
 * octets, p - 6 is returned without being examined.
 */
char* ldap_utf8_prev( const char * p )
{
	const unsigned char *u = (const unsigned char *) p;
	int back = 1;

	while( back < 6 && ( u[-back] & 0xc0 ) == 0x80 ) {
		back++;
	}

	return (char *) ( p - back );
}
    347 
/*
 * Copy one UTF-8 character from src to dst returning
 * number of bytes copied.
 *
 * The length implied by the lead octet is ignored; the first octet
 * is always copied, followed by up to five continuation octets
 * (10xxxxxx).  This allows resyncing after an invalid character,
 * provided the start of the next character appears within the
 * 6 bytes examined.  No terminator is written to dst.
 */
int ldap_utf8_copy( char* dst, const char *src )
{
	const unsigned char *in = (const unsigned char *) src;
	int n = 1;

	dst[0] = src[0];

	if( !LDAP_UTF8_ISASCII(in) ) {
		for( ; n < 6 && ( in[n] & 0xc0 ) == 0x80; n++ ) {
			dst[n] = src[n];
		}
	}

	return n;
}
    378 
    379 #ifndef UTF8_ALPHA_CTYPE
    380 /*
    381  * UTF-8 ctype routines
    382  * Only deals with characters < 0x80 (ie: US-ASCII)
    383  */
    384 
/* Apply LDAP_ASCII to the first octet at p and return its result. */
int ldap_utf8_isascii( const char * p )
{
	const unsigned ch = * (const unsigned char *) p;

	return LDAP_ASCII(ch);
}
    390 
/* Return LDAP_DIGIT for the first octet at p, or 0 if it fails LDAP_ASCII. */
int ldap_utf8_isdigit( const char * p )
{
	const unsigned ch = * (const unsigned char *) p;

	return LDAP_ASCII(ch) ? LDAP_DIGIT( ch ) : 0;
}
    399 
/* Return LDAP_HEX for the first octet at p, or 0 if it fails LDAP_ASCII. */
int ldap_utf8_isxdigit( const char * p )
{
	const unsigned ch = * (const unsigned char *) p;

	return LDAP_ASCII(ch) ? LDAP_HEX(ch) : 0;
}
    408 
/*
 * Return 1 if the first octet at p passes LDAP_ASCII and is one of
 * space, tab, newline, carriage return, vertical tab or form feed;
 * 0 otherwise.
 */
int ldap_utf8_isspace( const char * p )
{
	const unsigned ch = * (const unsigned char *) p;

	if( LDAP_ASCII(ch) ) {
		if( ch == ' ' || ch == '\t' || ch == '\n' ||
			ch == '\r' || ch == '\v' || ch == '\f' )
		{
			return 1;
		}
	}

	return 0;
}
    427 
/*
 * The routines from here on are not needed by the C SDK and
 * are not "good enough" for general use.
 */

/* Return LDAP_ALPHA for the first octet at p, or 0 if it fails LDAP_ASCII. */
int ldap_utf8_isalpha( const char * p )
{
	const unsigned ch = * (const unsigned char *) p;

	return LDAP_ASCII(ch) ? LDAP_ALPHA(ch) : 0;
}
    440 
/* Return LDAP_ALNUM for the first octet at p, or 0 if it fails LDAP_ASCII. */
int ldap_utf8_isalnum( const char * p )
{
	const unsigned ch = * (const unsigned char *) p;

	return LDAP_ASCII(ch) ? LDAP_ALNUM(ch) : 0;
}
    449 
/* Return LDAP_LOWER for the first octet at p, or 0 if it fails LDAP_ASCII. */
int ldap_utf8_islower( const char * p )
{
	const unsigned ch = * (const unsigned char *) p;

	return LDAP_ASCII(ch) ? LDAP_LOWER(ch) : 0;
}
    458 
/* Return LDAP_UPPER for the first octet at p, or 0 if it fails LDAP_ASCII. */
int ldap_utf8_isupper( const char * p )
{
	const unsigned ch = * (const unsigned char *) p;

	return LDAP_ASCII(ch) ? LDAP_UPPER(ch) : 0;
}
    467 #endif
    468 
    469 
    470 /*
    471  * UTF-8 string routines
    472  */
    473 
    474 /* like strchr() */
    475 char * (ldap_utf8_strchr)( const char *str, const char *chr )
    476 {
    477 	for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
    478 		if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( chr ) ) {
    479 			return (char *) str;
    480 		}
    481 	}
    482 
    483 	return NULL;
    484 }
    485 
    486 /* like strcspn() but returns number of bytes, not characters */
    487 ber_len_t (ldap_utf8_strcspn)( const char *str, const char *set )
    488 {
    489 	const char *cstr;
    490 	const char *cset;
    491 
    492 	for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
    493 		for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
    494 			if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) {
    495 				return cstr - str;
    496 			}
    497 		}
    498 	}
    499 
    500 	return cstr - str;
    501 }
    502 
    503 /* like strspn() but returns number of bytes, not characters */
    504 ber_len_t (ldap_utf8_strspn)( const char *str, const char *set )
    505 {
    506 	const char *cstr;
    507 	const char *cset;
    508 
    509 	for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
    510 		for( cset = set; ; LDAP_UTF8_INCR(cset) ) {
    511 			if( *cset == '\0' ) {
    512 				return cstr - str;
    513 			}
    514 
    515 			if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) {
    516 				break;
    517 			}
    518 		}
    519 	}
    520 
    521 	return cstr - str;
    522 }
    523 
    524 /* like strpbrk(), replaces strchr() as well */
    525 char *(ldap_utf8_strpbrk)( const char *str, const char *set )
    526 {
    527 	for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
    528 		const char *cset;
    529 
    530 		for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
    531 			if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( cset ) ) {
    532 				return (char *) str;
    533 			}
    534 		}
    535 	}
    536 
    537 	return NULL;
    538 }
    539 
    540 /* like strtok_r(), not strtok() */
    541 char *(ldap_utf8_strtok)(char *str, const char *sep, char **last)
    542 {
    543 	char *begin;
    544 	char *end;
    545 
    546 	if( last == NULL ) return NULL;
    547 
    548 	begin = str ? str : *last;
    549 
    550 	begin += ldap_utf8_strspn( begin, sep );
    551 
    552 	if( *begin == '\0' ) {
    553 		*last = NULL;
    554 		return NULL;
    555 	}
    556 
    557 	end = &begin[ ldap_utf8_strcspn( begin, sep ) ];
    558 
    559 	if( *end != '\0' ) {
    560 		char *next = LDAP_UTF8_NEXT( end );
    561 		*end = '\0';
    562 		end = next;
    563 	}
    564 
    565 	*last = end;
    566 	return begin;
    567 }
    568