Home | History | Annotate | Line # | Download | only in liblunicode
      1 /*	$NetBSD: ucstr.c,v 1.4 2025/09/05 21:16:22 christos Exp $	*/
      2 
      3 /* $OpenLDAP$ */
      4 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
      5  *
      6  * Copyright 1998-2024 The OpenLDAP Foundation.
      7  * All rights reserved.
      8  *
      9  * Redistribution and use in source and binary forms, with or without
     10  * modification, are permitted only as authorized by the OpenLDAP
     11  * Public License.
     12  *
     13  * A copy of this license is available in file LICENSE in the
     14  * top-level directory of the distribution or, alternatively, at
     15  * <http://www.OpenLDAP.org/license.html>.
     16  */
     17 
     18 #include <sys/cdefs.h>
     19 __RCSID("$NetBSD: ucstr.c,v 1.4 2025/09/05 21:16:22 christos Exp $");
     20 
     21 #include "portable.h"
     22 
     23 #include <ac/bytes.h>
     24 #include <ac/ctype.h>
     25 #include <ac/string.h>
     26 #include <ac/stdlib.h>
     27 
     28 #include <lber_pvt.h>
     29 
     30 #include <ldap_utf8.h>
     31 #include <ldap_pvt_uc.h>
     32 
/*
 * Map the plain allocator names used in this file onto the liblber
 * allocators that take a memory context.  Every expansion refers to a
 * variable named `ctx', which must be visible where the macro is used.
 */
#define	malloc(size)		ber_memalloc_x((size), ctx)
#define	realloc(ptr, size)	ber_memrealloc_x((ptr), (size), ctx)
#define	free(ptr)		ber_memfree_x((ptr), ctx)
     36 
     37 int ucstrncmp(
     38 	const ldap_unicode_t *u1,
     39 	const ldap_unicode_t *u2,
     40 	ber_len_t n )
     41 {
     42 	for(; 0 < n; ++u1, ++u2, --n ) {
     43 		if( *u1 != *u2 ) {
     44 			return *u1 < *u2 ? -1 : +1;
     45 		}
     46 		if ( *u1 == 0 ) {
     47 			return 0;
     48 		}
     49 	}
     50 	return 0;
     51 }
     52 
     53 int ucstrncasecmp(
     54 	const ldap_unicode_t *u1,
     55 	const ldap_unicode_t *u2,
     56 	ber_len_t n )
     57 {
     58 	for(; 0 < n; ++u1, ++u2, --n ) {
     59 		ldap_unicode_t uu1 = uctolower( *u1 );
     60 		ldap_unicode_t uu2 = uctolower( *u2 );
     61 
     62 		if( uu1 != uu2 ) {
     63 			return uu1 < uu2 ? -1 : +1;
     64 		}
     65 		if ( uu1 == 0 ) {
     66 			return 0;
     67 		}
     68 	}
     69 	return 0;
     70 }
     71 
     72 ldap_unicode_t * ucstrnchr(
     73 	const ldap_unicode_t *u,
     74 	ber_len_t n,
     75 	ldap_unicode_t c )
     76 {
     77 	for(; 0 < n; ++u, --n ) {
     78 		if( *u == c ) {
     79 			return (ldap_unicode_t *) u;
     80 		}
     81 	}
     82 
     83 	return NULL;
     84 }
     85 
     86 ldap_unicode_t * ucstrncasechr(
     87 	const ldap_unicode_t *u,
     88 	ber_len_t n,
     89 	ldap_unicode_t c )
     90 {
     91 	c = uctolower( c );
     92 	for(; 0 < n; ++u, --n ) {
     93 		if( uctolower( *u ) == c ) {
     94 			return (ldap_unicode_t *) u;
     95 		}
     96 	}
     97 
     98 	return NULL;
     99 }
    100 
    101 void ucstr2upper(
    102 	ldap_unicode_t *u,
    103 	ber_len_t n )
    104 {
    105 	for(; 0 < n; ++u, --n ) {
    106 		*u = uctoupper( *u );
    107 	}
    108 }
    109 
    110 struct berval * UTF8bvnormalize(
    111 	struct berval *bv,
    112 	struct berval *newbv,
    113 	unsigned flags,
    114 	void *ctx )
    115 {
    116 	int i, j, len, clen, outpos, ucsoutlen, outsize, last;
    117 	int didnewbv = 0;
    118 	char *out, *outtmp, *s;
    119 	ac_uint4 *ucs, *p, *ucsout;
    120 
    121 	static unsigned char mask[] = {
    122 		0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
    123 
    124 	unsigned casefold = flags & LDAP_UTF8_CASEFOLD;
    125 	unsigned approx = flags & LDAP_UTF8_APPROX;
    126 
    127 	if ( bv == NULL ) {
    128 		return NULL;
    129 	}
    130 
    131 	s = bv->bv_val;
    132 	len = bv->bv_len;
    133 
    134 	if ( len == 0 ) {
    135 		return ber_dupbv_x( newbv, bv, ctx );
    136 	}
    137 
    138 	if ( !newbv ) {
    139 		newbv = ber_memalloc_x( sizeof(struct berval), ctx );
    140 		if ( !newbv ) return NULL;
    141 		didnewbv = 1;
    142 	}
    143 
    144 	/* Should first check to see if string is already in proper
    145 	 * normalized form. This is almost as time consuming as
    146 	 * the normalization though.
    147 	 */
    148 
    149 	/* finish off everything up to character before first non-ascii */
    150 	if ( LDAP_UTF8_ISASCII( s ) ) {
    151 		if ( casefold ) {
    152 			outsize = len + 7;
    153 			out = (char *) ber_memalloc_x( outsize, ctx );
    154 			if ( out == NULL ) {
    155 fail:
    156 				if ( didnewbv )
    157 					ber_memfree_x( newbv, ctx );
    158 				return NULL;
    159 			}
    160 			outpos = 0;
    161 
    162 			for ( i = 1; (i < len) && LDAP_UTF8_ISASCII(s + i); i++ ) {
    163 				out[outpos++] = TOLOWER( s[i-1] );
    164 			}
    165 			if ( i == len ) {
    166 				out[outpos++] = TOLOWER( s[len-1] );
    167 				out[outpos] = '\0';
    168 				newbv->bv_val = out;
    169 				newbv->bv_len = outpos;
    170 				return newbv;
    171 			}
    172 		} else {
    173 			for ( i = 1; (i < len) && LDAP_UTF8_ISASCII(s + i); i++ ) {
    174 				/* empty */
    175 			}
    176 
    177 			if ( i == len ) {
    178 				return ber_str2bv_x( s, len, 1, newbv, ctx );
    179 			}
    180 
    181 			outsize = len + 7;
    182 			out = (char *) ber_memalloc_x( outsize, ctx );
    183 			if ( out == NULL ) {
    184 				goto fail;
    185 			}
    186 			outpos = i - 1;
    187 			memcpy(out, s, outpos);
    188 		}
    189 	} else {
    190 		outsize = len + 7;
    191 		out = (char *) ber_memalloc_x( outsize, ctx );
    192 		if ( out == NULL ) {
    193 			goto fail;
    194 		}
    195 		outpos = 0;
    196 		i = 0;
    197 	}
    198 
    199 	p = ucs = ber_memalloc_x( len * sizeof(*ucs), ctx );
    200 	if ( ucs == NULL ) {
    201 		ber_memfree_x(out, ctx);
    202 		goto fail;
    203 	}
    204 
    205 	/* convert character before first non-ascii to ucs-4 */
    206 	if ( i > 0 ) {
    207 		*p = casefold ? TOLOWER( s[i-1] ) : s[i-1];
    208 		p++;
    209 	}
    210 
    211 	/* s[i] is now first non-ascii character */
    212 	for (;;) {
    213 		/* s[i] is non-ascii */
    214 		/* convert everything up to next ascii to ucs-4 */
    215 		while ( i < len ) {
    216 			clen = LDAP_UTF8_CHARLEN2( s + i, clen );
    217 			if ( clen == 0 ) {
    218 				ber_memfree_x( ucs, ctx );
    219 				ber_memfree_x( out, ctx );
    220 				goto fail;
    221 			}
    222 			if ( clen == 1 ) {
    223 				/* ascii */
    224 				break;
    225 			}
    226 			*p = s[i] & mask[clen];
    227 			i++;
    228 			for( j = 1; j < clen; j++ ) {
    229 				if ( (s[i] & 0xc0) != 0x80 ) {
    230 					ber_memfree_x( ucs, ctx );
    231 					ber_memfree_x( out, ctx );
    232 					goto fail;
    233 				}
    234 				*p <<= 6;
    235 				*p |= s[i] & 0x3f;
    236 				i++;
    237 			}
    238 			if ( casefold ) {
    239 				*p = uctolower( *p );
    240 			}
    241 			p++;
    242 		}
    243 		/* normalize ucs of length p - ucs */
    244 		uccompatdecomp( ucs, p - ucs, &ucsout, &ucsoutlen, ctx );
    245 		if ( approx ) {
    246 			for ( j = 0; j < ucsoutlen; j++ ) {
    247 				if ( ucsout[j] < 0x80 ) {
    248 					if ( outpos >= outsize ) {
    249 						outsize += ( ucsoutlen - j ) + 1;
    250 						outtmp = (char *) ber_memrealloc_x( out, outsize, ctx );
    251 						if ( outtmp == NULL ) {
    252 							ber_memfree_x( ucsout, ctx );
    253 							ber_memfree_x( ucs, ctx );
    254 							ber_memfree_x( out, ctx );
    255 							goto fail;
    256 						}
    257 						out = outtmp;
    258 					}
    259 					out[outpos++] = ucsout[j];
    260 				}
    261 			}
    262 		} else {
    263 			ucsoutlen = uccanoncomp( ucsout, ucsoutlen );
    264 			/* convert ucs to utf-8 and store in out */
    265 			for ( j = 0; j < ucsoutlen; j++ ) {
    266 				/* allocate more space if not enough room for
    267 				   6 bytes and terminator */
    268 				if ( outsize - outpos < 7 ) {
    269 					outsize = ucsoutlen - j + outpos + 6;
    270 					outtmp = (char *) ber_memrealloc_x( out, outsize, ctx );
    271 					if ( outtmp == NULL ) {
    272 						ber_memfree_x( ucsout, ctx );
    273 						ber_memfree_x( ucs, ctx );
    274 						ber_memfree_x( out, ctx );
    275 						goto fail;
    276 					}
    277 					out = outtmp;
    278 				}
    279 				outpos += ldap_x_ucs4_to_utf8( ucsout[j], &out[outpos] );
    280 			}
    281 		}
    282 
    283 		ber_memfree_x( ucsout, ctx );
    284 		ucsout = NULL;
    285 
    286 		if ( i == len ) {
    287 			break;
    288 		}
    289 
    290 		last = i;
    291 
    292 		/* Allocate more space in out if necessary */
    293 		if (len - i >= outsize - outpos) {
    294 			outsize += 1 + ((len - i) - (outsize - outpos));
    295 			outtmp = (char *) ber_memrealloc_x(out, outsize, ctx);
    296 			if (outtmp == NULL) {
    297 				ber_memfree_x( ucs, ctx );
    298 				ber_memfree_x( out, ctx );
    299 				goto fail;
    300 			}
    301 			out = outtmp;
    302 		}
    303 
    304 		/* s[i] is ascii */
    305 		/* finish off everything up to char before next non-ascii */
    306 		for ( i++; (i < len) && LDAP_UTF8_ISASCII(s + i); i++ ) {
    307 			out[outpos++] = casefold ? TOLOWER( s[i-1] ) : s[i-1];
    308 		}
    309 		if ( i == len ) {
    310 			out[outpos++] = casefold ? TOLOWER( s[len-1] ) : s[len-1];
    311 			break;
    312 		}
    313 
    314 		/* convert character before next non-ascii to ucs-4 */
    315 		*ucs = casefold ? TOLOWER( s[i-1] ) : s[i-1];
    316 		p = ucs + 1;
    317 	}
    318 
    319 	ber_memfree_x( ucs, ctx );
    320 	out[outpos] = '\0';
    321 	newbv->bv_val = out;
    322 	newbv->bv_len = outpos;
    323 	return newbv;
    324 }
    325 
    326 /* compare UTF8-strings, optionally ignore casing */
    327 /* slow, should be optimized */
    328 int UTF8bvnormcmp(
    329 	struct berval *bv1,
    330 	struct berval *bv2,
    331 	unsigned flags,
    332 	void *ctx )
    333 {
    334 	int i, l1, l2, len, ulen, res = 0;
    335 	char *s1, *s2, *done;
    336 	ac_uint4 *ucs, *ucsout1, *ucsout2;
    337 
    338 	unsigned casefold = flags & LDAP_UTF8_CASEFOLD;
    339 	unsigned norm1 = flags & LDAP_UTF8_ARG1NFC;
    340 	unsigned norm2 = flags & LDAP_UTF8_ARG2NFC;
    341 
    342 	if (bv1 == NULL) {
    343 		return bv2 == NULL ? 0 : -1;
    344 
    345 	} else if (bv2 == NULL) {
    346 		return 1;
    347 	}
    348 
    349 	l1 = bv1->bv_len;
    350 	l2 = bv2->bv_len;
    351 
    352 	len = (l1 < l2) ? l1 : l2;
    353 	if (len == 0) {
    354 		return l1 == 0 ? (l2 == 0 ? 0 : -1) : 1;
    355 	}
    356 
    357 	s1 = bv1->bv_val;
    358 	s2 = bv2->bv_val;
    359 	done = s1 + len;
    360 
    361 	while ( (s1 < done) && LDAP_UTF8_ISASCII(s1) && LDAP_UTF8_ISASCII(s2) ) {
    362 		if (casefold) {
    363 			char c1 = TOLOWER(*s1);
    364 			char c2 = TOLOWER(*s2);
    365 			res = c1 - c2;
    366 		} else {
    367 			res = *s1 - *s2;
    368 		}
    369 		s1++;
    370 		s2++;
    371 		if (res) {
    372 			/* done unless next character in s1 or s2 is non-ascii */
    373 			if (s1 < done) {
    374 				if (!LDAP_UTF8_ISASCII(s1) || !LDAP_UTF8_ISASCII(s2)) {
    375 					break;
    376 				}
    377 			} else if (((len < l1) && !LDAP_UTF8_ISASCII(s1)) ||
    378 				((len < l2) && !LDAP_UTF8_ISASCII(s2)))
    379 			{
    380 				break;
    381 			}
    382 			return res;
    383 		}
    384 	}
    385 
    386 	/* We have encountered non-ascii or strings equal up to len */
    387 
    388 	/* set i to number of iterations */
    389 	i = s1 - done + len;
    390 	/* passed through loop at least once? */
    391 	if (i > 0) {
    392 		if (!res && (s1 == done) &&
    393 		    ((len == l1) || LDAP_UTF8_ISASCII(s1)) &&
    394 		    ((len == l2) || LDAP_UTF8_ISASCII(s2))) {
    395 			/* all ascii and equal up to len */
    396 			return l1 - l2;
    397 		}
    398 
    399 		/* rewind one char, and do normalized compare from there */
    400 		s1--;
    401 		s2--;
    402 		l1 -= i - 1;
    403 		l2 -= i - 1;
    404 	}
    405 
    406 	/* Should first check to see if strings are already in
    407 	 * proper normalized form.
    408 	 */
    409 	ucs = malloc( ( ( norm1 || l1 > l2 ) ? l1 : l2 ) * sizeof(*ucs) );
    410 	if ( ucs == NULL ) {
    411 		return l1 > l2 ? 1 : -1; /* what to do??? */
    412 	}
    413 
    414 	/*
    415 	 * XXYYZ: we convert to ucs4 even though -llunicode
    416 	 * expects ucs2 in an ac_uint4
    417 	 */
    418 
    419 	/* convert and normalize 1st string */
    420 	for ( i = 0, ulen = 0; i < l1; i += len, ulen++ ) {
    421 		ucs[ulen] = ldap_x_utf8_to_ucs4( s1 + i );
    422 		if ( ucs[ulen] == LDAP_UCS4_INVALID ) {
    423 			free( ucs );
    424 			return -1; /* what to do??? */
    425 		}
    426 		len = LDAP_UTF8_CHARLEN( s1 + i );
    427 	}
    428 
    429 	if ( norm1 ) {
    430 		ucsout1 = ucs;
    431 		l1 = ulen;
    432 		ucs = malloc( l2 * sizeof(*ucs) );
    433 		if ( ucs == NULL ) {
    434 			free( ucsout1 );
    435 			return l1 > l2 ? 1 : -1; /* what to do??? */
    436 		}
    437 	} else {
    438 		uccompatdecomp( ucs, ulen, &ucsout1, &l1, ctx );
    439 		l1 = uccanoncomp( ucsout1, l1 );
    440 	}
    441 
    442 	/* convert and normalize 2nd string */
    443 	for ( i = 0, ulen = 0; i < l2; i += len, ulen++ ) {
    444 		ucs[ulen] = ldap_x_utf8_to_ucs4( s2 + i );
    445 		if ( ucs[ulen] == LDAP_UCS4_INVALID ) {
    446 			free( ucsout1 );
    447 			free( ucs );
    448 			return 1; /* what to do??? */
    449 		}
    450 		len = LDAP_UTF8_CHARLEN( s2 + i );
    451 	}
    452 
    453 	if ( norm2 ) {
    454 		ucsout2 = ucs;
    455 		l2 = ulen;
    456 	} else {
    457 		uccompatdecomp( ucs, ulen, &ucsout2, &l2, ctx );
    458 		l2 = uccanoncomp( ucsout2, l2 );
    459 		free( ucs );
    460 	}
    461 
    462 	res = casefold
    463 		? ucstrncasecmp( ucsout1, ucsout2, l1 < l2 ? l1 : l2 )
    464 		: ucstrncmp( ucsout1, ucsout2, l1 < l2 ? l1 : l2 );
    465 	free( ucsout1 );
    466 	free( ucsout2 );
    467 
    468 	if ( res != 0 ) {
    469 		return res;
    470 	}
    471 	if ( l1 == l2 ) {
    472 		return 0;
    473 	}
    474 	return l1 > l2 ? 1 : -1;
    475 }
    476