[Date Prev][Date Next] [Chronological] [Thread] [Top]

Add flag to UTF8normalize and pals to allow accent stripping



Some of us are too lazy to type accents when we do searches.

Here's a patch to UTF8normalize (and to UTF8bvnormalize and UTF8normcmp)
that makes them strip accents when the caller passes the new
LDAP_UTF8_STRIPACCENT flag.

This could be used to implement accent-free searches.

(On reflection, maybe the stripping should be done inside
uccanondecomp, but that's not how I wrote it.)

Index: include/ldap_pvt_uc.h
===================================================================
RCS file: /repo/OpenLDAP/pkg/ldap/include/ldap_pvt_uc.h,v
retrieving revision 1.16
diff -u -r1.16 ldap_pvt_uc.h
--- include/ldap_pvt_uc.h	2002/02/14 15:01:48	1.16
+++ include/ldap_pvt_uc.h	2002/02/24 12:31:22
@@ -137,6 +137,7 @@
 	ldap_unicode_t *,
 	ber_len_t );
 
+#define LDAP_UTF8_STRIPACCENT		0x2U
 #define LDAP_UTF8_CASEFOLD		0x1U
 #define LDAP_UTF8_NOCASEFOLD	0x0U
 
Index: libraries/liblunicode/ucstr.c
===================================================================
RCS file: /repo/OpenLDAP/pkg/ldap/libraries/liblunicode/ucstr.c,v
retrieving revision 1.15
diff -u -r1.15 ucstr.c
--- libraries/liblunicode/ucstr.c	2002/02/14 13:03:27	1.15
+++ libraries/liblunicode/ucstr.c	2002/02/24 12:31:23
@@ -6,6 +6,8 @@
 
 #include "portable.h"
 
+#include <stdio.h>
+
 #include <ac/ctype.h>
 #include <ac/string.h>
 #include <ac/stdlib.h>
@@ -95,12 +97,14 @@
 
 char * UTF8normalize(
 	struct berval *bv,
-	unsigned casefold )
+	unsigned flags )
 {
 	int i, j, len, clen, outpos, ucsoutlen, outsize, last;
 	char *out, *s;
 	unsigned long *ucs, *p, *ucsout;
-
+	int casefold = flags & LDAP_UTF8_CASEFOLD;
+	int strip = flags & LDAP_UTF8_STRIPACCENT;
+	
 	static unsigned char mask[] = {
                 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
 
@@ -202,6 +206,15 @@
                 }
 		/* normalize ucs of length p - ucs */
 		uccanondecomp( ucs, p - ucs, &ucsout, &ucsoutlen );    
+		if (strip) {
+			int in,ex;
+			for (in = 1, ex = 1; in < ucsoutlen; ++in) { 
+				if (ucisnonspacing (ucsout[in])) continue;
+				ucsout[ex] = ucsout[in];
+				++ex;
+			}
+			ucsoutlen = ex;
+		}
 		ucsoutlen = uccanoncomp( ucsout, ucsoutlen );
 		/* convert ucs to utf-8 and store in out */
 		for ( j = 0; j < ucsoutlen; j++ ) {
@@ -246,11 +259,13 @@
 struct berval * UTF8bvnormalize(
 	struct berval *bv,
 	struct berval *newbv,
-	unsigned casefold )
+	unsigned flags )
 {
 	int i, j, len, clen, outpos, ucsoutlen, outsize, last;
 	char *out, *s;
 	unsigned long *ucs, *p, *ucsout;
+	int casefold = flags & LDAP_UTF8_CASEFOLD;
+	int strip = flags & LDAP_UTF8_STRIPACCENT;
 	
 	static unsigned char mask[] = {
                 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
@@ -362,6 +377,15 @@
                 }
 		/* normalize ucs of length p - ucs */
 		uccanondecomp( ucs, p - ucs, &ucsout, &ucsoutlen );    
+		if (strip) {
+			int in,ex;
+			for (in = 1, ex = 1; in < ucsoutlen; ++in) { 
+				if (ucisnonspacing (ucsout[in])) continue;
+				ucsout[ex] = ucsout[in];
+				++ex;
+			}
+			ucsoutlen = ex;
+		}
 		ucsoutlen = uccanoncomp( ucsout, ucsoutlen );
 		/* convert ucs to utf-8 and store in out */
 		for ( j = 0; j < ucsoutlen; j++ ) {
@@ -408,10 +432,12 @@
 int UTF8normcmp(
 	const char *s1,
 	const char *s2,
-	unsigned casefold )
+	unsigned flags )
 {
 	int i, l1, l2, len, ulen, res;
 	unsigned long *ucs, *ucsout1, *ucsout2;
+	int casefold = flags & LDAP_UTF8_CASEFOLD;
+	int strip = flags & LDAP_UTF8_STRIPACCENT;
 
 	l1 = strlen( s1 );
 	l2 = strlen( s2 );
@@ -467,6 +493,15 @@
 		len = LDAP_UTF8_CHARLEN( s1 + i );
 	}
 	uccanondecomp( ucs, ulen, &ucsout1, &l1 );
+	if (strip) {
+		int in,ex;
+		for (in = 1, ex = 1; in < l1; ++in) { 
+			if (ucisnonspacing (ucsout1[in])) continue;
+			ucsout1[ex] = ucsout1[in];
+			++ex;
+		}
+		l1 = ex;
+	}
 	l1 = uccanoncomp( ucsout1, l1 );
 
 	/* convert and normalize 2nd string */
@@ -480,6 +515,15 @@
 		len = LDAP_UTF8_CHARLEN( s2 + i );
 	}
 	uccanondecomp( ucs, ulen, &ucsout2, &l2 );
+	if (strip) {
+		int in,ex;
+		for (in = 1, ex = 1; in < l2; ++in) { 
+			if (ucisnonspacing (ucsout2[in])) continue;
+			ucsout2[ex] = ucsout2[in];
+			++ex;
+		}
+		l2 = ex;
+	}
 	l2 = uccanoncomp( ucsout2, l2 );
 
 	free( ucs );