[Date Prev][Date Next] [Chronological] [Thread] [Top]

(ITS#4655) is_utf8 function for ldapsearch w/ UTF-8 strings



Full_Name: Michael B Allen
Version: 
OS: 
URL: ftp://ftp.openldap.org/incoming/
Submission from: (NULL) (69.142.196.170)


I would like to contribute the below function in the public domain. This could
be used by ldapsearch and friends as an alternative to ldif_is_not_printable to
determine if an attribute value is a string or if it should be represented in
base64 (currently non-ASCII strings are simply base64 encoded).

Sorry for not submitting a patch. I'm set up for stock packages at the moment.

Mike

/*
 * is_utf8 - test whether a byte buffer is well-formed UTF-8.
 *
 * src: pointer to the first byte of the buffer (need not be NUL-terminated;
 *      embedded NUL bytes are treated as ordinary ASCII).
 * n:   number of bytes in the buffer.
 *
 * Returns 1 if the n bytes at src form a sequence of well-formed UTF-8
 * encoded code points, 0 otherwise.  An empty buffer (n <= 0) yields 1.
 *
 * A sequence is rejected when:
 *   - a lead byte is not a valid 1-, 2-, 3- or 4-byte lead,
 *   - the buffer ends in the middle of a multi-byte sequence,
 *   - a continuation byte is not of the form 10xxxxxx,
 *   - the encoding is overlong (not the shortest form for the code point),
 *   - the code point is a UTF-16 surrogate (U+D800..U+DFFF),
 *   - the code point is above U+10FFFF.
 */
int
is_utf8(const unsigned char *src, int n)
{
    const unsigned char *slim;

    /* Avoid forming src + n for a negative count or a NULL/empty buffer. */
    if (n <= 0) {
        return 1;
    }
    slim = src + n;

    while (src < slim) {
        unsigned long wc;   /* code point being decoded */
        unsigned long min;  /* smallest code point allowed for this length */
        int trail;          /* number of continuation bytes expected */

        if (*src < 0x80) {
            /* plain ASCII */
            src++;
            continue;
        } else if ((*src & 0xE0) == 0xC0) {
            trail = 1;
            min = 0x80UL;
            wc = (unsigned long)(*src & 0x1F);
        } else if ((*src & 0xF0) == 0xE0) {
            /* less common */
            trail = 2;
            min = 0x800UL;
            wc = (unsigned long)(*src & 0x0F);
        } else if ((*src & 0xF8) == 0xF0) {
            /* supplementary planes (U+10000 and above) */
            trail = 3;
            min = 0x10000UL;
            wc = (unsigned long)(*src & 0x07);
        } else {
            /* stray continuation byte or invalid lead byte */
            return 0;
        }

        /* The whole sequence must fit in what remains of the buffer. */
        if ((slim - src) < trail + 1) {
            return 0;
        }
        src++;

        while (trail-- > 0) {
            if ((*src & 0xC0) != 0x80) {
                return 0;
            }
            wc = (wc << 6) | (unsigned long)(*src & 0x3F);
            src++;
        }

        /* overlong encoding */
        if (wc < min) {
            return 0;
        }
        /* UTF-16 surrogates are not valid scalar values */
        if (wc >= 0xD800UL && wc <= 0xDFFFUL) {
            return 0;
        }
        /* beyond the Unicode code space */
        if (wc > 0x10FFFFUL) {
            return 0;
        }
    }

    /* it's UTF-8 */
    return 1;
}