[Date Prev][Date Next] [Chronological] [Thread] [Top]

is_utf8 function for ldapsearch w/ UTF-8 strings



Hey,

I would like to contribute the function below to the public
domain. It could be used by ldapsearch and friends as an alternative
to ldif_is_not_printable to determine whether an attribute value is a
string or whether it should be represented in base64 (currently, non-ASCII
strings are simply base64-encoded).

Mike

/*
 * is_utf8 - test whether a byte buffer is well-formed UTF-8.
 *
 * src: pointer to the first byte of the buffer (not required to be
 *      NUL-terminated; embedded NUL bytes count as ASCII).
 * n:   number of bytes in the buffer.
 *
 * Returns 1 if the n bytes at src form a sequence of well-formed UTF-8
 * encoded code points, 0 otherwise.  An empty buffer (n == 0) is
 * well-formed; a negative n is rejected.
 *
 * A sequence is rejected when it
 *   - starts with a continuation byte or an invalid lead byte,
 *   - is truncated by the end of the buffer,
 *   - has a trailing byte that is not of the form 10xxxxxx,
 *   - is an overlong (non-shortest) encoding,
 *   - encodes a UTF-16 surrogate (U+D800..U+DFFF), or
 *   - encodes a value above U+10FFFF.
 */
int
is_utf8(const unsigned char *src, int n)
{
    const unsigned char *slim;

    if (n < 0) {
        return 0;
    }
    if (n == 0) {
        /* avoid pointer arithmetic on a possibly NULL src */
        return 1;
    }

    slim = src + n;

    while (src < slim) {
        /* unsigned long so the shifts below cannot overflow a 16-bit int */
        unsigned long wc;

        if (*src < 0x80) {
            /* single byte (ASCII) */
            src++;
        } else if ((*src & 0xE0) == 0xC0) {
            /* two bytes: 110xxxxx 10xxxxxx */
            if ((slim - src) < 2) return 0;
            if ((src[1] & 0xC0) != 0x80) return 0;
            wc = ((unsigned long)(src[0] & 0x1F) << 6)
               |  (unsigned long)(src[1] & 0x3F);
            if (wc < 0x80) {
                /* overlong encoding */
                return 0;
            }
            src += 2;
        } else if ((*src & 0xF0) == 0xE0) {
            /* three bytes: 1110xxxx 10xxxxxx 10xxxxxx (less common) */
            if ((slim - src) < 3) return 0;
            if ((src[1] & 0xC0) != 0x80) return 0;
            if ((src[2] & 0xC0) != 0x80) return 0;
            wc = ((unsigned long)(src[0] & 0x0F) << 12)
               | ((unsigned long)(src[1] & 0x3F) << 6)
               |  (unsigned long)(src[2] & 0x3F);
            if (wc < 0x800) {
                /* overlong encoding */
                return 0;
            }
            if (wc >= 0xD800 && wc <= 0xDFFF) {
                /* UTF-16 surrogates are not valid in UTF-8 */
                return 0;
            }
            src += 3;
        } else if ((*src & 0xF8) == 0xF0) {
            /* four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx (rare) */
            if ((slim - src) < 4) return 0;
            if ((src[1] & 0xC0) != 0x80) return 0;
            if ((src[2] & 0xC0) != 0x80) return 0;
            if ((src[3] & 0xC0) != 0x80) return 0;
            wc = ((unsigned long)(src[0] & 0x07) << 18)
               | ((unsigned long)(src[1] & 0x3F) << 12)
               | ((unsigned long)(src[2] & 0x3F) << 6)
               |  (unsigned long)(src[3] & 0x3F);
            if (wc < 0x10000UL || wc > 0x10FFFFUL) {
                /* overlong encoding, or beyond the Unicode range */
                return 0;
            }
            src += 4;
        } else {
            /* stray continuation byte or invalid lead byte (0xF8..0xFF) */
            return 0;
        }
    }

    /* it's UTF-8 */
    return 1;
}