[Date Prev][Date Next] [Chronological] [Thread] [Top]

Re: Case ignore search on non-ascii letters



I haven't really found a FAQ or a How-To. I had to search hard before
finding the magic command to use, which is iconv. If you're on Unix, use
the same. For me, it's:

$ echo "Émile Belphégore" | iconv -f ISO8859-1 -t UTF-8

The different encodings that are supported can be found by looking in the
directory /usr/lib/iconv.

I suppose that for Czech you will want ISO8859-something; for Japanese you
surely know better than I do.

What comes out of the iconv has non-ascii characters in it and must be
base-64 encoded if you want to put it into an LDIF file. I found no program that base-64 encodes, though I'm sure they exist; so I wrote my own; source is attached.


With that compiled, it comes out as:

$ echo "Émile Belphégore" | \
iconv -f ISO8859-1 -t UTF-8 | \
base64 -f

Which gives me:

w4ltaWxlIEJlbHBow6lnb3JlCg==

And that goes into the ldif file:

cn:: w4ltaWxlIEJlbHBow6lnb3JlCg==

The double colon (::) says it's base64.

I use the LDAP browser by Jarek Gawor
(<http://www.iit.edu/~gawojar/ldap/>). It seems to take care of the UTF-8
encoding-decoding itself.

--Le jeudi 10 mai 2001 4:18 -0400 yan@cardinalengineering.com écrivait:

Is there a FAQ or a HOW-To for UTF-8 support?  I'd like to play with
Czech and Japanese a little bit, and don't know where to start.

Thanks,

--Yan

David Olivier wrote:

After coming to terms with UTF-8 and BASE64 I have managed to set up one of my entries with:

      cn: Émile Belphégore

correctly UTF-8 encoded.

If I search for that entry with '(cn=*Belph*)'  I find it.
And  if I  search  for it with  '(cn=*BELPH*)'  I still find it.
Now if I try  to get it through '(cn=*Belphé*)' I get it.
But if I am brash enough to ask '(cn=*BELPHÉ*)' I get nothing.

In other words, case is correctly ignored for plain ascii, i.e. for 26
letters; but for non-ascii letters, such as French accented letters,
which can be capitalized, it doesn't work.

Can this be seen as a bug or at least a caveat, or something like that?

—It IS a problem for us!

---
David Olivier
Klebs gardien Alpages CRI courrier brebis Lyon 2 Lumière



--- David Olivier Klebs gardien Alpages CRI courrier brebis Lyon 2 Lumière
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

/*
 * Print the command-line synopsis and the option summary on stderr,
 * then terminate the process with exit status 2. Never returns.
 *
 * argv0: program name inserted into the synopsis line.
 */
void Usage(char *argv0) {
 fprintf(stderr, "Usage: %s [-d] [-f] <infile >outfile\n", argv0);
 fputs("-d: décode\n", stderr);
 fputs("-f: (encodage seulement) fold lines at 40 chars\n", stderr);
 exit(2);
}

/*---------- doDecode ------------ */

/*
 * Translate one input byte into its base64 value.
 *
 * Returns 0..63 for a character of the base64 alphabet, the marker 0x40
 * for the padding sign '=', and 0xFF for every other byte.
 * The range tests rely on letters and digits being contiguous in the
 * execution character set.
 */
unsigned char decodeC(unsigned char c) {
 switch (c) {
  case '+': return 62;
  case '/': return 63;
  case '=': return 0x40;
  default:  break;
 }
 if ('A' <= c && c <= 'Z') return (unsigned char)(c - 'A');
 if ('a' <= c && c <= 'z') return (unsigned char)(c - 'a' + 26);
 if ('0' <= c && c <= '9') return (unsigned char)(c - '0' + 52);
 return 0xFF;
}

/*
 * Read the next group of four base64 symbols from standard input (fd 0).
 *
 * Bytes for which decodeC() returns 0xFF are skipped. The decoded values
 * (0..63, or 0x40 for '=') are stored in c[0..3].
 *
 * Returns 1 when c[] has been filled. End of input before the first symbol
 * of a group is reported as four 0x40 markers, still with return value 1.
 * Returns 0 when the input ends in the middle of a group.
 * A read() error prints a diagnostic and terminates with exit status 1.
 */
int read4(unsigned char *c) {
 int n = 0;
 while (n < 4) {
  unsigned char raw;
  unsigned char value;
  int got = read(0, &raw, 1);
  if (got < 0) {
   /* Without this check a failed read would leave `raw` unset and
      the loop would go on decoding an indeterminate byte. */
   perror("read");
   exit(1);
  }
  if (got == 0) {
   if (n) return 0;                       /* truncated group */
   c[0] = c[1] = c[2] = c[3] = 0x40;      /* clean end of input */
   return 1;
  }
  value = decodeC(raw);
  if (value != 0xFF) c[n++] = value;      /* keep only base64 symbols and '=' */
 }
 return 1;
}

/* Write len decoded bytes to standard output (fd 1), retrying after a
   partial write; a failed write is fatal (exit status 1). */
static void putDecoded(const unsigned char *buf, int len) {
 while (len > 0) {
  int done = write(1, buf, len);
  if (done <= 0) { fprintf(stderr, "Write error\n"); exit(1); }
  buf += done;
  len -= done;
 }
}

/*
 * Decode base64 text from standard input to standard output.
 *
 * Groups of four symbols are fetched with read4() and turned into up to
 * three bytes. Decoding stops at end of input or after the first group
 * that contains '=' padding (value 0x40).
 *
 * Exits with status 1 on a truncated group, on a '=' in the second
 * position of a group, or when the output cannot be written.
 */
void doDecode(void) {

 unsigned char c[4];
 unsigned char d[3];

 while (1) {

  if (! read4(c)) { fprintf(stderr, "Unexpected EOF\n"); exit(1); }

  /* Padding in first position: end of input, or a group starting with '='. */
  if (c[0] == 0x40) return;

  if (c[1] == 0x40) { fprintf(stderr, "= sign out of sync\n"); exit(1); }

  d[0] = (unsigned char)((c[0] << 2) | ((c[1] & 0x30) >> 4));

  /* "xx==": one byte of data, then stop. */
  if (c[2] == 0x40) { putDecoded(d, 1); return; }

  d[1] = (unsigned char)(((c[1] & 0x0F) << 4) | ((c[2] & 0x3C) >> 2));

  /* "xxx=": two bytes of data, then stop. */
  if (c[3] == 0x40) { putDecoded(d, 2); return; }

  d[2] = (unsigned char)(((c[2] & 0x03) << 6) | c[3]);
  putDecoded(d, 3);

 }

}

/*---------- doEncode ------------ */

/*
 * Fill b[0..2] from standard input (fd 0), issuing further read() calls
 * after a short read.
 *
 * Returns the number of bytes stored (0..3). A value below 3 means that
 * read() reported end of input or an error before the buffer was full.
 */
int read3(unsigned char *b) {
 int filled = 0;
 while (filled < 3) {
  int got = read(0, b + filled, 3 - filled);
  if (got <= 0) break;
  filled += got;
 }
 return filled;
}

/* Base64 alphabet: encoding[v] is the output character for the 6-bit value v. */
static unsigned char encoding[] =
 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
 "abcdefghijklmnopqrstuvwxyz"
 "0123456789"
 "+/";

/* Write len bytes of encoder output to standard output (fd 1), retrying
   after a partial write; a failed write is fatal (exit status 1). */
static void putEncoded(const void *buf, int len) {
 const char *p = buf;
 while (len > 0) {
  int done = write(1, p, len);
  if (done <= 0) { fprintf(stderr, "Write error\n"); exit(1); }
  p += done;
  len -= done;
 }
}

/*
 * Base64-encode standard input to standard output.
 *
 * Input is taken three bytes at a time with read3(); each group becomes
 * four output characters. A final group of one or two bytes is padded
 * with '='.
 *
 * doFold: when non-zero, a newline is inserted before a group that would
 *         make the current line longer than 40 characters, and non-empty
 *         output is terminated by a newline. The check also covers the
 *         final padded group, so no line exceeds 40 characters.
 */
void doEncode(int doFold) {

 unsigned char b[3];
 unsigned char c[4];
 int thisLineLength = 0;

 for (;;) {

  int got = read3(b);
  if (got == 0) break;

  /* Zero the bytes a short final group did not supply, so the bit
     packing below is the same for full and partial groups. */
  if (got < 3) b[2] = 0;
  if (got < 2) b[1] = 0;

  c[0] = encoding[b[0] >> 2];
  c[1] = encoding[((b[0] & 0x03) << 4) | (b[1] >> 4)];
  c[2] = (got > 1) ? encoding[((b[1] & 0x0F) << 2) | (b[2] >> 6)] : '=';
  c[3] = (got > 2) ? encoding[b[2] & 0x3F] : '=';

  /* More than 36 characters on the line: these four would not fit in 40. */
  if (doFold && (thisLineLength > 36)) { putEncoded("\n", 1); thisLineLength = 0; }
  putEncoded(c, 4);
  thisLineLength += 4;

  /* A short group is only returned once the input is exhausted. */
  if (got < 3) break;

 }

 if (doFold && thisLineLength) putEncoded("\n", 1);

}

/*------------ main -------------- */

/*
 * Program entry point: base64 filter from standard input to standard output.
 *
 * Options:
 *   -d  decode instead of encode
 *   -f  passed to doEncode() as the fold flag
 * A repeated option, an unknown option or any non-option argument makes
 * Usage() print the help text and exit with status 2.
 *
 * Returns 0 once the selected operation has returned.
 */
int main(int argc, char *argv[]) {

 /* Declared locally so the call is prototyped even when <unistd.h>
    does not expose getopt under the active feature-test macros. */
 extern int getopt(int, char * const [], const char *);
 extern int optind, opterr;

 int decode = 0;
 int doFold = 0;
 int c;

 opterr = 0;   /* keep getopt quiet; Usage() does the reporting */

 while ((c = getopt(argc, argv, "df")) != -1) {
  switch (c) {
   case 'd':
    if (decode) Usage(argv[0]);
    decode = 1;
    break;
   case 'f':
    if (doFold) Usage(argv[0]);
    doFold = 1;
    break;
   case '?':
   default:
    Usage(argv[0]);
    break;
  }
 }

 /* No positional arguments are accepted. */
 if (argc - optind != 0) Usage(argv[0]);

 if (decode) doDecode();
 else        doEncode(doFold);

 return 0;

}