[Date Prev][Date Next] [Chronological] [Thread] [Top]

Re: Umlauts (was Re: Charset nightmare)

To: Michael Brehl - FRA HY 3 <mbr@condor.de>
Subject: Re: Umlauts (was Re: Charset nightmare)
From: "Baruzzi Giovanni" <giovanni.baruzzi@allianz-leben.de>
Date: Thu, 03 Dec 1998 18:23:07 +0100
Cc: openldap-general@openldap.org
Organization: Allianz Lebensversicherungs-AG
References: <199812031439.OAA29136@mail.condor.de>


Michael Brehl - FRA HY 3 wrote:

> (KEIN VIRUS GEFUNDEN)
> Yann wrote:
>
> > messenger.  I also tried UTF8 encoding, works fine for Netscape but
> > display nonsense in OE.
> I got stuck with the same problem - german umlauts.
> Can one please give me an example, how to enter
> the specific UTF-8 character in an ldif file ? I
> tried the RFC 2253 notation, but it did not work.
>
> Thanx in advance,
> Michael
>
> --
>                                 Michael Brehl
>                                 FRA HY 3
>      _/_/_/ _/_/_/_/_/ _/_/_/   Networking & Systems
>   _/_/     _/_/      _/_/       Condor Flugdienst GmbH
>  _/_/     _/_/_/_/  _/_/  _/_/  Phone: +49(+6107)939-179
> _/_/     _/_/      _/_/    _/   Fax:   +49(+6107)939-710
>  _/_/_/ _/_/        _/_/_/_/    Email: Michael.Brehl@condor.de

Hallo,

I have already faced this problem.
I compiled the 2 utilities enclosed here and used them to process the
original input file, coded in Latin-1.
The result is then fed to LDAPADD and the results are very good.
If you try to export the data again, the LDIF export will translate every
string containing an accented character into BASE64 (see the Perl utilities
below) and the result is not readable, but it is correct.

If you have an LDIF file originating from Windows, you should adapt the
utilities, because they are valid only for Latin-1, or use my simple Perl
subroutine to convert just the umlauts and the Eszett.


Best regards
Mit freundlichen Grüßen aus Stuttgart

Giovanni Baruzzi


===for Windows============================================================

#
# Convert the German national characters of a single-byte (Latin-1 or
# Windows-1252) string to their two-byte UTF-8 sequences.  The seven
# characters handled have the same byte value in both code pages.
#
# Byte translations (hex):
#   E4 (a-umlaut)  => C3 A4
#   F6 (o-umlaut)  => C3 B6
#   FC (u-umlaut)  => C3 BC
#   C4 (A-umlaut)  => C3 84
#   D6 (O-umlaut)  => C3 96
#   DC (U-umlaut)  => C3 9C
#   DF (sharp s)   => C3 9F
#
# All bytes are written as hex escapes so that the routine does not depend
# on the encoding this source file happens to be saved or transmitted in.
#
# Parameter: the byte string to convert.
# Returns:   the converted copy; every other byte is passed through
#            unchanged.  An undefined argument is returned as is.
#
sub utf8 {
    my ($parm) = @_;
    return $parm unless defined $parm;

    my %utf8_for = (
        "\xE4" => "\xC3\xA4",
        "\xF6" => "\xC3\xB6",
        "\xFC" => "\xC3\xBC",
        "\xC4" => "\xC3\x84",
        "\xD6" => "\xC3\x96",
        "\xDC" => "\xC3\x9C",
        "\xDF" => "\xC3\x9F",
    );

    # One pass over the string, so bytes produced by a replacement are
    # never examined again.
    $parm =~ s/([\xE4\xF6\xFC\xC4\xD6\xDC\xDF])/$utf8_for{$1}/g;
    return $parm;
}
======================================================================

==BASE64=============================================================================

# Historically this module has been implemented as pure perl code.
# The XS implementation runs about 20 times faster, but the Perl
# code might be more portable, so it is still here.

use integer;

# Encode a byte string as Base64 in pure Perl, built on pack('u').
#
#   first argument   data to encode
#   second argument  optional line terminator; it is appended after every
#                    76 output characters and after the last line.
#                    Defaults to "\n"; pass "" for one unbroken string.
#
# Returns the encoded text ("" for empty input).
sub old_encode_base64 ($;$)
{
    # Work on a private copy: the //g scan below then always starts at the
    # beginning and the caller's variable (and its pos()) is left alone.
    my ($data, $eol) = @_;
    $eol = "\n" unless defined $eol;

    my $res = "";
    while ($data =~ /(.{1,45})/gs) {
        # pack('u') yields one uuencoded line: a length character, the
        # payload and a trailing newline.  Only the payload is wanted.
        $res .= substr(pack('u', $1), 1, -1);
    }
    $res =~ tr|` -_|AA-Za-z0-9+/|;               # `# help emacs

    # fix padding at the end
    my $padding = (3 - length($data) % 3) % 3;
    substr($res, -$padding, $padding, '=' x $padding) if $padding;

    # break encoded string into lines of no more than 76 characters each
    if (length $eol) {
        $res =~ s/(.{1,76})/$1$eol/g;
    }
    return $res;
}

==BASE64=====================================================================

# Decode Base64 text in pure Perl, built on unpack('u').
#
#   argument  the Base64 text; any character outside the Base64 alphabet
#             (whitespace, line breaks, ...) is discarded first.
#
# Returns the decoded bytes.
# Croaks when the cleaned-up input length is not a multiple of 4.
sub old_decode_base64 ($)
{
    local($^W) = 0; # unpack("u",...) gives bogus warning in 5.00[123]

    my $str = shift;
    $str = "" unless defined $str;
    my $res = "";

    $str =~ tr|A-Za-z0-9+=/||cd;            # remove non-base64 chars
    if (length($str) % 4) {
        require Carp;
        Carp::croak("Base64 decoder requires string length to be a multiple of 4");
    }
    $str =~ s/=+$//;                        # remove padding
    $str =~ tr|A-Za-z0-9+/| -_|;            # convert to uuencoded format

    # Feed unpack('u') one uuencoded line at a time: 60 characters carry
    # 45 bytes, the most a single uuencoded line may hold.
    while ($str =~ /(.{1,60})/gs) {
        # Length byte = number of decoded bytes in this line.  int() makes
        # the truncation explicit instead of relying on "use integer".
        my $len = chr(32 + int(length($1) * 3 / 4));
        $res .= unpack("u", $len . $1);     # uudecode
    }
    return $res;
}

==LATIN-1 TO UTF-8========================================================
/* Read Latin-1 (ISO-8859-1) characters from stdin, convert them
   to UTF-8, and write the converted characters to stdout.
   UTF-8 is defined by RFC 3629 (which obsoletes RFC 2279 and RFC 2044).
*/
#include <errno.h>
#include <stdio.h>

/* Copy stdin to stdout, re-encoding every byte as UTF-8.
   Bytes 0x00-0x7F are copied unchanged; bytes 0x80-0xFF become the
   two-byte sequence 110000xx 10xxxxxx.
   Returns 0 on success, 1 if reading stdin or writing stdout failed
   (a message prefixed with the program name is printed to stderr).
*/
int
main (int argc, char** argv)
{
    int c;

    (void) argc;
    while ((c = getchar()) != EOF) {
        if ((c & 0x80) == 0) {
            putchar (c);
        } else {
            putchar (0xC0 | (0x03 & (c >> 6)));
            putchar (0x80 | (0x3F & c));
        }
    }
    /* ferror() only reports a flag, not an errno value, so it must not be
       assigned to errno; perror() uses the errno left by the failed call. */
    if (ferror (stdin)) {
        perror (argv[0]);
        return 1;
    }
    if (ferror (stdout) || fflush (stdout) == EOF) {
        perror (argv[0]);
        return 1;
    }
    return 0;
}

================================================================
/* Read UTF-8 characters from stdin, convert them to Latin-1
   (ISO-8859-1), and write the converted characters to stdout.
   UTF-8 is defined by RFC 3629 (which obsoletes RFC 2279 and RFC 2044).
*/
#include <errno.h>
#include <stdio.h>

static char UTF8len[64]
/* Lookup table indexed by the most-significant 6 bits of a byte
   (byte >> 2).  Each entry is the total number of bytes in the UTF-8
   sequence that starts with such a byte; 0 marks a byte that cannot
   start a sequence.
*/
= {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* bytes 0x00-0x3F */
   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* bytes 0x40-0x7F */
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* bytes 0x80-0xBF: erroneous as a first byte */
   2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 6}; /* 0xC0-0xDF: 2, 0xE0-0xEF: 3, 0xF0-0xF7: 4, 0xF8-0xFB: 5, 0xFC-0xFF: 6 */

/* Copy stdin to stdout, converting UTF-8 to Latin-1.
   Each well-formed character in the range U+0000-U+00FF is written as one
   byte.  One '?' is written instead for every character above U+00FF and
   for every malformed sequence: a continuation byte without a lead byte,
   a sequence cut short by a new character or by end of input, or an
   overlong encoding.
   Returns 0 on success, 1 if reading stdin or writing stdout failed
   (a message prefixed with the program name is printed to stderr).
*/
int
main (int argc, char** argv)
{
    int c;

    (void) argc;
    while ((c = getchar()) != EOF) {
        int len = UTF8len [(c >> 2) & 0x3F];
        int nbytes = len;     /* length announced by the first byte */
        int malformed = 0;
        unsigned long u = 0;

        switch (len) {
          case 6: u = c & 0x01; break;
          case 5: u = c & 0x03; break;
          case 4: u = c & 0x07; break;
          case 3: u = c & 0x0F; break;
          case 2: u = c & 0x1F; break;
          case 1: u = c & 0x7F; break;
          default: /* 0: c is the middle of a character. */
            /* Skip up to 4 further continuation bytes to resynchronise. */
            malformed = 1;
            len = 5;
            break;
        }
        while (--len && (c = getchar()) != EOF) {
            if ((c & 0xC0) == 0x80) {
                u = (u << 6) | (c & 0x3F);
            } else { /* unexpected start of a new character */
                ungetc (c, stdin);
                malformed = 1;
                break;
            }
        }
        if (c == EOF) {
            /* Input ended inside a character: report it rather than
               dropping it silently (unless the read itself failed). */
            if ( ! ferror (stdin)) {
                putchar ('?');
            }
            break;
        }
        /* Only a 1-byte sequence, or a 2-byte sequence encoding
           0x80-0xFF, is a valid shortest-form encoding of a Latin-1
           character; anything else is overlong or out of range. */
        if ( ! malformed
             && (nbytes == 1 || (nbytes == 2 && u >= 0x80 && u <= 0xFF))) {
            putchar ((int) u);
        } else { /* this character can't be represented in Latin-1 */
            putchar ('?'); /* a reasonable alternative is 0x1A (SUB) */
        }
    }
    /* ferror() only reports a flag, not an errno value, so it must not be
       assigned to errno; perror() uses the errno left by the failed call. */
    if (ferror (stdin)) {
        perror (argv[0]);
        return 1;
    }
    if (ferror (stdout) || fflush (stdout) == EOF) {
        perror (argv[0]);
        return 1;
    }
    return 0;
}

====================================================================

begin:vcard 
n:Baruzzi;Giovanni
tel;work:+49-7031-663-1421
x-mozilla-html:FALSE
adr:;;;;;;
version:2.1
email;internet:giovanni.baruzzi@allianz-leben.de
fn:Giovanni Baruzzi
end:vcard

References:
- Umlauts (was Re: Charset nightmare)
  - From: "Michael Brehl - FRA HY 3" <mbr@condor.de>

Prev by Date: Umlauts (was Re: Charset nightmare)
Next by Date: Re: [Q] HOWTO PGP Public Key Server ...
Index(es):
- Chronological
- Thread