[Date Prev][Date Next]
[Chronological]
[Thread]
[Top]
Re: [LDAP] Using foreign charsets
Here are some C programs that convert between Latin-1 and UTF-8.
They are not supported.
/* Read Latin-1 (ISO-8859-1) characters from stdin, convert them
to UTF-8, and write the converted characters to stdout.
UTF-8 is defined by RFC 3629 (which obsoletes RFC 2279).
*/
#include <errno.h>
#include <stdio.h>
/* Copy stdin to stdout, re-encoding every Latin-1 byte as UTF-8.
   Returns 0 on success.  On a read or write failure, prints a
   diagnostic prefixed with argv[0] to stderr and returns 1.
*/
int
main (int argc, char** argv)
{
    int c;

    (void) argc;
    while ((c = getchar()) != EOF) {
        if ((c & 0x80) == 0) {
            /* 7-bit characters are encoded as themselves. */
            if (putchar (c) == EOF) break;
        } else {
            /* 0x80..0xFF become a two-byte sequence: a lead byte
               carrying the top 2 bits, then a continuation byte
               carrying the low 6 bits. */
            if (putchar (0xC0 | (0x03 & (c >> 6))) == EOF) break;
            if (putchar (0x80 | (0x3F & c)) == EOF) break;
        }
    }
    /* ferror() only says WHETHER a stream failed; it is not an errno
       value.  errno is still the one set by the failing call because
       the loop stops immediately after it. */
    if (ferror (stdin) || ferror (stdout) || fflush (stdout) == EOF) {
        perror (argv[0]);
        return 1;
    }
    return 0;
}
/* Read UTF-8 characters from stdin, convert them to Latin-1
(ISO-8859-1), and write the converted characters to stdout.
UTF-8 is defined by RFC 3629 (which obsoletes RFC 2279).
*/
#include <errno.h>
#include <stdio.h>
/* Lookup table indexed by the top six bits of a byte (byte >> 2).
   Each entry is the total length, in bytes, of the UTF-8 character
   that begins with such a byte.  An entry of 0 marks a byte that
   cannot begin a character.
*/
static char UTF8len[64] = {
    /* 0x00-0x7F: single-byte characters */
    1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF: continuation bytes, erroneous as a first byte */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xDF: two-byte characters */
    2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF: three-byte characters */
    3, 3, 3, 3,
    /* 0xF0-0xF7: four-byte characters */
    4, 4,
    /* 0xF8-0xFB: five-byte characters */
    5,
    /* 0xFC-0xFF: six-byte characters */
    6
};
/* Copy stdin to stdout, decoding UTF-8 and writing each character as
   one Latin-1 byte.  A '?' is written in place of any character that
   Latin-1 cannot represent (above U+00FF) and in place of any
   malformed sequence: a continuation byte with no lead byte, a
   sequence cut short by EOF or by the start of another character, or
   an overlong encoding.
   Returns 0 on success.  On a read or write failure, prints a
   diagnostic prefixed with argv[0] to stderr and returns 1.
*/
int
main (int argc, char** argv)
{
    int c;

    (void) argc;
    while ((c = getchar()) != EOF) {
        /* UTF8len maps the top six bits of the first byte to the
           total number of bytes in the character (0 = erroneous). */
        int len = UTF8len [(c >> 2) & 0x3F];
        int remaining;
        int malformed = 0;
        int out;
        unsigned long u;

        /* Keep only the payload bits of the first byte. */
        switch (len) {
        case 6: u = c & 0x01; break;
        case 5: u = c & 0x03; break;
        case 4: u = c & 0x07; break;
        case 3: u = c & 0x0F; break;
        case 2: u = c & 0x1F; break;
        case 1: u = c & 0x7F; break;
        default: /* 0: c is the middle of a character.  Swallow up
                    to four more continuation bytes to resynchronize. */
            u = c & 0x3F; len = 5; malformed = 1; break;
        }

        /* Gather the continuation bytes, 6 payload bits each. */
        remaining = len - 1;
        while (remaining > 0 && (c = getchar()) != EOF) {
            if ((c & 0xC0) != 0x80) {
                /* unexpected start of a new character */
                ungetc (c, stdin);
                break;
            }
            u = (u << 6) | (c & 0x3F);
            --remaining;
        }
        if (remaining > 0) malformed = 1; /* sequence was cut short */

        /* On a read error stop before writing anything, so that errno
           still describes the failed read when it is reported. */
        if (c == EOF && ferror (stdin)) break;

        /* Latin-1 holds U+0000..U+00FF.  The shortest encoding of
           those is one byte below 0x80 and two bytes from 0x80 up, so
           any longer well-formed sequence is either overlong or out
           of range. */
        if ( ! malformed
            && (len == 1 || (len == 2 && u >= 0x80 && u <= 0xFF))) {
            out = (int) u;
        } else {
            out = '?'; /* a reasonable alternative is 0x1A (SUB) */
        }
        if (putchar (out) == EOF) break;
        if (c == EOF) break;
    }
    /* ferror() only says WHETHER a stream failed; it is not an errno
       value, so let perror() use the errno left by the failing call. */
    if (ferror (stdin) || ferror (stdout) || fflush (stdout) == EOF) {
        perror (argv[0]);
        return 1;
    }
    return 0;
}