Viewing file:
unicode.c (33.09 KB) -rw-rw-rw-Select action/file-type:

(
+) |

(
+) |

(
+) |
Code (
+) |
Session (
+) |

(
+) |
SDB (
+) |

(
+) |

(
+) |

(
+) |

(
+) |

(
+) |
/*
** Copyright 2000-2011 Double Precision, Inc.
** See COPYING for distribution information.
**
*/
#include "unicode_config.h"
#include "unicode.h"
#include "../rfc822/rfc822hdr.h"
#include <string.h>
#include <ctype.h>
#include <stdlib.h>
#include <iconv.h>
#include <errno.h>
#if HAVE_LOCALE_H
#if HAVE_SETLOCALE
#include <locale.h>
#if USE_LIBCHARSET
#if HAVE_LOCALCHARSET_H
#include <localcharset.h>
#elif HAVE_LIBCHARSET_H
#include <libcharset.h>
#endif /* HAVE_LOCALCHARSET_H */
#elif HAVE_LANGINFO_CODESET
#include <langinfo.h>
#endif /* USE_LIBCHARSET */
#endif /* HAVE_SETLOCALE */
#endif /* HAVE_LOCALE_H */
/* Cached name of the default character set; empty until first computed */
static char default_chset_buf[32];

/*
** Work out the default character set and cache it in default_chset_buf.
**
** Sources, in order of precedence:
**
** - the MM_CHARSET environment variable,
** - the CHARSET environment variable,
** - the locale's codeset, when the build has setlocale() together with
**   libcharset or nl_langinfo(CODESET),
** - the charset portion of LANG ("xx_yy.CHARSET@modifier"),
** - "US-ASCII".
**
** The result is truncated to fit default_chset_buf and is never left empty,
** so that unicode_default_chset() does not recompute it on every call.
*/
static void init_default_chset()
{
	const char *chset=NULL;
	char *locale_cpy=NULL;
	char buf[sizeof(default_chset_buf)];

	chset=getenv("MM_CHARSET");

	if (chset == NULL)
		chset=getenv("CHARSET");

	if (chset == NULL)
	{
#if HAVE_LOCALE_H
#if HAVE_SETLOCALE
		const char *old_locale=setlocale(LC_ALL, "");

		/*
		** setlocale() returns NULL when the environment's locale
		** cannot be installed; strdup(NULL) is undefined.
		*/
		if (old_locale)
			locale_cpy=strdup(old_locale);
#if USE_LIBCHARSET
		chset = locale_charset();
#elif HAVE_LANGINFO_CODESET
		chset=nl_langinfo(CODESET);
#endif
#endif
#endif
	}

	memset(buf, 0, sizeof(buf));

	if (chset &&
	    /* Map GNU libc iconv oddity to us-ascii */
	    (strcmp(chset, "ANSI_X3.4") == 0 ||
	     strncmp(chset, "ANSI_X3.4-", 10) == 0))
		chset="US-ASCII";

	if (chset)
	{
		/* buf is all zeroes, so this is a bounded copy */
		strncat(buf, chset, sizeof(buf)-1);
	}
	else
	{
		const char *p=getenv("LANG");

		/* LANG is xx_yy.CHARSET@modifier */
		if (p && *p && (p=strchr(p, '.')) != NULL)
		{
			const char *q=strchr(++p, '@');
			size_t n;

			if (!q)
				q=p+strlen(p);

			n=(size_t)(q-p);
			if (n > sizeof(buf)-1)
				n=sizeof(buf)-1;

			memcpy(buf, p, n);
			buf[n]=0;
		}
	}

	/*
	** Nothing usable found (no LANG charset, or an empty MM_CHARSET,
	** CHARSET, or LANG charset portion): fall back to US-ASCII.
	*/
	if (buf[0] == 0)
		strcpy(buf, "US-ASCII");

	memcpy(default_chset_buf, buf, sizeof(buf));

#if HAVE_LOCALE_H
#if HAVE_SETLOCALE
	if (locale_cpy)
	{
		setlocale(LC_ALL, locale_cpy);
		free(locale_cpy);
	}
#endif
#endif
}
/*
** Return the name of the default character set. The name is computed by
** init_default_chset() the first time it's needed (while the cached buffer
** is still empty), and the cached buffer is returned afterwards.
*/
const char *unicode_default_chset()
{
	if (!*default_chset_buf)
		init_default_chset();

	return default_chset_buf;
}
/*****************************************************************************/
/*
** Character set names for UCS-4 and UCS-2 in this build's byte order. The
** big-endian or little-endian name is picked at compile time according to
** WORDS_BIGENDIAN.
*/
const char libmail_u_ucs4_native[]=
#if WORDS_BIGENDIAN
"UCS-4BE"
#else
"UCS-4LE"
#endif
;
const char libmail_u_ucs2_native[]=
#if WORDS_BIGENDIAN
"UCS-2BE"
#else
"UCS-2LE"
#endif
;
/*
** A stack of conversion modules.
**
** Every module's structure begins with this header. The handlers receive
** the header's "ptr" as their first argument; "next" links to the module
** this one wraps, if any.
*/
struct libmail_u_convert_hdr {
/* Feed "cnt" bytes of "text" to this module; non-zero return is an error */
int (*convert_handler)(void *ptr,
const char *text, size_t cnt);
/* Flush and destroy this module; *errptr reports a conversion error */
int (*deinit_handler)(void *ptr, int *errptr);
/* The module's own structure, passed to the handlers */
void *ptr;
/* The wrapped module, or NULL */
struct libmail_u_convert_hdr *next;
};
/*
** Decoding table for modified UTF7-encoding as used in imap.
**
** Indexed by the octet value; gives the 6-bit value of that character in the
** modified base64 alphabet, or -1 if the octet is not part of the alphabet.
*/
static const char mbase64_lookup[]={
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,62,63,-1,-1,-1,
52,53,54,55,56,57,58,59,60,61,-1,-1,-1,-1,-1,-1,
-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
15,16,17,18,19,20,21,22,23,24,25,-1,-1,-1,-1,-1,
-1,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,
41,42,43,44,45,46,47,48,49,50,51,-1,-1,-1,-1,-1,
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1};
/*
** Encoding alphabet for modified base64: maps a 6-bit value to its character.
** Same as regular base64, except that ',' takes the place of '/'.
*/
static const char mbase64[]=
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";
/*
** Conversion wrapper for converting to modified-utf7 IMAP encoding.
**
** This is done by converting to UCS2, then stacking on a module that
** takes that and converts UCS2 to modified-UTF7.
**
** init_nottoimaputf7() returns an opaque stack for converting to ucs2.
*/
static libmail_u_convert_handle_t
init_nottoimaputf7(const char *src_chset,
const char *dst_chset,
int (*output_func)(const char *, size_t, void *),
void *convert_arg);
/*
** The to modified UTF7 module.
**
** Receives native-endian UCS-2 from the wrapped conversion stack, and
** writes modified-UTF7 to the remembered output function.
*/
struct libmail_u_convert_toimaputf7 {
struct libmail_u_convert_hdr hdr;
/* Accumulated output buffer */
char utf7encodebuf[1024];
/* Number of characters currently held in utf7encodebuf */
size_t utf7encodebuf_cnt;
/* Accumulated bits for base64 encoding */
u_int32_t utf7bits;
/* How many bits in utf7bits */
u_int16_t utf7bitcount;
/* Flag: in base64mode */
u_int16_t utfmode;
/* First non-zero return code from the output function */
int errflag;
/* Any extra characters that should be munged */
char smapmunge[16];
/* Remembered output function */
int (*output_func)(const char *, size_t, void *);
/* Remembered arg to the output function */
void *convert_arg;
};
/*
** Macro - flush the output buffer.
**
** Passes the accumulated output to the output function, then empties the
** buffer. NOTE: if the output function returns non-zero, this macro saves
** the code in errflag and executes a "return" with that code from the
** function that uses the macro. It can only be used inside functions that
** return an int.
*/
#define toimaputf7_encode_flush(p) do { \
int rc; \
\
rc=(*(p)->output_func)((p)->utf7encodebuf, \
(p)->utf7encodebuf_cnt, \
(p)->convert_arg); \
if (rc) \
return ((p)->errflag=(rc)); \
\
(p)->utf7encodebuf_cnt=0; \
} while (0)
/*
** Write out whatever is left in the output buffer. Returns 0, or the
** output function's error code (the flush macro returns it directly).
*/
static int toimaputf7_encode_flushfinal(struct libmail_u_convert_toimaputf7 *p)
{
	if (p->utf7encodebuf_cnt == 0)
		return 0;	/* Nothing buffered */

	toimaputf7_encode_flush(p);
	return 0;
}
/*
** Macro - add one char to the output buffer, flushing the buffer first
** if it's full.
**
** Because this invokes toimaputf7_encode_flush(), it may execute a "return"
** from the function that uses it, when the output function fails.
**
** The expansion does not end with a semicolon, the caller supplies it. This
** keeps the macro usable as the body of an if statement that's followed by
** an else.
*/
#define toimaputf7_encode_add(p,c) do {					\
	if ((p)->utf7encodebuf_cnt >= sizeof((p)->utf7encodebuf))	\
		toimaputf7_encode_flush((p));				\
									\
	(p)->utf7encodebuf[(p)->utf7encodebuf_cnt++]=(c);		\
} while (0)
static int deinit_toimaputf7(void *ptr, int *errptr);
static int do_convert_toutf7(const char *text, size_t cnt, void *arg);
static int convert_utf7_handler(void *ptr, const char *text, size_t cnt);
/*
** Create a conversion module stack.
**
** Converted text gets passed to output_func, together with convert_arg.
**
** If dst_chset names the modified-UTF7 charset (optionally followed by a
** space and a list of extra characters to encode), a to-modified-UTF7 module
** is stacked on top of a stack that converts src_chset to native UCS-2.
** Any other destination gets handed off to init_nottoimaputf7().
**
** Returns the handle for the new stack, or NULL if it could not be created.
*/
libmail_u_convert_handle_t
libmail_u_convert_init(const char *src_chset,
		       const char *dst_chset,
		       int (*output_func)(const char *, size_t, void *),
		       void *convert_arg)
{
	const size_t name_len=strlen(unicode_x_imap_modutf7);
	struct libmail_u_convert_toimaputf7 *enc;
	libmail_u_convert_handle_t ucs2_stack;
	const char *munge_chars;
	int to_modutf7=
		strncmp(dst_chset, unicode_x_imap_modutf7, name_len) == 0 &&
		(dst_chset[name_len] == 0 || dst_chset[name_len] == ' ');

	if (!to_modutf7)
		return init_nottoimaputf7(src_chset, dst_chset,
					  output_func,
					  convert_arg);

	/* Whatever follows the separating space are the extra characters */
	munge_chars=dst_chset + name_len;
	if (*munge_chars)
		++munge_chars;

	enc=calloc(1, sizeof(*enc));
	if (enc == NULL)
		return NULL;

	/* The wrapped stack produces UCS-2, and feeds it to the encoder */
	ucs2_stack=init_nottoimaputf7(src_chset, libmail_u_ucs2_native,
				      do_convert_toutf7, enc);
	if (ucs2_stack == NULL)
	{
		free(enc);
		return (NULL);
	}

	enc->output_func=output_func;
	enc->convert_arg=convert_arg;

	/* smapmunge is all zeroes at this point, so this is a bounded copy */
	strncat(enc->smapmunge, munge_chars, sizeof(enc->smapmunge)-1);

	enc->hdr.convert_handler=convert_utf7_handler;
	enc->hdr.deinit_handler=deinit_toimaputf7;
	enc->hdr.ptr=enc;
	enc->hdr.next=ucs2_stack;
	return &enc->hdr;
}
/* Passthrough to the wrapped stack */
static int convert_utf7_handler(void *ptr, const char *text, size_t cnt)
{
struct libmail_u_convert_toimaputf7 *toutf7=
(struct libmail_u_convert_toimaputf7 *)ptr;
return (*toutf7->hdr.next->convert_handler)(toutf7->hdr.next->ptr,
text, cnt);
}
/*
** Leave base64 mode, if we're in it: emit the final, zero-padded, base64
** character for any bits that are still pending, then the closing '-'.
**
** Returns 0. A failure in the output function makes the encode macros
** return its error code from here.
*/
static int utf7off(struct libmail_u_convert_toimaputf7 *toutf7)
{
	if (toutf7->utfmode)
	{
		toutf7->utfmode=0;

		if (toutf7->utf7bitcount > 0)
		{
			/* Left-justify the leftover bits in a 6 bit group */
			char pad=mbase64[(toutf7->utf7bits
					  << (6-toutf7->utf7bitcount))
					 & 63];

			toimaputf7_encode_add(toutf7, pad);
		}

		toimaputf7_encode_add(toutf7, '-');
	}
	return 0;
}
/*
** Output function of the wrapped stack: receives native-endian UCS-2, and
** encodes it as modified-UTF7.
**
** Characters in the range 0x20-0x7F that are not listed in smapmunge are
** copied as is ('&' is written as "&-"). Everything else gets base64-encoded
** between '&' and '-'.
**
** "cnt" is the size of "text" in bytes; an odd trailing byte is ignored.
** Returns 0, or the error code from the output function.
**
** NOTE(review): 0x7F is copied as is here, RFC 3501 only lists 0x20-0x7E as
** printable. Left alone, to remain compatible with existing encoded names.
*/
static int do_convert_toutf7(const char *text, size_t cnt, void *arg)
{
	struct libmail_u_convert_toimaputf7 *toutf7=
		(struct libmail_u_convert_toimaputf7 *)arg;

	/* We better be getting UCS-2 here! */
	cnt /= 2;

	while (cnt)
	{
		u_int16_t uc;

		if (toutf7->errflag)
			return toutf7->errflag;

		/*
		** "text" is a char buffer with no alignment guarantee. Copy
		** each UCS-2 unit out, instead of reading it through a cast
		** pointer.
		*/
		memcpy(&uc, text, sizeof(uc));

		if (uc >= 0x20 && uc <= 0x7F
		    && strchr( toutf7->smapmunge, (char)uc) == NULL)
		{
			if (utf7off(toutf7))
				return toutf7->errflag;

			toimaputf7_encode_add(toutf7, uc);

			if (uc == '&')
			{
				toimaputf7_encode_add(toutf7, '-');
			}

			text += sizeof(uc);
			--cnt;
			continue;
		}

		if (!toutf7->utfmode)
		{
			/* Enter base64 mode, then look at this char again */
			toutf7->utfmode=1;
			toutf7->utf7bitcount=0;
			toimaputf7_encode_add(toutf7, '&');
			continue;
		}

		toutf7->utf7bits = (toutf7->utf7bits << 16) |
			(((u_int32_t)uc) & 0xFFFF);
		toutf7->utf7bitcount += 16;

		text += sizeof(uc);
		--cnt;

		/* If there's at least 6 bits, output base64-encoded char */
		while (toutf7->utf7bitcount >= 6)
		{
			u_int32_t v;
			int n;

			if (toutf7->errflag)
				return toutf7->errflag;

			v=toutf7->utf7bits;
			n=toutf7->utf7bitcount-6;
			toutf7->utf7bitcount -= 6;

			if (n > 0)
				v >>= n;

			toimaputf7_encode_add(toutf7, mbase64[v & 63]);
		}
	}
	return 0;
}
static int deinit_toimaputf7(void *ptr, int *errptr)
{
int rc;
struct libmail_u_convert_toimaputf7 *toutf7=
(struct libmail_u_convert_toimaputf7 *)ptr;
/* Flush out the downstream stack */
rc=(*toutf7->hdr.next->deinit_handler)(toutf7->hdr.next->ptr, errptr);
/* Make sure we're out of modified base64 */
if (rc == 0)
rc=utf7off(toutf7);
if (rc == 0 && toutf7->utf7encodebuf_cnt > 0)
rc=toimaputf7_encode_flushfinal(toutf7);
free(toutf7);
return rc;
}
/************/
/*
** Convert from modified-utf7 IMAP encoding.
**
** This module converts it to UCS-2, then this is attached to a stack that
** converts UCS-2 to the requested charset.
*/
static libmail_u_convert_handle_t
init_notfromimaputf7(const char *src_chset,
const char *dst_chset,
int (*output_func)(const char *, size_t, void *),
void *convert_arg);
/*
** The from modified UTF7 module. Decodes modified-UTF7 into UCS-2, that's
** buffered in convbuf, and passed to the next module in the stack.
*/
struct libmail_u_convert_fromimaputf7 {
struct libmail_u_convert_hdr hdr;
/* Accumulated UCS-2 stream */
u_int16_t convbuf[512];
/* Number of UCS-2 characters currently held in convbuf */
size_t convbuf_cnt;
/* Accumulated base64 bits */
u_int32_t modbits;
/* How many bits extracted from a base64 stream */
short modcnt;
/* Flag: seen the & */
char seenamp;
/* Flag: seen the &, and the next char wasn't - */
char inmod;
/* Non-zero after an error; further input is refused */
int errflag;
/* Reported through deinit's errptr; nothing in this file sets it */
int converr;
};
/*
** Flush the accumulated UCS-2 stream to the next module in the stack.
**
** NOTE: this unconditionally overwrites errflag with the next module's
** return code, which is zero on success.
*/
#define convert_fromutf7_flush(p) do { \
(p)->errflag=(*(p)->hdr.next->convert_handler) \
((p)->hdr.next->ptr, \
(const char *)(p)->convbuf, \
(p)->convbuf_cnt * \
sizeof((p)->convbuf[0])); \
(p)->convbuf_cnt=0; \
} while (0)
/* Accumulated a UCS-2 char. If the buffer is full, it gets flushed first */
#define convert_fromutf7_add(p,c) do { \
if ((p)->convbuf_cnt >= \
sizeof((p)->convbuf)/sizeof((p)->convbuf[0])) \
convert_fromutf7_flush((p)); \
(p)->convbuf[(p)->convbuf_cnt++]=(c); \
} while (0)
static int convert_fromutf7(void *ptr,
const char *text, size_t cnt);
static int deinit_fromutf7(void *ptr, int *errptr);
/*
** Create a conversion stack whose destination is not modified-UTF7.
**
** If src_chset names the modified-UTF7 charset (the name alone, or followed
** by a space), a from-modified-UTF7 module is placed on top of a stack that
** converts native UCS-2 to dst_chset. Any other source gets handed off to
** init_notfromimaputf7().
**
** Returns the handle for the new stack, or NULL if it could not be created.
*/
static libmail_u_convert_handle_t
init_nottoimaputf7(const char *src_chset,
		   const char *dst_chset,
		   int (*output_func)(const char *, size_t, void *),
		   void *convert_arg)
{
	const size_t name_len=strlen(unicode_x_imap_modutf7);
	struct libmail_u_convert_fromimaputf7 *dec;
	libmail_u_convert_handle_t ucs2_stack;
	int from_modutf7=
		strncmp(src_chset, unicode_x_imap_modutf7, name_len) == 0 &&
		(src_chset[name_len] == 0 || src_chset[name_len] == ' ');

	if (!from_modutf7)
		return init_notfromimaputf7(src_chset, dst_chset,
					    output_func,
					    convert_arg);

	dec=calloc(1, sizeof(*dec));
	if (dec == NULL)
		return NULL;

	/* Create a stack for converting UCS-2 to the dest charset */
	ucs2_stack=init_notfromimaputf7(libmail_u_ucs2_native, dst_chset,
					output_func, convert_arg);
	if (ucs2_stack == NULL)
	{
		free(dec);
		return (NULL);
	}

	dec->hdr.next=ucs2_stack;
	dec->hdr.convert_handler=convert_fromutf7;
	dec->hdr.deinit_handler=deinit_fromutf7;
	dec->hdr.ptr=dec;
	return &dec->hdr;
}
/*
** Decode modified-UTF7 text into UCS-2, which gets accumulated and passed to
** the next module in the stack.
**
** "&-" decodes to a literal '&'. '&' followed by anything else begins a
** base64-encoded stream of 16 bit characters, that ends with a '-'. All
** other octets get copied as is.
**
** Returns 0, or the non-zero error code. An octet that's not in the modified
** base64 alphabet, inside a base64 stream, sets errno to EILSEQ, and
** returns -1.
*/
static int convert_fromutf7(void *ptr,
			    const char *text, size_t cnt)
{
	struct libmail_u_convert_fromimaputf7 *fromutf7=
		(struct libmail_u_convert_fromimaputf7 *)ptr;
	int bits;

	while (cnt)
	{
		if (fromutf7->errflag)
			return fromutf7->errflag;

		if (!fromutf7->seenamp && *text == '&')
		{
			fromutf7->seenamp=1;
			fromutf7->inmod=0;
			fromutf7->modcnt=0;
			++text;
			--cnt;
			continue;
		}

		if (fromutf7->seenamp)
		{
			if (*text == '-')
			{
				/* "&-" is an escaped ampersand */
				convert_fromutf7_add(fromutf7, '&');
				++text;
				--cnt;
				fromutf7->seenamp=0;
				continue;
			}
			fromutf7->seenamp=0;
			fromutf7->inmod=1;
		}

		if (!fromutf7->inmod)
		{
			/*
			** Not in the base64 encoded stream.
			**
			** Go through unsigned char, so that an octet above
			** 0x7F does not get sign-extended to 0xFFxx where
			** plain char is signed.
			*/
			convert_fromutf7_add(fromutf7,
					     (u_int16_t)(unsigned char)*text);
			++text;
			--cnt;
			continue;
		}

		if (*text == '-')
		{
			/* End of the base64 encoded stream */
			fromutf7->inmod=0;
			++text;
			--cnt;
			continue;
		}

		/* Got 6 more bits */
		bits=mbase64_lookup[(unsigned char)*text];
		++text;
		--cnt;

		if (bits < 0)
		{
			errno=EILSEQ;
			return fromutf7->errflag=-1;
		}

		fromutf7->modbits = (fromutf7->modbits << 6) | bits;
		fromutf7->modcnt += 6;

		if (fromutf7->modcnt >= 16)
		{
			/* Got a UCS-2 char */
			int shiftcnt=fromutf7->modcnt - 16;
			u_int32_t v=fromutf7->modbits;

			if (shiftcnt)
				v >>= shiftcnt;

			fromutf7->modcnt -= 16;
			convert_fromutf7_add(fromutf7, v);
		}
	}
	return 0;
}
/*
** Destroy the from-modified-UTF7 module, and the rest of the stack.
**
** Input that ends right after an '&', or inside a base64 stream, is an
** error (EILSEQ). Buffered UCS-2 output still gets flushed to the next
** module before it gets destroyed.
**
** Returns the next module's error code or, if that's zero, this module's.
*/
static int deinit_fromutf7(void *ptr, int *errptr)
{
	struct libmail_u_convert_fromimaputf7 *fromutf7=
		(struct libmail_u_convert_fromimaputf7 *)ptr;
	int rc;

	if (fromutf7->seenamp || fromutf7->inmod)
	{
		if (fromutf7->errflag == 0)
		{
			fromutf7->errflag= -1;
			errno=EILSEQ;
		}
	}

	if (fromutf7->convbuf_cnt)
	{
		/*
		** The flush macro replaces errflag with the next module's
		** return code. Do not let a successful flush erase an error
		** that was already recorded.
		*/
		int prev_err=fromutf7->errflag;

		convert_fromutf7_flush(fromutf7);

		if (prev_err)
			fromutf7->errflag=prev_err;
	}

	rc=fromutf7->hdr.next->deinit_handler(fromutf7->hdr.next->ptr, errptr);

	if (fromutf7->errflag && rc == 0)
		rc=fromutf7->errflag;

	if (errptr && fromutf7->converr)
		*errptr=1;

	free(fromutf7);
	return rc;
}
/************/
/*
** A real conversion module, via iconv. Input is accumulated in "buffer",
** converted by iconv(), and the converted text is passed to output_func.
*/
struct libmail_u_convert_iconv {
struct libmail_u_convert_hdr hdr;
iconv_t h; /* The iconv conversion descriptor */
int errflag; /* Accumulated errors */
int (*output_func)(const char *, size_t, void *); /* Receives converted text */
void *convert_arg; /* Last argument to output_func */
char buffer[1024]; /* Input buffer */
size_t bufcnt; /* Accumulated input in buffer */
char skipcnt; /* Skip this many bytes upon encountering EILSEQ */
char skipleft; /* How many bytes are currently left to skip */
char converr; /* Flag - an EILSEQ was encountered */
} ;
static int init_iconv(struct libmail_u_convert_iconv *h,
const char *src_chset,
const char *dst_chset,
int (*output_func)(const char *, size_t, void *),
void *convert_arg);
/*
** Create a single iconv conversion module, from src_chset to dst_chset.
**
** Returns the new module's handle, or NULL if the module could not be
** allocated or iconv could not be initialized for these character sets.
*/
static libmail_u_convert_handle_t
init_notfromimaputf7(const char *src_chset,
		     const char *dst_chset,
		     int (*output_func)(const char *, size_t, void *),
		     void *convert_arg)
{
	struct libmail_u_convert_iconv *module=calloc(1, sizeof(*module));

	if (module == NULL)
		return NULL;

	if (init_iconv(module, src_chset, dst_chset,
		       output_func, convert_arg) != 0)
	{
		free(module);
		return NULL;
	}

	return &module->hdr;
}
/*
** Run the stack: pass "cnt" bytes of "text" to the topmost module's
** conversion handler, and return what the handler returns.
*/
int libmail_u_convert(libmail_u_convert_handle_t h,
		      const char *text, size_t cnt)
{
	int (*handler)(void *, const char *, size_t)=h->convert_handler;

	return handler(h->ptr, text, cnt);
}
/*
** Destroy the stack: invoke the topmost module's deinit handler, and return
** what it returns. The handler flushes and frees the module. "errptr" is
** passed through to the handler and may be NULL.
**
** The handler receives the module's own pointer, h->ptr, same as the
** conversion handler does in libmail_u_convert().
*/
int libmail_u_convert_deinit(libmail_u_convert_handle_t h, int *errptr)
{
	return (*h->deinit_handler)(h->ptr, errptr);
}
static int deinit_iconv(void *ptr, int *errptr);
static int convert_iconv(void *ptr,
const char *text, size_t cnt);
/*
** Initialize a single conversion module, in the stack.
**
** Opens the iconv descriptor, installs the module's handlers, and remembers
** the output function. Returns 0, or -1 if iconv_open() fails.
*/
static int init_iconv(struct libmail_u_convert_iconv *h,
		      const char *src_chset,
		      const char *dst_chset,
		      int (*output_func)(const char *, size_t, void *),
		      void *convert_arg)
{
	h->h=iconv_open(dst_chset, src_chset);

	if (h->h == (iconv_t)-1)
		return -1;

	h->hdr.convert_handler=convert_iconv;
	h->hdr.deinit_handler=deinit_iconv;
	h->hdr.ptr=h;
	h->output_func=output_func;
	h->convert_arg=convert_arg;

	/*
	** Heuristically determine how many octets to skip upon an EILSEQ:
	** 4 for "UCS-4..." and "UTF-3...", 2 for "UCS-2..." and "UTF-1...",
	** 1 for everything else. The three letters are matched in either
	** case. Each character is looked at only after the previous one
	** matched, so this never reads past the end of the name.
	*/
	h->skipcnt=1;

	if (src_chset[0] == 'u' || src_chset[0] == 'U')
	{
		char family=src_chset[1];

		if ((family == 'c' || family == 'C') &&
		    (src_chset[2] == 's' || src_chset[2] == 'S') &&
		    src_chset[3] == '-')
		{
			if (src_chset[4] == '4')
				h->skipcnt=4;	/* UCS-4 */
			else if (src_chset[4] == '2')
				h->skipcnt=2;	/* UCS-2 */
		}
		else if ((family == 't' || family == 'T') &&
			 (src_chset[2] == 'f' || src_chset[2] == 'F') &&
			 src_chset[3] == '-')
		{
			if (src_chset[4] == '3')
				h->skipcnt=4;	/* UTF-32 */
			else if (src_chset[4] == '1')
				h->skipcnt=2;	/* UTF-16 */
		}
	}

	return 0;
}
static void convert_flush(struct libmail_u_convert_iconv *);
static void convert_flush_iconv(struct libmail_u_convert_iconv *, char **,
size_t *);
/*
** iconv conversion module's input handler. Input is collected, one octet at
** a time, in the module's input buffer. convert_flush() gets invoked before
** an octet is added to a buffer that's down to its last free byte.
**
** Input stops being consumed once an error is recorded. Returns the
** module's error flag.
*/
static int convert_iconv(void *ptr,
			 const char *text, size_t cnt)
{
	struct libmail_u_convert_iconv *h=(struct libmail_u_convert_iconv *)ptr;

	for ( ; cnt && h->errflag == 0; --cnt)
	{
		if (h->bufcnt >= sizeof(h->buffer)-1)
		{
			convert_flush(h);

			if (h->errflag)
				break;
		}

		h->buffer[h->bufcnt++]= *text++;
	}

	return h->errflag;
}
/*
** Finish an iconv conversion module. Invoke convert_flush() to flush any
** buffered input. Invoke convert_flush_iconv() to return state to the initial
** conversion state.
**
** Input that's still left unconverted after the final flush is an incomplete
** sequence, and gets reported as a conversion error.
**
** If errptr is not NULL, *errptr is set to non-zero if this module, or a
** module further down the stack, encountered a conversion error, and to zero
** otherwise. Returns this module's error code or, if that's zero, the error
** code from the rest of the stack.
*/
static int deinit_iconv(void *ptr, int *errptr)
{
	int rc;
	int converr;
	struct libmail_u_convert_iconv *h=(struct libmail_u_convert_iconv *)ptr;
	libmail_u_convert_handle_t next;

	if (h->errflag == 0)
		convert_flush(h);

	if (h->bufcnt && h->errflag == 0)
		h->converr=1;

	if (h->errflag == 0)
		convert_flush_iconv(h, NULL, NULL);

	rc=h->errflag;
	converr=h->converr != 0;

	iconv_close(h->h);
	next=h->hdr.next;
	free(h);

	if (errptr)
		*errptr=converr;

	/* If there's another module in the stack, clean that up */
	if (next)
	{
		/* Initialized, in case the module does not report anything */
		int converrnext=0;
		int rcnext=libmail_u_convert_deinit(next, &converrnext);

		/* A conversion error further down the stack counts, too */
		if (converrnext && errptr)
			*errptr=1;

		if (rcnext && rc == 0)
			rc=rcnext;
	}
	return rc;
}
/*
** Invoke convert_flush_iconv() to flush the input buffer. If there's
** unconverted text remaining, reposition it at the beginning of the input
** buffer.
**
** Does nothing if the buffer is empty, or after an error.
*/
static void convert_flush(struct libmail_u_convert_iconv *h)
{
	char *unconverted;
	size_t remaining;

	if (h->errflag || h->bufcnt == 0)
		return;

	unconverted=h->buffer;
	remaining=h->bufcnt;

	convert_flush_iconv(h, &unconverted, &remaining);

	if (h->errflag)
		return;

	if (remaining == h->bufcnt)
		remaining=0; /* Unexpected error, dunno what to do, punt */

	/* Source and destination may overlap, so memmove */
	memmove(h->buffer, unconverted, remaining);
	h->bufcnt=remaining;
}
/*
** Convert text via iconv.
**
** "inbuf" and "inbytesleft" are handed to iconv(), which advances them past
** the input that it consumes. Both are NULL when the caller only wants the
** conversion returned to its initial state. Converted text is passed to the
** module's output function; a non-zero return from it is saved in errflag.
*/
static void convert_flush_iconv(struct libmail_u_convert_iconv *h,
char **inbuf, size_t *inbytesleft)
{
int save_errno;
while (1)
{
char outbuf[1024];
char *outp;
size_t outleft;
size_t n;
size_t origin=0;
if (inbytesleft)
{
/* Nothing left to convert */
if ((origin=*inbytesleft) == 0)
return;
if (inbuf && h->skipleft && origin)
{
/* Skipping after an EILSEQ */
--h->skipleft;
--*inbytesleft;
++*inbuf;
continue;
}
}
if (h->errflag)
{
/* Quietly eat everything after a previous error */
if (inbytesleft)
*inbytesleft=0;
return;
}
outp=outbuf;
outleft=sizeof(outbuf);
n=iconv(h->h, inbuf, inbytesleft, &outp, &outleft);
/* Saved now, the output function may change errno */
save_errno=errno;
/* Anything produced by iconv() gets pushed down the stack */
if (outp > outbuf)
{
int rc=(*h->output_func)(outbuf, outp-outbuf,
h->convert_arg);
if (rc)
{
h->errflag=rc;
return;
}
}
if (n != (size_t)-1)
{
/* iconv(3) reason #2 */
break;
}
if (inbytesleft == 0)
{
/*
** An error when generating the shift sequence to
** return to the initial state. We don't know what to
** do, now.
*/
errno=EINVAL;
h->errflag= -1;
return;
}
/*
** convert_flush() gets invoked when the 1024 char input buffer
** fills or to convert input that has been buffered when
** deinit_iconv() gets invoked.
**
** A return code of EINVAL from iconv() is iconv() encountering
** an incomplete multibyte sequence.
**
** If iconv() failed without consuming any input:
**
** - iconv(3) reason #1, EILSEQ, invalid multibyte sequence
** that starts at the beginning of the string we wish to
** convert. Discard one character (skipcnt octets), and try
** again.
**
** - iconv(3) reason #3, EINVAL, incomplete multibyte sequence.
** If it's possible to have an incomplete 1024 character long
** multibyte sequence, we're in trouble. Or we've encountered
** an EINVAL when flushing out the remaining buffered input,
** in deinit_iconv(). In either case, it's ok to discard
** one character at a time, until we either reach the end,
** or get some other result.
**
** - iconv(3) reason #4, E2BIG. If the 1024 character output
** buffer, above, is insufficient to produce the output from a
** single converted character, we're in trouble.
*/
if (*inbytesleft == origin)
{
h->skipleft=h->skipcnt;
h->converr=1;
}
/*
** Stopped at an incomplete multibyte sequence, try again on
** the next round.
*/
else if (save_errno == EINVAL)
break;
if (save_errno == EILSEQ)
h->converr=1; /* Another possibility this can happen */
/*
** If we get here because of iconv(3) reason #4, filled out
** the output buffer, we should continue with the conversion.
** Otherwise, upon encountering any other error condition,
** reset the conversion state.
*/
if (save_errno != E2BIG)
iconv(h->h, NULL, NULL, NULL, NULL);
}
}
/*****************************************************************************/
/*
** A wrapper for libmail_u_convert() that collects the converted character
** text into a buffer. This is done by passing an output function to
** libmail_u_convert() that saves converted text in a linked-list
** of buffers.
**
** Then, in the deinitialization function, the buffers get concatenated into
** the final character buffer.
*/
/* One saved piece of converted text */
struct libmail_u_convert_cbuf {
struct libmail_u_convert_cbuf *next;
/* The saved text; it's stored right after this structure */
char *fragment;
/* Size of the saved text, in bytes */
size_t fragment_size;
};
/* The module that collects the converted text */
struct libmail_u_convert_tocbuf {
struct libmail_u_convert_hdr hdr;
/* Where the final buffer, and its size, get returned to the caller */
char **cbufptr_ret;
size_t *cbufsize_ret;
/* Set when converted text could not be saved */
int errflag;
/* Total size of all saved fragments, in bytes */
size_t tot_size;
/* Flag: append a null byte to the converted text */
int nullterminate;
/* List of saved fragments; "last" is where the next one gets linked */
struct libmail_u_convert_cbuf *first, **last;
};
static int save_tocbuf(const char *, size_t, void *);
static int convert_tocbuf(void *ptr,
const char *text, size_t cnt);
static int deinit_tocbuf(void *ptr, int *errptr);
/*
** Create a conversion stack, from src_chset to dst_chset, that collects the
** converted text in memory.
**
** When the stack gets destroyed without an error, a malloc-ed buffer with
** the converted text gets saved in *cbufptr_ret, and its size in
** *cbufsize_ret. A non-zero nullterminate appends a null byte to the text.
**
** Returns the handle for the new stack, or NULL if it could not be created.
*/
libmail_u_convert_handle_t
libmail_u_convert_tocbuf_init(const char *src_chset,
			      const char *dst_chset,
			      char **cbufptr_ret,
			      size_t *cbufsize_ret,
			      int nullterminate
			      )
{
	libmail_u_convert_handle_t stack;
	struct libmail_u_convert_tocbuf *collector=
		calloc(1, sizeof(*collector));

	if (collector == NULL)
		return NULL;

	/* The stack's output gets captured by save_tocbuf() */
	stack=libmail_u_convert_init(src_chset, dst_chset,
				     save_tocbuf, collector);
	if (stack == NULL)
	{
		free(collector);
		return NULL;
	}

	collector->hdr.ptr=collector;
	collector->hdr.next=stack;
	collector->hdr.convert_handler=convert_tocbuf;
	collector->hdr.deinit_handler=deinit_tocbuf;

	collector->nullterminate=nullterminate;
	collector->cbufptr_ret=cbufptr_ret;
	collector->cbufsize_ret=cbufsize_ret;
	collector->last= &collector->first;
	return &collector->hdr;
}
/*
** Capture the output of the conversion stack: save a copy of "cnt" bytes of
** "text" in a new fragment, at the end of the list of fragments.
**
** Returns 0 if the text was saved, and 1 if it was not: the total size, or
** the size of the allocation, does not fit in a size_t (errno is set to
** E2BIG), or malloc() failed. The size checks are made before anything
** changes, so the list and tot_size always agree with each other.
*/
static int save_tocbuf(const char *text, size_t cnt, void *ptr)
{
	struct libmail_u_convert_tocbuf *p=
		(struct libmail_u_convert_tocbuf *)ptr;
	struct libmail_u_convert_cbuf *fragment;
	size_t tot_size=p->tot_size + cnt; /* The new total size saved */

	if (tot_size < p->tot_size || /* Overflow? */
	    cnt > (size_t)-1 - sizeof(*fragment))
	{
		p->errflag=1;
		errno=E2BIG;
		return 1;
	}

	fragment=malloc(sizeof(*fragment)+cnt);

	if (!fragment)
	{
		p->errflag=1;
		return 1;
	}

	fragment->next=NULL;

	/* The text gets stored right after the fragment's header */
	fragment->fragment=(char *)(fragment+1);

	if ((fragment->fragment_size=cnt) > 0)
		memcpy(fragment->fragment, text, cnt);

	*(p->last)=fragment;
	p->last=&fragment->next;

	p->tot_size=tot_size;
	return 0;
}
/* Punt converted text down the stack */
static int convert_tocbuf(void *ptr, const char *text, size_t cnt)
{
struct libmail_u_convert_tocbuf *p=
(struct libmail_u_convert_tocbuf *)ptr;
return libmail_u_convert(p->hdr.next, text, cnt);
}
/*
** Destroy the conversion stack. Destroy the downstream, then assemble the
** final array.
**
** If there were no errors, the saved fragments (plus a trailing null byte,
** if one was requested) get copied into a single malloc-ed buffer, that's
** returned via cbufptr_ret and cbufsize_ret. The fragments, and the module,
** get freed in any case.
**
** Returns 0, the downstream's error code, or -1 if the final buffer could
** not be allocated.
*/
static int deinit_tocbuf(void *ptr, int *errptr)
{
	struct libmail_u_convert_tocbuf *p=
		(struct libmail_u_convert_tocbuf *)ptr;
	struct libmail_u_convert_cbuf *frag;
	int rc=libmail_u_convert_deinit(p->hdr.next, errptr);

	if (rc == 0 && p->nullterminate)
	{
		char zero=0;

		rc=save_tocbuf( &zero, sizeof(zero), p->hdr.ptr);
	}

	if (rc == 0)
	{
		/* Never ask malloc for zero bytes */
		char *result=malloc(p->tot_size ? p->tot_size:1);

		*p->cbufptr_ret=result;

		if (result == NULL)
		{
			rc= -1;
		}
		else
		{
			size_t used=0;

			for (frag=p->first; frag; frag=frag->next)
			{
				if (frag->fragment_size)
					memcpy(result+used,
					       frag->fragment,
					       frag->fragment_size);
				used += frag->fragment_size;
			}
			*p->cbufsize_ret=used;
		}
	}

	while ((frag=p->first) != NULL)
	{
		p->first=frag->next;
		free(frag);
	}

	free(p);
	return rc;
}
/*
** Same as libmail_u_convert_tocbuf_init(), with "utf-8" as the destination
** character set.
*/
libmail_u_convert_handle_t
libmail_u_convert_tocbuf_toutf8_init(const char *src_chset,
				     char **cbufptr_ret,
				     size_t *cbufsize_ret,
				     int nullterminate
				     )
{
	libmail_u_convert_handle_t stack=
		libmail_u_convert_tocbuf_init(src_chset, "utf-8",
					      cbufptr_ret, cbufsize_ret,
					      nullterminate);

	return stack;
}
/*
** Same as libmail_u_convert_tocbuf_init(), with "utf-8" as the source
** character set.
*/
libmail_u_convert_handle_t
libmail_u_convert_tocbuf_fromutf8_init(const char *dst_chset,
				       char **cbufptr_ret,
				       size_t *cbufsize_ret,
				       int nullterminate
				       )
{
	libmail_u_convert_handle_t stack=
		libmail_u_convert_tocbuf_init("utf-8", dst_chset,
					      cbufptr_ret, cbufsize_ret,
					      nullterminate);

	return stack;
}
char *libmail_u_convert_toutf8(const char *text,
const char *charset,
int *error)
{
char *cbufptr;
size_t cbufsize;
libmail_u_convert_handle_t h=
libmail_u_convert_tocbuf_toutf8_init(charset,
&cbufptr,
&cbufsize, 1);
if (!h)
return NULL;
libmail_u_convert(h, text, strlen(text));
if (libmail_u_convert_deinit(h, error) == 0)
return cbufptr;
return NULL;
}
char *libmail_u_convert_fromutf8(const char *text,
const char *charset,
int *error)
{
char *cbufptr;
size_t cbufsize;
libmail_u_convert_handle_t h=
libmail_u_convert_tocbuf_fromutf8_init(charset,
&cbufptr,
&cbufsize, 1);
if (!h)
return NULL;
libmail_u_convert(h, text, strlen(text));
if (libmail_u_convert_deinit(h, error) == 0)
return cbufptr;
return NULL;
}
char *libmail_u_convert_tobuf(const char *text,
const char *charset,
const char *dstcharset,
int *error)
{
char *cbufptr;
size_t cbufsize;
libmail_u_convert_handle_t h=
libmail_u_convert_tocbuf_init(charset,
dstcharset,
&cbufptr,
&cbufsize, 1);
if (!h)
return NULL;
libmail_u_convert(h, text, strlen(text));
if (libmail_u_convert_deinit(h, error) == 0)
return cbufptr;
return NULL;
}
/*****************************************************************************/
/*
** Convert text to unicode_chars. Same basic approach as
** libmail_u_convert_tocbuf_init(). The output character set gets specified
** as UCS-4, the final output size is divided by 4, and the output buffer gets
** typed as a unicode_char array.
*/
/* One saved piece of converted text */
struct libmail_u_convert_buf {
struct libmail_u_convert_buf *next;
/* The saved characters; they're stored right after this structure */
unicode_char *fragment;
/* Number of unicode_chars saved in this fragment */
size_t fragment_size;
/* Number of unicode_chars that this fragment has room for */
size_t max_fragment_size;
};
/* The module that collects the converted unicode_chars */
struct libmail_u_convert_tou {
struct libmail_u_convert_hdr hdr;
/* Where the final array, and its size, get returned to the caller */
unicode_char **ucptr_ret;
size_t *ucsize_ret;
/* Set when converted text could not be saved */
int errflag;
/* Total size of all saved text, in bytes */
size_t tot_size;
/* Flag: append a null unicode_char to the converted text */
int nullterminate;
/*
** List of saved fragments. "tail" is the most recently added fragment,
** "last" is where the next fragment gets linked.
*/
struct libmail_u_convert_buf *first, *tail, **last;
};
static int save_unicode(const char *, size_t, void *);
static int convert_tounicode(void *ptr,
const char *text, size_t cnt);
static int deinit_tounicode(void *ptr, int *errptr);
/*
** Create a conversion stack, from src_chset to unicode_chars (native UCS-4),
** that collects the converted characters in memory.
**
** When the stack gets destroyed without an error, a malloc-ed array with
** the unicode_chars gets saved in *ucptr_ret, and the number of characters
** in *ucsize_ret. A non-zero nullterminate appends a null unicode_char.
**
** Returns the handle for the new stack, or NULL if it could not be created.
*/
libmail_u_convert_handle_t
libmail_u_convert_tou_init(const char *src_chset,
			   unicode_char **ucptr_ret,
			   size_t *ucsize_ret,
			   int nullterminate
			   )
{
	libmail_u_convert_handle_t stack;
	struct libmail_u_convert_tou *collector=
		calloc(1, sizeof(*collector));

	if (collector == NULL)
		return NULL;

	/* The stack's output gets captured by save_unicode() */
	stack=libmail_u_convert_init(src_chset, libmail_u_ucs4_native,
				     save_unicode, collector);
	if (stack == NULL)
	{
		free(collector);
		return NULL;
	}

	collector->hdr.ptr=collector;
	collector->hdr.next=stack;
	collector->hdr.convert_handler=convert_tounicode;
	collector->hdr.deinit_handler=deinit_tounicode;

	collector->nullterminate=nullterminate;
	collector->ucptr_ret=ucptr_ret;
	collector->ucsize_ret=ucsize_ret;
	collector->last= &collector->first;
	return &collector->hdr;
}
/*
** Create a conversion stack from unicode_chars to dst_chset. This is
** libmail_u_convert_tocbuf_init(), with native UCS-4 as the source
** character set.
*/
libmail_u_convert_handle_t
libmail_u_convert_fromu_init(const char *dst_chset,
			     char **cbufptr_ret,
			     size_t *csize_ret,
			     int nullterminate
			     )
{
	libmail_u_convert_handle_t stack=
		libmail_u_convert_tocbuf_init(libmail_u_ucs4_native,
					      dst_chset,
					      cbufptr_ret,
					      csize_ret,
					      nullterminate);

	return stack;
}
/*
** Feed "cnt" unicode_chars to a conversion stack. The characters are passed
** to libmail_u_convert() as raw bytes.
*/
int libmail_u_convert_uc(libmail_u_convert_handle_t handle,
			 const unicode_char *text,
			 size_t cnt)
{
	const char *raw=(const char *)text;
	size_t nbytes=cnt * sizeof(*text);

	return libmail_u_convert(handle, raw, nbytes);
}
/*
** Capture the output of the conversion stack: save the unicode_chars in
** "text". "cnt" is the size of "text" in bytes; a partial trailing
** unicode_char is ignored.
**
** The characters go, first, into whatever room is left in the most recent
** fragment. A new fragment, with room for at least 16 characters, gets
** allocated for the rest.
**
** Returns 0 on success, 1 if malloc() failed or the total size overflowed
** (in which case errno is set to E2BIG).
*/
static int save_unicode(const char *text, size_t cnt, void *ptr)
{
	struct libmail_u_convert_tou *p=
		(struct libmail_u_convert_tou *)ptr;
	struct libmail_u_convert_buf *tail=p->tail;
	size_t nchars=cnt / sizeof(unicode_char);

	/* Keep track of the total size saved */
	size_t new_total=p->tot_size + nchars*sizeof(unicode_char);

	if (tail)
	{
		size_t room=tail->max_fragment_size-tail->fragment_size;
		size_t n=room < nchars ? room:nchars;

		if (n)
		{
			memcpy(tail->fragment+tail->fragment_size,
			       text, n*sizeof(unicode_char));
			tail->fragment_size += n;
			text += n*sizeof(unicode_char);
			nchars -= n;
		}
	}

	if (nchars > 0)
	{
		size_t capacity=nchars < 16 ? 16:nchars;
		struct libmail_u_convert_buf *fragment=
			malloc(sizeof(*fragment)
			       +capacity*sizeof(unicode_char));

		if (fragment == NULL)
		{
			p->errflag=1;
			return 1;
		}

		/* The characters get stored right after the header */
		fragment->fragment=(unicode_char *)(fragment+1);
		fragment->max_fragment_size=capacity;
		fragment->fragment_size=nchars;
		fragment->next=NULL;
		memcpy(fragment->fragment, text, nchars*sizeof(unicode_char));

		*(p->last)=fragment;
		p->last=&fragment->next;
		p->tail=fragment;
	}

	if (new_total < p->tot_size) /* Overflow? */
	{
		errno=E2BIG;
		return 1;
	}

	p->tot_size=new_total;
	return 0;
}
/* Punt converted text down the stack */
static int convert_tounicode(void *ptr,
const char *text, size_t cnt)
{
struct libmail_u_convert_tou *p=
(struct libmail_u_convert_tou *)ptr;
return libmail_u_convert(p->hdr.next, text, cnt);
}
/*
** Destroy the conversion stack. Destroy the downstream, then assemble the
** final array.
**
** If there were no errors, the saved fragments (plus a trailing null
** unicode_char, if one was requested) get copied into a single malloc-ed
** array, that's returned via ucptr_ret. The number of unicode_chars in the
** array is returned via ucsize_ret. The fragments, and the module, get
** freed in any case.
**
** Returns 0, the downstream's error code, or -1 if the final array could
** not be allocated.
*/
static int deinit_tounicode(void *ptr, int *errptr)
{
	struct libmail_u_convert_tou *p=
		(struct libmail_u_convert_tou *)ptr;
	struct libmail_u_convert_buf *frag;
	int rc=libmail_u_convert_deinit(p->hdr.next, errptr);

	if (rc == 0 && p->nullterminate)
	{
		unicode_char zero=0;

		rc=save_unicode( (const char *)&zero, sizeof(zero),
				 p->hdr.ptr);
	}

	if (rc == 0)
	{
		/* tot_size is in bytes. Never ask malloc for zero bytes */
		unicode_char *result=malloc(p->tot_size ? p->tot_size:1);

		*p->ucptr_ret=result;

		if (result == NULL)
		{
			rc= -1;
		}
		else
		{
			size_t used=0;

			for (frag=p->first; frag; frag=frag->next)
			{
				if (frag->fragment_size)
					memcpy(result+used,
					       frag->fragment,
					       frag->fragment_size
					       *sizeof(*frag->fragment));
				used += frag->fragment_size;
			}
			*p->ucsize_ret=used;
		}
	}

	while ((frag=p->first) != NULL)
	{
		p->first=frag->next;
		free(frag);
	}

	free(p);
	return rc;
}
/*
** Convert "text_l" bytes of "text", in "charset", to unicode_chars.
**
** On success a malloc-ed array is saved in *uc, the number of characters in
** *ucsize, and 0 is returned. The array is not null-terminated. "err" is
** passed to libmail_u_convert_deinit(), to report conversion errors.
**
** Returns -1 if the conversion could not be set up, or failed.
*/
int libmail_u_convert_tou_tobuf(const char *text,
				size_t text_l,
				const char *charset,
				unicode_char **uc,
				size_t *ucsize,
				int *err)
{
	libmail_u_convert_handle_t h=
		libmail_u_convert_tou_init(charset, uc, ucsize, 0);

	if (h == NULL)
		return -1;

	if (libmail_u_convert(h, text, text_l) < 0)
	{
		/* Still have to destroy the stack */
		libmail_u_convert_deinit(h, NULL);
		return -1;
	}

	return libmail_u_convert_deinit(h, err) ? -1:0;
}
/*
** Convert "utext_l" unicode_chars to "charset". An utext_l of (size_t)-1
** means that utext is terminated by a null unicode_char, and its length
** gets counted here.
**
** On success a malloc-ed, null-terminated buffer is saved in *c, its size
** in *csize, and 0 is returned. "err" is passed to
** libmail_u_convert_deinit(), to report conversion errors.
**
** Returns -1 if the conversion could not be set up, or failed.
*/
int libmail_u_convert_fromu_tobuf(const unicode_char *utext,
				  size_t utext_l,
				  const char *charset,
				  char **c,
				  size_t *csize,
				  int *err)
{
	libmail_u_convert_handle_t h;

	if (utext_l == (size_t)-1)
	{
		utext_l=0;

		while (utext[utext_l])
			++utext_l;
	}

	h=libmail_u_convert_fromu_init(charset, c, csize, 1);

	if (h == NULL)
		return -1;

	if (libmail_u_convert_uc(h, utext, utext_l) < 0)
	{
		/* Still have to destroy the stack */
		libmail_u_convert_deinit(h, NULL);
		return -1;
	}

	return libmail_u_convert_deinit(h, err) ? -1:0;
}
/*
** Convert the case of a null-terminated string in "charset".
**
** The string gets converted to unicode_chars, each character is mapped by a
** function, and the result gets converted back to "charset".
** first_char_func maps the first character. If char_func is not NULL, it
** maps all the other characters, otherwise first_char_func maps them too.
**
** Returns a malloc-ed string, or NULL if either conversion failed or
** reported a conversion error.
*/
char *libmail_u_convert_tocase(const char *str,
			       const char *charset,
			       unicode_char (*first_char_func)(unicode_char),
			       unicode_char (*char_func)(unicode_char))
{
	unicode_char (*map_func)(unicode_char)=first_char_func;
	unicode_char *uc;
	size_t ucsize;
	char *c;
	size_t csize;
	size_t i;
	int err;
	int rc;

	if (libmail_u_convert_tou_tobuf(str, strlen(str),
					charset, &uc, &ucsize, &err) != 0)
		return NULL;

	if (err)
	{
		free(uc);
		return NULL;
	}

	for (i=0; i<ucsize; ++i)
	{
		uc[i]=(*map_func)(uc[i]);

		/* After the first character, switch to char_func */
		if (char_func)
			map_func=char_func;
	}

	rc=libmail_u_convert_fromu_tobuf(uc, ucsize,
					 charset,
					 &c, &csize, &err);
	free(uc);

	if (rc != 0)
		return NULL;

	if (err)
	{
		free(c);
		return NULL;
	}

	return c;
}