/* ksx1001.c */
/*
* ISO-2022-KR, EUC-KR & CP949 <=> Unicode translate functions.
* by Hatuka*nezumi - IKEDA Soji <nezumi@jca.apc.org>
*/
#include <stdio.h>
#include <string.h>
#include "unicode.h"
#include "ksx1001.h"
/*
* Bit flag for the `compat' argument of c2u_euckr_doconv() and
* u2c_euckr_doconv(): when set, the CP949 extension handling in those
* functions is enabled.
*/
#define EUCKR_CP949_EXTENSION 1
/*
* ISO-2022-KR (RFC1557) Converters
*/
/*
* One unit of ISO-2022-KR text as seen by read_char() and revlookup():
* a shift state together with a character value.
*/
struct kschar_t {
/* Shift state: the code in this file stores KS_STATE_ASCII,
* KS_STATE_KSX1001 or KS_STATE_BINARY here. */
int state;
/* Character value.  read_char() stores 0 for sequences that produce no
* character and 0x003F for input it cannot decode. */
unsigned int value;
};
/*
 * Decode one unit from an ISO-2022-KR byte string.
 *
 * src  Next unread byte of a NUL-terminated string.  The callers in this
 *      file only call this while src[0] != 0.
 * ch   On entry ch->state is the current shift state.  On return ch->state
 *      and ch->value describe the decoded unit:
 *        - value == 0       nothing to output (SI, SO, announcer sequence)
 *        - value == 0x003F  the input could not be decoded
 *        - otherwise        the character code (hi*256+lo for KS X 1001)
 *
 * Returns the number of bytes consumed: 1, 2 or 4.
 */
static size_t read_char(const char *src, struct kschar_t *ch)
{
	unsigned int hi, lo;

	switch (src[0]) {
	case KS_CHAR_SI:
		/* Shift-in */
		ch->state = KS_STATE_ASCII;
		ch->value = 0;
		return 1;
	case KS_CHAR_SO:
		/* Shift-out */
		ch->state = KS_STATE_KSX1001;
		ch->value = 0;
		return 1;
	case KS_CHAR_ESC:
		/*
		 * Announcer sequence ESC $ ) C: consumed without output and
		 * without touching the shift state.  The && chain stops at
		 * the first mismatch, so it never reads past the NUL.
		 */
		if (src[1] == '$' && src[2] == ')' && src[3] == 'C') {
			ch->value = 0;
			return 4;
		}
		/* A lone ESC character */
		ch->state = KS_STATE_BINARY;
		ch->value = KS_CHAR_ESC;
		return 1;
	default:
		break;
	}

	/*
	 * Other control characters are passed through and leave the shift
	 * state unchanged.  Returning here is essential: falling through
	 * would turn e.g. a newline seen in KS X 1001 state into '?' and
	 * force the state to BINARY.
	 */
	if ((unsigned char)src[0] < 0x20) {
		ch->value = (unsigned int)src[0];
		return 1;
	}

	/* US-ASCII */
	if ((ch->state == KS_STATE_ASCII || ch->state == KS_STATE_BINARY)
	    && (unsigned char)src[0] < 0x80) {
		ch->state = KS_STATE_ASCII;
		ch->value = (unsigned int)src[0];
		return 1;
	}

	/* KS X 1001: two bytes, each in 0x21..0x7E */
	if (ch->state == KS_STATE_KSX1001
	    && 0x21 <= src[0] && src[0] <= 0x7E
	    && 0x21 <= src[1] && src[1] <= 0x7E) {
		hi = (unsigned int)src[0];
		lo = (unsigned int)src[1];
		/* Same table and index scheme as the lookup in
		 * c2u_iso2022kr_convchar(). */
		if (cp949_to_uni_tbls[hi-1] != NULL
		    && cp949_to_uni_tbls[hi-1][lo+0x3F] != 0xFFFD)
			ch->value = hi * 256 + lo;
		else
			ch->value = 0x003F;
		return 2;
	}

	/* Anything else cannot be decoded in the current state. */
	ch->state = KS_STATE_BINARY;
	ch->value = 0x003F;
	return 1;
}
/*
 * Map a (value, state) pair, as produced by read_char(), to a Unicode
 * character.  Returns 0xFFFD when the pair has no mapping.
 */
static unicode_char c2u_iso2022kr_convchar(unsigned int c, int state)
{
	unsigned int row = (c >> 8);
	unsigned int col = c & 0x00FF;

	/* Control characters map to themselves in every state. */
	if (c < (unsigned int)0x0020)
		return (unicode_char)c;

	/* US-ASCII */
	if (state == KS_STATE_ASCII && c < (unsigned int)0x0080)
		return (unicode_char)c;

	/* Everything else must be a KS X 1001 code other than the '?' marker. */
	if (state != KS_STATE_KSX1001 || c == 0x003F)
		return (unicode_char)0xFFFD;

	/* Unknown: no table for this row. */
	if (cp949_to_uni_tbls[row-1] == NULL)
		return (unicode_char)0xFFFD;

	/* An unmapped cell already holds 0xFFFD, so it is returned as is. */
	return cp949_to_uni_tbls[row-1][col+0x3F];
}
static unicode_char *c2u_iso2022kr(const struct unicode_info *u,
const char *ks_str, int *err)
{
size_t i, cnt, w;
unicode_char *uc;
struct kschar_t ch;
if (err)
*err = -1;
/* Count the number of potential unicode characters first. */
i = cnt = 0;
ch.state = KS_STATE_ASCII;
ch.value = 0;
while (ks_str[i]) {
i += read_char(ks_str+i, &ch);
if (ch.value)
++cnt;
}
uc = malloc((cnt+1) * sizeof(unicode_char));
if (!uc)
return NULL;
i = cnt = 0;
ch.state = KS_STATE_ASCII;
ch.value = 0;
while (ks_str[i]) {
w = read_char(ks_str+i, &ch);
if (ch.value) {
uc[cnt] = c2u_iso2022kr_convchar(ch.value, ch.state);
if (uc[cnt] == (unicode_char)0xFFFD && err) {
*err = i;
free(uc);
return NULL;
}
++cnt;
}
i+=w;
}
uc[cnt] = 0;
return uc;
}
/*
 * Reverse lookup: find the ISO-2022-KR representation of Unicode
 * character u and store it in *ch.
 *
 * Result in *ch:
 *   KS_STATE_ASCII    value is the single 7-bit byte
 *   KS_STATE_KSX1001  value is hi*256+lo with both bytes 7-bit
 *   KS_STATE_BINARY   value is 0x003F: u cannot be represented
 */
static void revlookup(unicode_char u, struct kschar_t *ch)
{
	unsigned int page, cell, code;
	unsigned char b1, b2;

	/* Start from "unknown"; each successful path below overwrites it. */
	ch->state = KS_STATE_BINARY;
	ch->value = 0x003F;

	/* ISO-2022-KR is mapped inside BMP range. */
	if (u >= (unicode_char)0x10000)
		return;

	/* US-ASCII */
	if (u < (unicode_char)0x0080) {
		ch->state = KS_STATE_ASCII;
		ch->value = (unsigned int)u;
		return;
	}

	/* For compatibility: 2 characters replaced by KS X 1003 */
	if (u == (unicode_char)0x20A9 || u == (unicode_char)0x203E) {
		ch->state = KS_STATE_ASCII;
		/* WON SIGN -> 0x5C, OVERLINE -> 0x7E */
		ch->value = (u == (unicode_char)0x20A9) ? 0x5C : 0x7E;
		return;
	}

	/* KS X 1001 */
	page = u >> 8;
	cell = u & 0x00ff;
	if (uni_to_ksx1001_tbls[page] == NULL)
		return;
	code = uni_to_ksx1001_tbls[page][cell];
	if (code == 0x003F)
		return;

	b1 = (code >> 8);
	b2 = (code & 0x00FF);
	/* Only codes with both bytes >= 0xA1 are accepted. */
	if (b1 < (unsigned char)0xA1 || b2 < (unsigned char)0xA1)
		return;

	/* Strip the high bit of each byte to get the 7-bit form. */
	ch->state = KS_STATE_KSX1001;
	ch->value = (b1 - 0x80) * 256 + (b2 - 0x80);
}
static char *u2c_iso2022kr(const struct unicode_info *u,
const unicode_char *str, int *err)
{
size_t i, cnt;
int k;
int kstate = KS_STATE_ASCII;
int ks;
int has_ksx1001=0;
char *s;
struct kschar_t ch;
if (err)
*err = -1;
/* Count the number of potential octets first. */
ch.state = KS_STATE_ASCII;
ch.value = 0;
kstate = KS_STATE_ASCII;
has_ksx1001 = 0;
for (i = cnt = 0; str[i]; i++) {
revlookup(str[i], &ch);
ks = ch.state;
k = ch.value;
if (ks != kstate)
{
cnt++;
kstate = ks;
}
if (k)
cnt += ((kstate == KS_STATE_KSX1001)? 2: 1);
if (kstate == KS_STATE_KSX1001)
has_ksx1001 = 1;
}
if (kstate != KS_STATE_ASCII && kstate != KS_STATE_BINARY)
cnt++;
if (has_ksx1001)
cnt+=4;
s = malloc(cnt+1);
if (!s)
return NULL;
cnt = 0;
if (has_ksx1001)
{
s[cnt++] = KS_CHAR_ESC;
s[cnt++] = '$';
s[cnt++] = ')';
s[cnt++] = 'C';
}
ch.state = KS_STATE_ASCII;
ch.value = 0;
kstate = KS_STATE_ASCII;
for (i = 0; str[i]; i++)
{
revlookup(str[i], &ch);
ks = ch.state;
k = ch.value;
if (ks != kstate)
{
switch (ks)
{
case KS_STATE_KSX1001:
s[cnt++] = KS_CHAR_SO;
break;
default:
s[cnt++] = KS_CHAR_SI;
break;
}
kstate = ks;
}
switch (kstate)
{
case KS_STATE_KSX1001:
s[cnt++] = (char)(k >> 8);
s[cnt++] = (char)(k & 0x00FF);
break;
default:
s[cnt++] = (char)k;
}
if (kstate == KS_STATE_BINARY && k == 0x003F)
if (err)
{
*err = i;
free(s);
return NULL;
}
}
if (kstate != KS_STATE_ASCII && kstate != KS_STATE_BINARY)
{
s[cnt++] = KS_CHAR_SI;
}
s[cnt] = 0;
return s;
}
/*
* EUC-KR / CP949 (UHC) Converters
*/
static unicode_char *c2u_euckr_doconv(const struct unicode_info *u,
const char *euckr_str, int *err,
int compat)
{
unicode_char *uc=0;
unicode_char c;
unsigned char hi=0, lo=0;
int len=0;
int i=0;
int pos=0;
if(err) *err = -1;
len = strlen(euckr_str);
uc = (unicode_char*)malloc((len+1) * sizeof(unicode_char) *2);
if (!uc)
return NULL;
for(i=0; i<len;) {
/* 2 Characters replaced by KS X 1003 */
if ((compat & EUCKR_CP949_EXTENSION)
&& euckr_str[i] == 0x5C) /* WON SIGN */
{
uc[pos++] = (unicode_char)0x20A9;
i++;
}
else if ((compat & EUCKR_CP949_EXTENSION)
&& euckr_str[i] == 0x7E) /* OVERLINE */
{
uc[pos++] = (unicode_char)0x203E;
i++;
}
/* US-ASCII or KS X 1003 */
else if((unsigned char)euckr_str[i] < 0x80)
{
uc[pos++] = (unicode_char)(euckr_str[i]);
i++;
}
/* KS X 1001 */
else if ((unsigned char)euckr_str[i] >= 0xa1
&& (unsigned char)euckr_str[i+1] >= 0xa1)
{
hi = (unsigned char)euckr_str[i];
lo = (unsigned char)euckr_str[i+1];
if (cp949_to_uni_tbls[hi-0x81] == NULL)
c = (unicode_char)0xFFFD;
else
c = cp949_to_uni_tbls[hi-0x81][lo-0x41];
uc[pos++] = c;
if (c == (unicode_char)0xFFFD && err)
{
*err = i;
free(uc);
return NULL;
}
i+=2;
}
/* CP949 extension */
else if ((0x81 <= (unsigned)euckr_str[i]
&& (unsigned)euckr_str[i] <= 0xFE)
&& ((0x41 <= (unsigned)euckr_str[i+1]
&& (unsigned)euckr_str[i+1] <= 0x5A)
|| (0x61 <= (unsigned)euckr_str[i+1]
&& (unsigned)euckr_str[i+1] <= 0x7A)
|| (0x81 <= (unsigned)euckr_str[i+1]
&& (unsigned)euckr_str[i+1] <= 0xFE)))
{
hi = (unsigned char)euckr_str[i];
lo = (unsigned char)euckr_str[i+1];
if (!(compat & EUCKR_CP949_EXTENSION))
c = 0xFFFD;
else if (cp949_to_uni_tbls[hi-0x81] != NULL)
c = cp949_to_uni_tbls[hi-0x81][lo-0x41];
else
c = 0xFFFD;
uc[pos++] = c;
if (c == 0xFFFD && err)
*err = i;
free(uc);
return NULL;
i+=2;
}
/* Not found */
else if (err)
{
*err = i;
free(uc);
return NULL;
}
else
{
uc[pos++] = (unicode_char)0xFFFD;
i++;
}
}
uc[pos++] = 0;
return uc;
}
/*
* EUC-KR to Unicode: calls c2u_euckr_doconv() with compat == 0, i.e.
* with the CP949 extension handling disabled.
*/
static unicode_char *c2u_euckr(const struct unicode_info *u,
const char *euckr_str, int *err)
{
return c2u_euckr_doconv(u, euckr_str, err, 0);
}
/*
* CP949 to Unicode: calls c2u_euckr_doconv() with
* EUCKR_CP949_EXTENSION set.
*/
static unicode_char *c2u_cp949(const struct unicode_info *u,
const char *euckr_str, int *err)
{
return c2u_euckr_doconv(u, euckr_str, err, EUCKR_CP949_EXTENSION);
}
/*
 * Convert a 0-terminated unicode_char string to a newly allocated,
 * NUL-terminated EUC-KR or CP949 string.
 *
 * u       Not used by this function.
 * str     Input string.
 * err     If non-NULL it is set to -1 on entry; when a character cannot be
 *         represented it receives that character's index and NULL is
 *         returned.  If NULL, such characters are written as '?'.
 * compat  0 for plain EUC-KR; EUCKR_CP949_EXTENSION also allows codes
 *         from uni_to_cp949_tbls.
 *
 * Returns the string (released with free()), or NULL on allocation failure
 * or conversion error.
 */
static char *u2c_euckr_doconv(const struct unicode_info *u,
			      const unicode_char *str, int *err,
			      int compat)
{
	size_t i, pos = 0, len = 0;
	char *s;

	if (err)
		*err = -1;

	while (str[len])
		len++;
	/* Each input character produces at most two octets. */
	s = malloc((len + 1) * 2);
	if (!s)
		return NULL;

	for (i = 0; str[i]; i++) {
		int ksx_char = 0;
		unsigned char hi = 0, lo = 0;

		/* US-ASCII */
		if (str[i] < (unicode_char)0x0080) {
			s[pos++] = (char)str[i];
			continue;
		}
		/* For compatibility: 2 characters replaced by KS X 1003 */
		if (str[i] == (unicode_char)0x20A9) {	/* WON SIGN */
			s[pos++] = 0x5C;
			continue;
		}
		if (str[i] == (unicode_char)0x203E) {	/* OVERLINE */
			s[pos++] = 0x7E;
			continue;
		}

		/* EUC-KR is mapped inside BMP range. */
		if (str[i] < (unicode_char)0x10000) {
			unsigned char page = str[i] >> 8;
			unsigned char cell = str[i] & 0xff;

			/* KS X 1001 */
			if (uni_to_ksx1001_tbls[page] != NULL) {
				ksx_char = uni_to_ksx1001_tbls[page][cell];
				hi = ksx_char >> 8;
				lo = ksx_char & 0xff;
			}
			/*
			 * CP949 extension.  It is tried on any KS X 1001
			 * miss (hi == 0), not only when the KS X 1001 page
			 * is absent: otherwise a CP949-only character that
			 * shares a page with KS X 1001 characters would be
			 * reported as unmappable.
			 */
			if (!hi && (compat & EUCKR_CP949_EXTENSION)
			    && uni_to_cp949_tbls[page] != NULL) {
				ksx_char = uni_to_cp949_tbls[page][cell];
				hi = ksx_char >> 8;
				lo = ksx_char & 0xff;
			}
		}

		/* A zero lead byte means no two-byte code was found. */
		if (hi) {
			s[pos++] = hi;
			s[pos++] = lo;
			continue;
		}

		/* Not found */
		if (err) {
			*err = (int)i;
			free(s);
			return NULL;
		}
		s[pos++] = '?';
	}
	s[pos] = 0;
	return s;
}
/*
* Unicode to EUC-KR: calls u2c_euckr_doconv() with compat == 0, i.e.
* with the CP949 extension handling disabled.
*/
static char *u2c_euckr(const struct unicode_info *u,
const unicode_char *str, int *err)
{
return u2c_euckr_doconv(u, str, err, 0);
}
/*
* Unicode to CP949: calls u2c_euckr_doconv() with
* EUCKR_CP949_EXTENSION set.
*/
static char *u2c_cp949(const struct unicode_info *u,
const unicode_char *str, int *err)
{
return u2c_euckr_doconv(u, str, err, EUCKR_CP949_EXTENSION);
}
/*
 * Return a newly allocated copy of cp with the letters 'a'..'z' converted
 * to upper case; all other characters are left alone.
 *
 * The text is converted to Unicode with u->c2u, modified, and converted
 * back with u->u2c.
 *
 * u   Character set descriptor supplying the c2u/u2c converters.
 * cp  Input string in that character set.
 * ip  Passed to u->c2u as its error argument.
 *
 * Returns the new string (released with free()), or NULL if either
 * conversion returns NULL.
 */
static char *toupper_func(const struct unicode_info *u,
			  const char *cp, int *ip)
{
	unicode_char *uc = (*u->c2u)(u, cp, ip);
	char *s;
	size_t i;

	if (!uc)
		return (NULL);

	/* Walk the whole string, as tolower_func() does; a fixed cap on
	 * the index would leave the tail of a long string unconverted. */
	for (i = 0; uc[i]; i++) {
		if ((unicode_char)'a' <= uc[i] && uc[i] <= (unicode_char)'z')
			uc[i] = uc[i] - ((unicode_char)'a' - (unicode_char)'A');
	}

	/* The reverse conversion runs without error reporting. */
	s = (*u->u2c)(u, uc, NULL);
	free(uc);
	return (s);
}
static char *tolower_func(const struct unicode_info *u,
const char *cp, int *ip)
{
unicode_char *uc = (*u->c2u)(u, cp, ip);
char *s;
size_t i;
if (!uc)
return (NULL);
for (i=0; uc[i]; i++) {
if ((unicode_char)'A' <= uc[i] && uc[i] <= (unicode_char)'Z')
uc[i] = uc[i] + ((unicode_char)'a' - (unicode_char)'A');
}
s = (*u->u2c)(u, uc, NULL);
free(uc);
return (s);
}
/*
 * "Title case" conversion.  No per-character case mapping is applied:
 * the text is only converted to Unicode with u->c2u and back with u->u2c
 * (called with a NULL error pointer).
 *
 * Returns the new string (released with free()), or NULL if either
 * conversion returns NULL.
 */
static char *totitle_func(const struct unicode_info *u,
			  const char *cp, int *ip)
{
	char *result = NULL;
	unicode_char *wide = (*u->c2u)(u, cp, ip);

	if (wide != NULL) {
		/* Title-case mapping is intentionally left out here. */
		result = (*u->u2c)(u, wide, NULL);
		free(wide);
	}
	return (result);
}
/* UTF-8 descriptor, defined elsewhere; each descriptor below stores its address. */
extern const struct unicode_info unicode_UTF8;
/*
* Character set descriptor for ISO-2022-KR.  The initializer is
* positional: name string, flag bits, the two converters, the three case
* functions, and a pointer to unicode_UTF8.
* NOTE(review): field names and the meaning of the flags and of the last
* member come from struct unicode_info in unicode.h -- not visible here.
*/
const struct unicode_info unicode_ISO2022_KR = {
"ISO-2022-KR",
UNICODE_MB | UNICODE_REPLACEABLE | UNICODE_SISO |
UNICODE_HEADER_BASE64,
c2u_iso2022kr,
u2c_iso2022kr,
toupper_func,
tolower_func,
totitle_func,
&unicode_UTF8
};
/*
* Character set descriptor for EUC-KR, using the converters that call
* the *_doconv() functions with the CP949 extension disabled.  Same
* positional layout as the other descriptors in this file; it is the only
* one of them that sets UNICODE_USASCII.
* NOTE(review): flag semantics are defined in unicode.h -- not visible here.
*/
const struct unicode_info unicode_EUC_KR = {
"EUC-KR",
UNICODE_MB | UNICODE_REPLACEABLE | UNICODE_USASCII |
UNICODE_HEADER_BASE64 | UNICODE_BODY_BASE64,
c2u_euckr,
u2c_euckr,
toupper_func,
tolower_func,
totitle_func,
&unicode_UTF8
};
/*
* Character set descriptor for CP949, using the converters that call
* the *_doconv() functions with EUCKR_CP949_EXTENSION set.  Unlike
* unicode_EUC_KR it does not set UNICODE_USASCII; the CP949 converters
* map bytes 0x5C and 0x7E to WON SIGN and OVERLINE.
* NOTE(review): flag semantics are defined in unicode.h -- not visible here.
*/
const struct unicode_info unicode_CP949 = {
"CP949",
UNICODE_MB | UNICODE_REPLACEABLE |
UNICODE_HEADER_BASE64 | UNICODE_BODY_BASE64,
c2u_cp949,
u2c_cp949,
toupper_func,
tolower_func,
totitle_func,
&unicode_UTF8
};