/*
** Viewing file: gb2312.c (7.65 KB) -rw-r--r--
** (Residue of a web file viewer's "Select action/file-type" menu -- entries
** "Code", "Session", "SDB" -- was captured with this file; it is not part
** of the program.)
*/
/*
** Copyright 2000-2002 Double Precision, Inc.
** See COPYING for distribution information.
**
** $Id: gb2312.c,v 1.13 2004/05/23 14:28:24 mrsam Exp $
*/
#include "gb2312.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
** Per-lead-byte conversion tables, indexed by (lead byte - 0xA1), covering
** lead bytes 0xA1 through 0xFE (94 slots).
**
** c2u() indexes a non-NULL entry by (trail byte - 0xA1) to obtain the
** unicode_char for a two-octet sequence, and treats a 0 value as "no
** mapping".  A NULL entry means there is no table for that lead byte.
*/
static const unicode_char * const gb2312[]= {
gb2312_a1,
gb2312_a2,
gb2312_a3,
gb2312_a4,
gb2312_a5,
gb2312_a6,
gb2312_a7,
gb2312_a8,
gb2312_a9,
NULL,	/* 0xAA: no table */
NULL,	/* 0xAB: no table */
NULL,	/* 0xAC: no table */
NULL,	/* 0xAD: no table */
NULL,	/* 0xAE: no table */
NULL,	/* 0xAF: no table */
gb2312_b0,
gb2312_b1,
gb2312_b2,
gb2312_b3,
gb2312_b4,
gb2312_b5,
gb2312_b6,
gb2312_b7,
gb2312_b8,
gb2312_b9,
gb2312_ba,
gb2312_bb,
gb2312_bc,
gb2312_bd,
gb2312_be,
gb2312_bf,
gb2312_c0,
gb2312_c1,
gb2312_c2,
gb2312_c3,
gb2312_c4,
gb2312_c5,
gb2312_c6,
gb2312_c7,
gb2312_c8,
gb2312_c9,
gb2312_ca,
gb2312_cb,
gb2312_cc,
gb2312_cd,
gb2312_ce,
gb2312_cf,
gb2312_d0,
gb2312_d1,
gb2312_d2,
gb2312_d3,
gb2312_d4,
gb2312_d5,
gb2312_d6,
gb2312_d7,
gb2312_d8,
gb2312_d9,
gb2312_da,
gb2312_db,
gb2312_dc,
gb2312_dd,
gb2312_de,
gb2312_df,
gb2312_e0,
gb2312_e1,
gb2312_e2,
gb2312_e3,
gb2312_e4,
gb2312_e5,
gb2312_e6,
gb2312_e7,
gb2312_e8,
gb2312_e9,
gb2312_ea,
gb2312_eb,
gb2312_ec,
gb2312_ed,
gb2312_ee,
gb2312_ef,
gb2312_f0,
gb2312_f1,
gb2312_f2,
gb2312_f3,
gb2312_f4,
gb2312_f5,
gb2312_f6,
gb2312_f7,
NULL,	/* 0xF8: no table */
NULL,	/* 0xF9: no table */
NULL,	/* 0xFA: no table */
NULL,	/* 0xFB: no table */
NULL,	/* 0xFC: no table */
NULL,	/* 0xFD: no table */
NULL};	/* 0xFE: no table */
static unicode_char *c2u(const struct unicode_info *u,
const char *cp, int *err)
{
size_t i, cnt;
unicode_char *uc;
if (err)
*err= -1;
/*
** Count the number of potential unicode characters first.
*/
for (i=cnt=0; cp[i]; i++)
{
if ( (int)(unsigned char)cp[i] < 0xA1 ||
(int)(unsigned char)cp[i] > 0xFE ||
cp[i+1] == 0)
{
++cnt;
continue;
}
++i;
++cnt;
}
uc=malloc((cnt+1)*sizeof(unicode_char));
if (!uc)
return (NULL);
i=cnt=0;
while (cp[i])
{
int a=(int)(unsigned char)cp[i], b;
if ( a >= 0xA1 && a <= 0xFE && cp[i+1])
{
unicode_char ucv;
b=(int)(unsigned char)cp[i+1];
if (0xA1 <= b && b <= 0xFE
&& gb2312[a-0xA1]
&& (ucv=gb2312[a-0xA1][b-0xA1]))
uc[cnt++]= ucv;
else if (err)
{
*err = i;
free(uc);
return NULL;
}
else
uc[cnt++] = (unicode_char)0xFFFD;
i += 2;
}
else if (a < (unsigned)0x80)
{
uc[cnt++]=a;
i += 1;
}
else if (err)
{
*err=i;
free(uc);
return (NULL);
}
else
{
uc[cnt++]= 0xFFFD;
i += 1;
}
}
uc[cnt]=0;
return (uc);
}
/*
** Reverse lookup: return the value stored in gb2312_revtable_octets for
** unicode character c, or 0 if c is not in the reverse table.  u2c() emits
** the result as two octets, high byte first.
**
** The character is hashed into a bucket (c modulo gb2312_revhash_size);
** gb2312_revtable_index gives the position where that bucket's entries
** start, and the scan stops at the first entry that hashes elsewhere.
*/
static unsigned revlookup(unicode_char c)
{
	const size_t n_entries=
		sizeof(gb2312_revtable_uc)/sizeof(gb2312_revtable_uc[0]);
	unsigned home;
	unsigned slot;

	home=c % gb2312_revhash_size;
	slot=gb2312_revtable_index[ home ];

	while (slot < n_entries)
	{
		unicode_char candidate=gb2312_revtable_uc[slot];

		if (candidate == c)
			return (gb2312_revtable_octets[slot]);

		/* Walked off the end of this bucket: not present */
		if ((candidate % gb2312_revhash_size) != home)
			break;

		++slot;
	}
	return (0);
}
/*
** Convert a 0-terminated array of unicode characters into a newly
** malloc()ed, NUL-terminated GB2312 octet string.
**
** u   - not referenced by this function.
** cp  - the unicode characters to convert.
** err - optional.  When non-NULL it is set to -1 on entry; if a character
**       has no GB2312 representation, its index in cp is stored in *err,
**       the partial result is freed and NULL is returned.  When NULL, such
**       a character is written as '?' instead.
**
** Returns the string (the caller frees it), or NULL if malloc() fails or a
** conversion error was reported through err.
*/
static char *u2c(const struct unicode_info *u,
		 const unicode_char *cp, int *err)
{
	size_t need, pos, out_len;
	char *out;

	if (err)
		*err= -1;

	/*
	** Size the output buffer.  Characters up to 0x7f take one octet;
	** anything above is budgeted two octets.  This is an upper bound,
	** since a few characters above 0x7f are written as a single octet.
	*/
	need=0;
	for (pos=0; cp[pos]; pos++)
		need += (cp[pos] > 0x7f) ? 2:1;

	out=malloc(need+1);
	if (!out)
		return (NULL);

	out_len=0;
	pos=0;
	while (cp[pos])
	{
		const unicode_char ch=cp[pos];
		unsigned octets;

		if (ch < (unicode_char)0x0080)
		{
			/* US-ASCII or GB 1988 (ISO 646 PRC version) */
			out[out_len++]= (char)ch;
		}
		else if (ch == (unicode_char)0x00A5)
		{
			/* GB 1988 compatibility: YEN SIGN == yuan sign */
			out[out_len++] = 0x24;
		}
		else if (ch == (unicode_char)0x203E)
		{
			/* GB 1988 compatibility: OVERLINE */
			out[out_len++] = 0x7E;
		}
		else if ((octets=revlookup(ch)) != 0)
		{
			out[out_len++]= (char)(octets >> 8);
			out[out_len++]= (char)(octets & 0x00FF);
		}
		else if (err)
		{
			*err=pos;
			free(out);
			return (NULL);
		}
		else
		{
			out[out_len++] = '?';
		}
		++pos;
	}
	out[out_len]=0;
	return (out);
}
/*
** Return a newly malloc()ed copy of the GB2312 string cp with each
** character replaced by its unicode_uc() mapping.
**
** The string is converted to unicode with c2u() (which receives ip and so
** decides how invalid input is reported), mapped one character at a time,
** and converted back with u2c().  A mapped character is adopted only when
** revlookup() finds it; otherwise the original character is kept.
** NOTE(review): whether US-ASCII letters pass that check depends on the
** reverse table contents, which are not visible here -- confirm.
**
** Returns NULL if c2u() or u2c() returns NULL.
*/
static char *toupper_func(const struct unicode_info *u,
			  const char *cp, int *ip)
{
	unicode_char *wide=c2u(u, cp, ip);
	unicode_char *p;
	char *result;

	if (wide == NULL)
		return (NULL);

	for (p=wide; *p; ++p)
	{
		unicode_char upper=unicode_uc(*p);

		if (revlookup(upper))
			*p=upper;
	}

	result=u2c(u, wide, NULL);
	free(wide);
	return (result);
}
/*
** Return a newly malloc()ed copy of the GB2312 string cp with each
** character replaced by its unicode_lc() mapping.
**
** The string is converted to unicode with c2u() (which receives ip and so
** decides how invalid input is reported), mapped one character at a time,
** and converted back with u2c().  A mapped character is adopted only when
** revlookup() finds it; otherwise the original character is kept.
** NOTE(review): whether US-ASCII letters pass that check depends on the
** reverse table contents, which are not visible here -- confirm.
**
** Returns NULL if c2u() or u2c() returns NULL.
*/
static char *tolower_func(const struct unicode_info *u,
			  const char *cp, int *ip)
{
	unicode_char *wide=c2u(u, cp, ip);
	unicode_char *p;
	char *result;

	if (wide == NULL)
		return (NULL);

	for (p=wide; *p; ++p)
	{
		unicode_char lower=unicode_lc(*p);

		if (revlookup(lower))
			*p=lower;
	}

	result=u2c(u, wide, NULL);
	free(wide);
	return (result);
}
/*
** Return a newly malloc()ed copy of the GB2312 string cp with each
** character replaced by its unicode_tc() mapping.
**
** The string is converted to unicode with c2u() (which receives ip and so
** decides how invalid input is reported), mapped one character at a time,
** and converted back with u2c().  A mapped character is adopted only when
** revlookup() finds it; otherwise the original character is kept.
** NOTE(review): whether US-ASCII letters pass that check depends on the
** reverse table contents, which are not visible here -- confirm.
**
** Returns NULL if c2u() or u2c() returns NULL.
*/
static char *totitle_func(const struct unicode_info *u,
			  const char *cp, int *ip)
{
	unicode_char *wide=c2u(u, cp, ip);
	unicode_char *p;
	char *result;

	if (wide == NULL)
		return (NULL);

	for (p=wide; *p; ++p)
	{
		unicode_char title=unicode_tc(*p);

		if (revlookup(title))
			*p=title;
	}

	result=u2c(u, wide, NULL);
	free(wide);
	return (result);
}
/*
** The exported descriptor for this character set.  The members are given
** positionally: a name string, a combination of UNICODE_* flag bits, and
** the five conversion routines defined in this file.  struct unicode_info
** itself is declared elsewhere, so the order here must follow that
** declaration.
*/
const struct unicode_info unicode_GB2312 = {
"GB2312",
UNICODE_MB | UNICODE_REPLACEABLE | UNICODE_USASCII |
UNICODE_HEADER_BASE64 | UNICODE_BODY_BASE64,
c2u,		/* GB2312 octets -> unicode characters */
u2c,		/* unicode characters -> GB2312 octets */
toupper_func,	/* maps through unicode_uc() */
tolower_func,	/* maps through unicode_lc() */
totitle_func};	/* maps through unicode_tc() */
#if 0
/*
** Stand-alone self-test, normally compiled out.
**
** Reads "U+xxxx kIRG_GSource 0-xxxx" lines from a gzipped Unihan database
** through popen(), and for each one checks that c2u() and u2c() round-trip
** the character and that the three case-mapping functions succeed.  It
** then exercises US-ASCII input, the replacement-character fallback (err
** == NULL) and the abort-with-index path (err != NULL) in both directions.
**
** Returns 0 on success or when the database cannot be opened, 1 on the
** first failed check.
**
** Every conversion routine takes the character set descriptor as its first
** argument, so &unicode_GB2312 is passed throughout.  popen() and pclose()
** are POSIX, not ISO C.
*/
int main(void)
{
	const struct unicode_info *cs= &unicode_GB2312;
	FILE *fp=popen("gunzip -cd <Unihan-3.2.0.txt.gz", "r");
	char buf[4000];
	unicode_char *uc;
	char *s, *p;
	int dummyi;

	if (!fp)
		return (0);

	while (fgets(buf, sizeof(buf), fp))
	{
		unsigned a, b, c;
		int dummy;

		if (sscanf(buf, "U+%4x kIRG_GSource 0-%4x", &b, &a) != 2)
			continue;

		/* Set the high bit of both octets */
		a |= 0x8080;
		printf("0x%04x 0x%04x: ", a, b);

		buf[0]= a / 256;
		buf[1]= a % 256;
		buf[2]=0;

		uc=c2u(cs, buf, &dummy);
		if (!uc)
		{
			printf("c2u failure: %d\n", dummy);
			return (1);
		}
		if (uc[0] != b || uc[1])
		{
			printf("c2u failure: got 0x%04x, expected 0x%04x\n",
			       (unsigned)uc[0], (unsigned)b);
			return (1);
		}

		s=u2c(cs, uc, &dummy);
		free(uc);
		if (!s)
		{
			printf("u2c failure: %d\n", dummy);
			return (1);
		}

		c=0;
		if (!s[0] || !s[1] || s[2] ||
		    (c=(int)(unsigned char)s[0] * 256 +
		     (unsigned char)s[1]) != a)
		{
			printf("u2c failure: got 0x%04x, expected 0x%04x\n",
			       c, a);
			return (1);
		}

		p=toupper_func(cs, s, NULL);
		if (!p)
		{
			printf("toupper failure\n");
			return (1);
		}
		if (strcmp(p, s))
			printf("toupper ");
		free(p);

		p=tolower_func(cs, s, NULL);
		if (!p)
		{
			printf("tolower failure\n");
			return (1);
		}
		if (strcmp(p, s))
			printf("tolower ");
		free(p);

		p=totitle_func(cs, s, NULL);
		if (!p)
		{
			printf("totitle failure\n");
			return (1);
		}
		if (strcmp(p, s))
			printf("totitle ");
		free(p);

		free(s);
		printf("ok\n");
	}
	/* A stream opened by popen() must be closed with pclose() */
	pclose(fp);

	/* Plain US-ASCII must convert in both directions */
	buf[0]=0x40;
	buf[1]=0;
	uc=c2u(cs, buf, NULL);
	if (!uc)
	{
		printf("us-ascii c2u failure\n");
		return (1);
	}
	s=u2c(cs, uc, NULL);
	free(uc);
	if (!s)
	{
		printf("us-asccu u2c failure\n");
		return (1);
	}
	free(s);

	/* A stray high octet with err == NULL: replacement expected */
	buf[0]=0x40;
	buf[1]=0xF0;
	buf[2]=0;
	uc=c2u(cs, buf, NULL);
	if (!uc)
	{
		printf("fallback failed\n");
		return (1);
	}
	printf("fallback: %04x %04x\n", (unsigned)uc[0],
	       (unsigned)uc[1]);

	s=u2c(cs, uc, NULL);
	free(uc);
	if (!s)
	{
		printf("fallback-reverse failed\n");
		return (1);
	}
	printf("fallback: %02x %02x\n", (int)(unsigned char)s[0],
	       (int)(unsigned char)s[1]);
	free(s);

	/* A bad trail byte with err != NULL: conversion must abort */
	buf[0]=0xB2;
	buf[1]=0x20;
	buf[2]=0;
	uc=c2u(cs, buf, &dummyi);
	if (uc)
	{
		printf("abort failed\n");
		return (1);
	}
	printf("aborted at index %d\n", dummyi);

	{
		static unicode_char testing[]={0x0040, 0x1000, 0};

		uc=testing;
		s=u2c(cs, uc, NULL);
		if (!s)
		{
			printf("abort-fallback failed\n");
			return (1);
		}
		printf("abort-fallback: %02x %02x\n", (int)(unsigned char)s[0],
		       (int)(unsigned char)s[1]);
		free(s);
		uc=testing;
	}

	s=u2c(cs, uc, &dummyi);
	if (s)
	{
		printf("abort-abort failed\n");
		return (1);
	}
	printf("abort-aborted: index %d\n", dummyi);
	return (0);
}
#endif