Viewing file:
big5.pl (23.51 KB) -rwxr-xr-xSelect action/file-type:

(
+) |

(
+) |

(
+) |
Code (
+) |
Session (
+) |

(
+) |
SDB (
+) |

(
+) |

(
+) |

(
+) |

(
+) |

(
+) |
#!/usr/bin/perl
# USAGE: perl big5.pl > big5.h
# Requires:
# - Unihan-3.2.0.txt.gz
# - big5-iso.txt, found on
# http://www.info.gov.hk/digital21/chi/hkscs/download/big5-iso.txt
require "cjkcompat.pl";
# Number of buckets in the Unicode -> Big5 reverse hash table: add() files
# every code point under (code point % $revhash).
my $revhash=1501;
# Input files.
$unihan = "Unihan-3.2.0.txt.gz";
$hkscs = "big5-iso.txt";
# Get hanzi maps
#
# Every "U+XXXX kBigFive YYYY" record of the Unihan database becomes one
# Big5 <-> Unicode pair.  gunzip is started with a list-form pipe open, so
# no shell is involved and the file name is passed to it verbatim.
open (my $unihan_fh, '-|', 'gunzip', '-cd', $unihan)
    or die "${unihan}: $!\n";
while (my $line = <$unihan_fh>)
{
    chomp $line;
    $line =~ s/\#.*//;
    # Both captures are restricted to hexadecimal digits and converted with
    # hex(); text taken from the data file is never evaluated as Perl code.
    next unless $line =~ /^U\+([0-9A-F]{4,5})\s+kBigFive\s+([0-9A-F]{4})/i;
    my ($unicode, $code) = (hex($1), hex($2));
    # 0xA2CD is registered by hand further down (as U+3039), so the Unihan
    # record for it is ignored here.
    next if $code == 0xA2CD;
    # Four hex digits always fit in 16 bits, so no range check is needed
    # before splitting the code into lead and trail byte.
    &add(int($code/256), $code % 256, $unicode);
}
# For a pipe, close() reports a failed gunzip: $! is set on a system error,
# otherwise $? carries the child's exit status.
close ($unihan_fh)
    or die "gunzip -cd ${unihan}: "
        . ($! ? "$!" : "exit status " . ($? >> 8)) . "\n";
# Get HKSCS extension maps
#
# Only lines that split into exactly four whitespace-separated fields are
# used: field 0 is read as the Big5 code and field 3 as the Unicode code
# point, both in hexadecimal.  Lines whose Big5 field evaluates to zero are
# skipped.
open (my $hkscs_fh, '<', $hkscs) or die "${hkscs}: $!\n";
my $count = 0;    # number of HKSCS mappings registered
while (my $line = <$hkscs_fh>)
{
    chomp $line;
    $line =~ s/\#.*//;
    my @fields = split(/\s+/, $line);
    next unless scalar(@fields) == 4;
    my ($unicode, $code) = (hex($fields[3]), hex($fields[0]));
    next unless $code;
    die "${hkscs}: no Unicode value for Big5 code $fields[0]\n"
        unless $unicode;
    # PUA
    ####next if 0xE000 <= $unicode && $unicode <= 0xF8FF;
    die "${hkscs}: Big5 code $fields[0] does not fit in two bytes\n"
        if $code < 0 || $code > 65535;
    # The trailing 1 marks the mapping as HKSCS, which selects the wider
    # lead-byte range check inside add().
    &add(int($code/256), $code % 256, $unicode, 1);
    $count++;
}
close ($hkscs_fh);
# The expected size of the table is hard-coded so that a changed input file
# is noticed instead of silently producing different tables.
if ($count != 4818) {
    die "$count characters are found. HKSCS extension table has been updated. Check ${hkscs}.";
}
# Unihan-3.2.0 does not make mention of Big5 non-hanzi.
# So manually add a converting map...
#
# Note:
# non-HKSCS Map is based on:
# http://wakaba-web.hp.infoseek.co.jp/table/big5-eten.txt
#
# Each call below is add(lead byte, trail byte, Unicode code point).
# add() records the pair in both directions (forward table and reverse
# hash) and dies if either side has already been mapped.  dup() records
# the forward direction only and requires that the Unicode code point
# already has a reverse mapping.
#
# Lead byte 0xA1: punctuation, brackets, symbols and mathematical signs.
&add(0xA1,0x40,0x3000); # IDEOGRAPHIC SPACE
&add(0xA1,0x41,0xFF0C); # FULLWIDTH COMMA
&add(0xA1,0x42,0x3001); # IDEOGRAPHIC COMMA
&add(0xA1,0x43,0x3002); # IDEOGRAPHIC FULL STOP
&add(0xA1,0x44,0xFF0E); # FULLWIDTH FULL STOP
&add(0xA1,0x45,0x2027); # HYPHENATION POINT
&add(0xA1,0x46,0xFF1B); # FULLWIDTH SEMICOLON
&add(0xA1,0x47,0xFF1A); # FULLWIDTH COLON
&add(0xA1,0x48,0xFF1F); # FULLWIDTH QUESTION MARK
&add(0xA1,0x49,0xFF01); # FULLWIDTH EXCLAMATION MARK
&add(0xA1,0x4A,0xFE30); # PRESENTATION FORM FOR VERTICAL TWO DOT LEADER
&add(0xA1,0x4B,0x2026); # HORIZONTAL ELLIPSIS
&add(0xA1,0x4C,0x2025); # TWO DOT LEADER
&add(0xA1,0x4D,0xFE50); # SMALL COMMA
&add(0xA1,0x4E,0xFE51); # SMALL IDEOGRAPHIC COMMA
&add(0xA1,0x4F,0xFE52); # SMALL FULL STOP
&add(0xA1,0x50,0x00B7); # MIDDLE DOT
&add(0xA1,0x51,0xFE54); # SMALL SEMICOLON
&add(0xA1,0x52,0xFE55); # SMALL COLON
&add(0xA1,0x53,0xFE56); # SMALL QUESTION MARK
&add(0xA1,0x54,0xFE57); # SMALL EXCLAMATION MARK
&add(0xA1,0x55,0xFF5C); # FULLWIDTH VERTICAL LINE
&add(0xA1,0x56,0x2013); # EN DASH
&add(0xA1,0x57,0xFE31); # PRESENTATION FORM FOR VERTICAL EM DASH
&add(0xA1,0x58,0x2014); # EM DASH
&add(0xA1,0x59,0xFE33); # PRESENTATION FORM FOR VERTICAL LOW LINE
&add(0xA1,0x5A,0x2574); # BOX DRAWINGS LIGHT LEFT
&add(0xA1,0x5B,0xFE34); # PRESENTATION FORM FOR VERTICAL WAVY LOW LINE
&add(0xA1,0x5C,0xFE4F); # WAVY LOW LINE
&add(0xA1,0x5D,0xFF08); # FULLWIDTH LEFT PARENTHESIS
&add(0xA1,0x5E,0xFF09); # FULLWIDTH RIGHT PARENTHESIS
&add(0xA1,0x5F,0xFE35); # PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS
&add(0xA1,0x60,0xFE36); # PRESENTATION FORM FOR VERTICAL RIGHT PARENTHESIS
&add(0xA1,0x61,0xFF5B); # FULLWIDTH LEFT CURLY BRACKET
&add(0xA1,0x62,0xFF5D); # FULLWIDTH RIGHT CURLY BRACKET
&add(0xA1,0x63,0xFE37); # PRESENTATION FORM FOR VERTICAL LEFT CURLY BRACKET
&add(0xA1,0x64,0xFE38); # PRESENTATION FORM FOR VERTICAL RIGHT CURLY BRACKET
&add(0xA1,0x65,0x3014); # LEFT TORTOISE SHELL BRACKET
&add(0xA1,0x66,0x3015); # RIGHT TORTOISE SHELL BRACKET
&add(0xA1,0x67,0xFE39); # PRESENTATION FORM FOR VERTICAL LEFT TORTOISE SHELL BRACKET
&add(0xA1,0x68,0xFE3A); # PRESENTATION FORM FOR VERTICAL RIGHT TORTOISE SHELL BRACKET
&add(0xA1,0x69,0x3010); # LEFT BLACK LENTICULAR BRACKET
&add(0xA1,0x6A,0x3011); # RIGHT BLACK LENTICULAR BRACKET
&add(0xA1,0x6B,0xFE3B); # PRESENTATION FORM FOR VERTICAL LEFT BLACK LENTICULAR BRACKET
&add(0xA1,0x6C,0xFE3C); # PRESENTATION FORM FOR VERTICAL RIGHT BLACK LENTICULAR BRACKET
&add(0xA1,0x6D,0x300A); # LEFT DOUBLE ANGLE BRACKET
&add(0xA1,0x6E,0x300B); # RIGHT DOUBLE ANGLE BRACKET
&add(0xA1,0x6F,0xFE3D); # PRESENTATION FORM FOR VERTICAL LEFT DOUBLE ANGLE BRACKET
&add(0xA1,0x70,0xFE3E); # PRESENTATION FORM FOR VERTICAL RIGHT DOUBLE ANGLE BRACKET
&add(0xA1,0x71,0x3008); # LEFT ANGLE BRACKET
&add(0xA1,0x72,0x3009); # RIGHT ANGLE BRACKET
&add(0xA1,0x73,0xFE3F); # PRESENTATION FORM FOR VERTICAL LEFT ANGLE BRACKET
&add(0xA1,0x74,0xFE40); # PRESENTATION FORM FOR VERTICAL RIGHT ANGLE BRACKET
&add(0xA1,0x75,0x300C); # LEFT CORNER BRACKET
&add(0xA1,0x76,0x300D); # RIGHT CORNER BRACKET
&add(0xA1,0x77,0xFE41); # PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET
&add(0xA1,0x78,0xFE42); # PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET
&add(0xA1,0x79,0x300E); # LEFT WHITE CORNER BRACKET
&add(0xA1,0x7A,0x300F); # RIGHT WHITE CORNER BRACKET
&add(0xA1,0x7B,0xFE43); # PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET
&add(0xA1,0x7C,0xFE44); # PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET
&add(0xA1,0x7D,0xFE59); # SMALL LEFT PARENTHESIS
&add(0xA1,0x7E,0xFE5A); # SMALL RIGHT PARENTHESIS
&add(0xA1,0xA1,0xFE5B); # SMALL LEFT CURLY BRACKET
&add(0xA1,0xA2,0xFE5C); # SMALL RIGHT CURLY BRACKET
&add(0xA1,0xA3,0xFE5D); # SMALL LEFT TORTOISE SHELL BRACKET
&add(0xA1,0xA4,0xFE5E); # SMALL RIGHT TORTOISE SHELL BRACKET
&add(0xA1,0xA5,0x2018); # LEFT SINGLE QUOTATION MARK
&add(0xA1,0xA6,0x2019); # RIGHT SINGLE QUOTATION MARK
&add(0xA1,0xA7,0x201C); # LEFT DOUBLE QUOTATION MARK
&add(0xA1,0xA8,0x201D); # RIGHT DOUBLE QUOTATION MARK
&add(0xA1,0xA9,0x301D); # REVERSED DOUBLE PRIME QUOTATION MARK
&add(0xA1,0xAA,0x301E); # DOUBLE PRIME QUOTATION MARK
&add(0xA1,0xAB,0x2035); # REVERSED PRIME
&add(0xA1,0xAC,0x2032); # PRIME
&add(0xA1,0xAD,0xFF03); # FULLWIDTH NUMBER SIGN
&add(0xA1,0xAE,0xFF06); # FULLWIDTH AMPERSAND
&add(0xA1,0xAF,0xFF0A); # FULLWIDTH ASTERISK
&add(0xA1,0xB0,0x203B); # REFERENCE MARK
&add(0xA1,0xB1,0x00A7); # SECTION SIGN
&add(0xA1,0xB2,0x3003); # DITTO MARK
&add(0xA1,0xB3,0x25CB); # WHITE CIRCLE
&add(0xA1,0xB4,0x25CF); # BLACK CIRCLE
&add(0xA1,0xB5,0x25B3); # WHITE UP-POINTING TRIANGLE
&add(0xA1,0xB6,0x25B2); # BLACK UP-POINTING TRIANGLE
&add(0xA1,0xB7,0x25CE); # BULLSEYE
&add(0xA1,0xB8,0x2606); # WHITE STAR
&add(0xA1,0xB9,0x2605); # BLACK STAR
&add(0xA1,0xBA,0x25C7); # WHITE DIAMOND
&add(0xA1,0xBB,0x25C6); # BLACK DIAMOND
&add(0xA1,0xBC,0x25A1); # WHITE SQUARE
&add(0xA1,0xBD,0x25A0); # BLACK SQUARE
&add(0xA1,0xBE,0x25BD); # WHITE DOWN-POINTING TRIANGLE
&add(0xA1,0xBF,0x25BC); # BLACK DOWN-POINTING TRIANGLE
&add(0xA1,0xC0,0x32A3); # CIRCLED IDEOGRAPH CORRECT
&add(0xA1,0xC1,0x2105); # CARE OF
&add(0xA1,0xC2,0x00AF); # MACRON
&add(0xA1,0xC3,0xFFE3); # FULLWIDTH MACRON
&add(0xA1,0xC4,0xFF3F); # FULLWIDTH LOW LINE
&add(0xA1,0xC5,0x02CD); # MODIFIER LETTER LOW MACRON
&add(0xA1,0xC6,0xFE49); # DASHED OVERLINE
&add(0xA1,0xC7,0xFE4A); # CENTRELINE OVERLINE
&add(0xA1,0xC8,0xFE4D); # DASHED LOW LINE
&add(0xA1,0xC9,0xFE4E); # CENTRELINE LOW LINE
&add(0xA1,0xCA,0xFE4B); # WAVY OVERLINE
&add(0xA1,0xCB,0xFE4C); # DOUBLE WAVY OVERLINE
&add(0xA1,0xCC,0xFE5F); # SMALL NUMBER SIGN
&add(0xA1,0xCD,0xFE60); # SMALL AMPERSAND
&add(0xA1,0xCE,0xFE61); # SMALL ASTERISK
&add(0xA1,0xCF,0xFF0B); # FULLWIDTH PLUS SIGN
&add(0xA1,0xD0,0xFF0D); # FULLWIDTH HYPHEN-MINUS
&add(0xA1,0xD1,0x00D7); # MULTIPLICATION SIGN
&add(0xA1,0xD2,0x00F7); # DIVISION SIGN
&add(0xA1,0xD3,0x00B1); # PLUS-MINUS SIGN
&add(0xA1,0xD4,0x221A); # SQUARE ROOT
&add(0xA1,0xD5,0xFF1C); # FULLWIDTH LESS-THAN SIGN
&add(0xA1,0xD6,0xFF1E); # FULLWIDTH GREATER-THAN SIGN
&add(0xA1,0xD7,0xFF1D); # FULLWIDTH EQUALS SIGN
&add(0xA1,0xD8,0x2266); # LESS-THAN OVER EQUAL TO
&add(0xA1,0xD9,0x2267); # GREATER-THAN OVER EQUAL TO
&add(0xA1,0xDA,0x2260); # NOT EQUAL TO
&add(0xA1,0xDB,0x221E); # INFINITY
&add(0xA1,0xDC,0x2252); # APPROXIMATELY EQUAL TO OR THE IMAGE OF
&add(0xA1,0xDD,0x2261); # IDENTICAL TO
&add(0xA1,0xDE,0xFE62); # SMALL PLUS SIGN
&add(0xA1,0xDF,0xFE63); # SMALL HYPHEN-MINUS
&add(0xA1,0xE0,0xFE64); # SMALL LESS-THAN SIGN
&add(0xA1,0xE1,0xFE65); # SMALL GREATER-THAN SIGN
&add(0xA1,0xE2,0xFE66); # SMALL EQUALS SIGN
&add(0xA1,0xE3,0xFF5E); # FULLWIDTH TILDE
&add(0xA1,0xE4,0x2229); # INTERSECTION
&add(0xA1,0xE5,0x222A); # UNION
&add(0xA1,0xE6,0x22A5); # UP TACK
&add(0xA1,0xE7,0x2220); # ANGLE
&add(0xA1,0xE8,0x221F); # RIGHT ANGLE
&add(0xA1,0xE9,0x22BF); # RIGHT TRIANGLE
&add(0xA1,0xEA,0x33D2); # SQUARE LOG
&add(0xA1,0xEB,0x33D1); # SQUARE LN
&add(0xA1,0xEC,0x222B); # INTEGRAL
&add(0xA1,0xED,0x222E); # CONTOUR INTEGRAL
&add(0xA1,0xEE,0x2235); # BECAUSE
&add(0xA1,0xEF,0x2234); # THEREFORE
&add(0xA1,0xF0,0x2640); # FEMALE SIGN
&add(0xA1,0xF1,0x2642); # MALE SIGN
&add(0xA1,0xF2,0x2295); # CIRCLED PLUS
&add(0xA1,0xF3,0x2299); # CIRCLED DOT OPERATOR
&add(0xA1,0xF4,0x2191); # UPWARDS ARROW
&add(0xA1,0xF5,0x2193); # DOWNWARDS ARROW
&add(0xA1,0xF6,0x2190); # LEFTWARDS ARROW
&add(0xA1,0xF7,0x2192); # RIGHTWARDS ARROW
&add(0xA1,0xF8,0x2196); # NORTH WEST ARROW
&add(0xA1,0xF9,0x2197); # NORTH EAST ARROW
&add(0xA1,0xFA,0x2199); # SOUTH WEST ARROW
&add(0xA1,0xFB,0x2198); # SOUTH EAST ARROW
&add(0xA1,0xFC,0x2225); # PARALLEL TO
&add(0xA1,0xFD,0x2223); # DIVIDES
&add(0xA1,0xFE,0xFF0F); # FULLWIDTH SOLIDUS
# Lead byte 0xA2: currency and unit symbols, block elements and box
# drawings.
&add(0xA2,0x40,0xFF3C); # FULLWIDTH REVERSE SOLIDUS
&add(0xA2,0x41,0x2215); # DIVISION SLASH
&add(0xA2,0x42,0xFE68); # SMALL REVERSE SOLIDUS
&add(0xA2,0x43,0xFF04); # FULLWIDTH DOLLAR SIGN
&add(0xA2,0x44,0xFFE5); # FULLWIDTH YEN SIGN
&add(0xA2,0x45,0x3012); # POSTAL MARK
&add(0xA2,0x46,0xFFE0); # FULLWIDTH CENT SIGN
&add(0xA2,0x47,0xFFE1); # FULLWIDTH POUND SIGN
&add(0xA2,0x48,0xFF05); # FULLWIDTH PERCENT SIGN
&add(0xA2,0x49,0xFF20); # FULLWIDTH COMMERCIAL AT
&add(0xA2,0x4A,0x2103); # DEGREE CELSIUS
&add(0xA2,0x4B,0x2109); # DEGREE FAHRENHEIT
&add(0xA2,0x4C,0xFE69); # SMALL DOLLAR SIGN
&add(0xA2,0x4D,0xFE6A); # SMALL PERCENT SIGN
&add(0xA2,0x4E,0xFE6B); # SMALL COMMERCIAL AT
&add(0xA2,0x4F,0x33D5); # SQUARE MIL
&add(0xA2,0x50,0x339C); # SQUARE MM
&add(0xA2,0x51,0x339D); # SQUARE CM
&add(0xA2,0x52,0x339E); # SQUARE KM
&add(0xA2,0x53,0x33CE); # SQUARE KM CAPITAL
&add(0xA2,0x54,0x33A1); # SQUARE M SQUARED
&add(0xA2,0x55,0x338E); # SQUARE MG
&add(0xA2,0x56,0x338F); # SQUARE KG
&add(0xA2,0x57,0x33C4); # SQUARE CC
&add(0xA2,0x58,0x00B0); # DEGREE SIGN
# 0xA259-0xA261: <CJK>
# (not added here; presumably already supplied by the kBigFive data read
# earlier, since several dup() calls later in this file look them up)
&add(0xA2,0x62,0x2581); # LOWER ONE EIGHTH BLOCK
&add(0xA2,0x63,0x2582); # LOWER ONE QUARTER BLOCK
&add(0xA2,0x64,0x2583); # LOWER THREE EIGHTHS BLOCK
&add(0xA2,0x65,0x2584); # LOWER HALF BLOCK
&add(0xA2,0x66,0x2585); # LOWER FIVE EIGHTHS BLOCK
&add(0xA2,0x67,0x2586); # LOWER THREE QUARTERS BLOCK
&add(0xA2,0x68,0x2587); # LOWER SEVEN EIGHTHS BLOCK
&add(0xA2,0x69,0x2588); # FULL BLOCK
&add(0xA2,0x6A,0x258F); # LEFT ONE EIGHTH BLOCK
&add(0xA2,0x6B,0x258E); # LEFT ONE QUARTER BLOCK
&add(0xA2,0x6C,0x258D); # LEFT THREE EIGHTHS BLOCK
&add(0xA2,0x6D,0x258C); # LEFT HALF BLOCK
&add(0xA2,0x6E,0x258B); # LEFT FIVE EIGHTHS BLOCK
&add(0xA2,0x6F,0x258A); # LEFT THREE QUARTERS BLOCK
&add(0xA2,0x70,0x2589); # LEFT SEVEN EIGHTHS BLOCK
&add(0xA2,0x71,0x253C); # BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL
&add(0xA2,0x72,0x2534); # BOX DRAWINGS LIGHT UP AND HORIZONTAL
&add(0xA2,0x73,0x252C); # BOX DRAWINGS LIGHT DOWN AND HORIZONTAL
&add(0xA2,0x74,0x2524); # BOX DRAWINGS LIGHT VERTICAL AND LEFT
&add(0xA2,0x75,0x251C); # BOX DRAWINGS LIGHT VERTICAL AND RIGHT
&add(0xA2,0x76,0x2594); # UPPER ONE EIGHTH BLOCK
&add(0xA2,0x77,0x2500); # BOX DRAWINGS LIGHT HORIZONTAL
&add(0xA2,0x78,0x2502); # BOX DRAWINGS LIGHT VERTICAL
&add(0xA2,0x79,0x2595); # RIGHT ONE EIGHTH BLOCK
&add(0xA2,0x7A,0x250C); # BOX DRAWINGS LIGHT DOWN AND RIGHT
&add(0xA2,0x7B,0x2510); # BOX DRAWINGS LIGHT DOWN AND LEFT
&add(0xA2,0x7C,0x2514); # BOX DRAWINGS LIGHT UP AND RIGHT
&add(0xA2,0x7D,0x2518); # BOX DRAWINGS LIGHT UP AND LEFT
# 0xA27E-0xA2A7:
# Duplicated maps with HKSCS:2001 0xF9E9-0xF9EB,0xF9F9-0xF9FD
# These Unicode code points already have a reverse mapping at this point,
# so the Big5-ETen codes are registered with dup(): forward direction
# only, leaving the existing reverse mapping untouched.
&dup(0xA2,0x7E,0x256D); # BOX DRAWINGS LIGHT ARC DOWN AND RIGHT
&dup(0xA2,0xA1,0x256E); # BOX DRAWINGS LIGHT ARC DOWN AND LEFT
&dup(0xA2,0xA2,0x2570); # BOX DRAWINGS LIGHT ARC UP AND RIGHT
&dup(0xA2,0xA3,0x256F); # BOX DRAWINGS LIGHT ARC UP AND LEFT
&dup(0xA2,0xA4,0x2550); # BOX DRAWINGS DOUBLE HORIZONTAL
&dup(0xA2,0xA5,0x255E); # BOX DRAWINGS VERTICAL SINGLE AND RIGHT DOUBLE
&dup(0xA2,0xA6,0x256A); # BOX DRAWINGS VERTICAL SINGLE AND HORIZONTAL DOUBLE
&dup(0xA2,0xA7,0x2561); # BOX DRAWINGS VERTICAL SINGLE AND LEFT DOUBLE
&add(0xA2,0xA8,0x25E2); # BLACK LOWER RIGHT TRIANGLE
&add(0xA2,0xA9,0x25E3); # BLACK LOWER LEFT TRIANGLE
&add(0xA2,0xAA,0x25E5); # BLACK UPPER RIGHT TRIANGLE
&add(0xA2,0xAB,0x25E4); # BLACK UPPER LEFT TRIANGLE
&add(0xA2,0xAC,0x2571); # BOX DRAWINGS LIGHT DIAGONAL UPPER RIGHT TO LOWER LEFT
&add(0xA2,0xAD,0x2572); # BOX DRAWINGS LIGHT DIAGONAL UPPER LEFT TO LOWER RIGHT
&add(0xA2,0xAE,0x2573); # BOX DRAWINGS LIGHT DIAGONAL CROSS
# Contiguous runs: consecutive Big5 trail bytes that map onto consecutive
# Unicode code points.  Each entry is
#   [ lead byte, first trail byte, last trail byte, first code point ]
# and the runs are registered strictly in the order listed.
#
# 0xA2CC-0xA2CE:
# unified to CJK by original Big5 (and causes duplicated mapping) but they
# might be mapped to same block as preceding code points (HANGZHOU NUMERAL).
for my $run (
    [0xA2, 0xAF, 0xB8, 0xFF10],    # FULLWIDTH DIGIT ZERO..NINE
    [0xA2, 0xB9, 0xC2, 0x2160],    # ROMAN NUMERAL ONE..TEN
    [0xA2, 0xC3, 0xCB, 0x3021],    # HANGZHOU NUMERAL ONE..NINE
    [0xA2, 0xCC, 0xCE, 0x3038],    # HANGZHOU NUMERAL TEN, TWENTY, THIRTY
    [0xA2, 0xCF, 0xE8, 0xFF21],    # FULLWIDTH LATIN CAPITAL LETTER A..Z
    [0xA2, 0xE9, 0xFE, 0xFF41],    # FULLWIDTH LATIN SMALL LETTER A..V
    [0xA3, 0x40, 0x43, 0xFF57],    # FULLWIDTH LATIN SMALL LETTER W..Z
) {
    my ($lead, $first_trail, $last_trail, $first_uc) = @$run;
    for my $trail ($first_trail .. $last_trail) {
        add($lead, $trail, $first_uc + $trail - $first_trail);
    }
}
# Greek letters, lead byte 0xA3.  Both alphabets are contiguous in Big5 but
# have a one-code-point hole in Unicode between RHO and SIGMA (U+03A2 is
# not used for a capital letter and U+03C2 is not mapped), so each
# alphabet is split into two runs of
#   [ first trail byte, last trail byte, first code point ].
for my $run (
    [0x44, 0x54, 0x0391],    # GREEK CAPITAL LETTER ALPHA..RHO
    [0x55, 0x5B, 0x03A3],    # GREEK CAPITAL LETTER SIGMA..OMEGA
    [0x5C, 0x6C, 0x03B1],    # GREEK SMALL LETTER ALPHA..RHO
    [0x6D, 0x73, 0x03C3],    # GREEK SMALL LETTER SIGMA..OMEGA
) {
    my ($first_trail, $last_trail, $first_uc) = @$run;
    for my $trail ($first_trail .. $last_trail) {
        add(0xA3, $trail, $first_uc + ($trail - $first_trail));
    }
}
# BOPOMOFO LETTER
# The letters occupy trail bytes 0x74-0x7E and 0xA1-0xBA but form one
# unbroken Unicode run starting at U+3105, so a single running counter
# covers both trail-byte ranges.
my $bopomofo_uc = 0x3105;
for my $trail (0x74 .. 0x7E, 0xA1 .. 0xBA) {
    add(0xA3, $trail, $bopomofo_uc++);
}
# Tone marks that follow the Bopomofo letters: [ trail byte, code point ].
for my $tone (
    [0xBB, 0x02D9],    # DOT ABOVE
    [0xBC, 0x02C9],    # MODIFIER LETTER MACRON
    [0xBD, 0x02CA],    # MODIFIER LETTER ACUTE ACCENT
    [0xBE, 0x02C7],    # CARON
    [0xBF, 0x02CB],    # MODIFIER LETTER GRAVE ACCENT
) {
    add(0xA3, $tone->[0], $tone->[1]);
}
# 0xA3E1:
# found in some vendor codepages for Big5 (e.g. CP950).
add(0xA3, 0xE1, 0x20AC);    # EURO SIGN
# 0xC6BF-0xC6D7:
# Some implementations for Big5 (non-HKSCS) remove these code points
# and Big5-HKSCS:2001 assigns other CJK characters.
# So this range will not be mapped for Big5-ETen.
# 0xC8A5-0xC8CC: ETen input codes; not mapped.
# Following characters are not included in the HKSCS. However,
# the code points are reserved as compatibility points for backward
# compatibility.
# cf. http://www.info.gov.hk/digital21/chi/hkscs/download/big5cmp.txt
#
# Each line aliases one compatibility code point (first two arguments) to
# whatever Unicode code point the forward table holds for another Big5
# code (the $fwd{lead}{trail} lookup).  The lookup is evaluated when the
# call runs, so it sees everything registered above.  dup() dies if the
# looked-up value is empty or has no reverse mapping.  The trailing 1
# selects the HKSCS lead-byte range check in dup().
&dup(0x8E,0x69,$fwd{0xBA}{0xE6},1);
&dup(0x8E,0x6F,$fwd{0xED}{0xCA},1);
&dup(0x8E,0x7E,$fwd{0xA2}{0x61},1);
&dup(0x8E,0xAB,$fwd{0xBA}{0xFC},1);
&dup(0x8E,0xB4,$fwd{0xBF}{0xA6},1);
&dup(0x8E,0xCD,$fwd{0xAA}{0xCC},1);
&dup(0x8E,0xD0,$fwd{0xBF}{0xAE},1);
&dup(0x8F,0x57,$fwd{0xB5}{0xD7},1);
&dup(0x8F,0x69,$fwd{0xE3}{0xC8},1);
&dup(0x8F,0x6E,$fwd{0xDB}{0x79},1);
&dup(0x8F,0xCB,$fwd{0xBF}{0xCC},1);
&dup(0x8F,0xCC,$fwd{0xA0}{0xD4},1);
&dup(0x8F,0xFE,$fwd{0xB0}{0x5F},1);
&dup(0x90,0x6D,$fwd{0xB3}{0xA3},1);
&dup(0x90,0x7A,$fwd{0xF9}{0xD7},1);
&dup(0x90,0xDC,$fwd{0xC0}{0x52},1);
&dup(0x90,0xF1,$fwd{0xC5}{0x54},1);
&dup(0x91,0xBF,$fwd{0xF1}{0xE3},1);
&dup(0x92,0x44,$fwd{0x92}{0x42},1);
&dup(0x92,0xAF,$fwd{0xA2}{0x59},1);
&dup(0x92,0xB0,$fwd{0xA2}{0x5A},1);
&dup(0x92,0xB1,$fwd{0xA2}{0x5C},1);
&dup(0x92,0xB2,$fwd{0xA2}{0x5B},1);
&dup(0x92,0xC8,$fwd{0xA0}{0x5F},1);
&dup(0x92,0xD1,$fwd{0xE6}{0xAB},1);
&dup(0x94,0x47,$fwd{0xD2}{0x56},1);
&dup(0x94,0xCA,$fwd{0xE6}{0xD0},1);
&dup(0x95,0xD9,$fwd{0xCA}{0x52},1);
&dup(0x96,0x44,$fwd{0x9C}{0xE4},1);
&dup(0x96,0xED,$fwd{0x96}{0xEE},1);
&dup(0x96,0xFC,$fwd{0xE9}{0x59},1);
&dup(0x9B,0x76,$fwd{0xEF}{0xF9},1);
&dup(0x9B,0x78,$fwd{0xC5}{0xF7},1);
&dup(0x9B,0x7B,$fwd{0xF5}{0xE8},1);
&dup(0x9B,0xC6,$fwd{0xE8}{0xCD},1);
&dup(0x9B,0xDE,$fwd{0xD0}{0xC0},1);
&dup(0x9B,0xEC,$fwd{0xFD}{0x64},1);
&dup(0x9B,0xF6,$fwd{0xBF}{0x47},1);
&dup(0x9C,0x42,$fwd{0xEB}{0xC9},1);
&dup(0x9C,0x53,$fwd{0xCD}{0xE7},1);
&dup(0x9C,0x62,$fwd{0xC0}{0xE7},1);
&dup(0x9C,0x68,$fwd{0xDC}{0x52},1);
&dup(0x9C,0x6B,$fwd{0xF8}{0x6D},1);
&dup(0x9C,0x77,$fwd{0xDB}{0x5D},1);
&dup(0x9C,0xBC,$fwd{0xC9}{0x5C},1);
&dup(0x9C,0xBD,$fwd{0xAF}{0xB0},1);
&dup(0x9C,0xD0,$fwd{0xD4}{0xD1},1);
&dup(0x9D,0x57,$fwd{0xE0}{0x7C},1);
&dup(0x9D,0x5A,$fwd{0xB5}{0xAE},1);
&dup(0x9D,0xC4,$fwd{0xA9}{0xE4},1);
&dup(0x9E,0xA9,$fwd{0xAB}{0xEC},1);
&dup(0x9E,0xEF,$fwd{0xDE}{0xCD},1);
&dup(0x9E,0xFD,$fwd{0xC9}{0xFC},1);
&dup(0x9F,0x60,$fwd{0xF9}{0xC4},1);
&dup(0x9F,0x66,$fwd{0x91}{0xBE},1);
&dup(0x9F,0xCB,$fwd{0xB9}{0xB0},1);
&dup(0x9F,0xD8,$fwd{0x93}{0x61},1);
&dup(0xA0,0x63,$fwd{0x8F}{0xB6},1);
&dup(0xA0,0x77,$fwd{0xA9}{0xF0},1);
&dup(0xA0,0xD5,$fwd{0x94}{0x7A},1);
&dup(0xA0,0xDF,$fwd{0xDE}{0x72},1);
&dup(0xA0,0xE4,$fwd{0x94}{0x55},1);
&dup(0xFA,0x5F,$fwd{0xAD}{0xC5},1);
&dup(0xFA,0x66,$fwd{0xB0}{0xB0},1);
&dup(0xFA,0xBD,$fwd{0xA5}{0x5D},1);
&dup(0xFA,0xC5,$fwd{0xA2}{0xCD},1);
&dup(0xFA,0xD5,$fwd{0xAD}{0xEB},1);
&dup(0xFB,0x48,$fwd{0x9D}{0xEF},1);
&dup(0xFB,0xB8,$fwd{0xB4}{0x40},1);
&dup(0xFB,0xF3,$fwd{0xC9}{0xDB},1);
&dup(0xFB,0xF9,$fwd{0x9D}{0xFB},1);
&dup(0xFC,0x4F,$fwd{0xD8}{0xF4},1);
&dup(0xFC,0x6C,$fwd{0xA0}{0xDC},1);
&dup(0xFC,0xB9,$fwd{0xBC}{0xB5},1);
&dup(0xFC,0xE2,$fwd{0xB4}{0xB8},1);
&dup(0xFC,0xF1,$fwd{0xA7}{0xFB},1);
&dup(0xFD,0xB7,$fwd{0xCB}{0x58},1);
&dup(0xFD,0xB8,$fwd{0xB4}{0xFC},1);
&dup(0xFD,0xBB,$fwd{0xB4}{0xE4},1);
&dup(0xFD,0xF1,$fwd{0xB5}{0x4E},1);
&dup(0xFE,0x52,$fwd{0x99}{0x75},1);
&dup(0xFE,0x6F,$fwd{0xB7}{0xEC},1);
&dup(0xFE,0xAA,$fwd{0xA2}{0x60},1);
&dup(0xFE,0xDD,$fwd{0xCF}{0xF1},1);
# add(CODEH, CODEL, UNICODE [, ISHKSCS])
#
# Registers a one-to-one mapping between the Big5 code CODEH*256+CODEL and
# the code point UNICODE:
#   - forward:  $fwd{CODEH}{CODEL} = UNICODE
#   - reverse:  "UNICODE CODE" is appended to bucket UNICODE % $revhash of
#               @rev, and $revmap{UNICODE} = CODE
#
# CODEH must lie in 0xA1-0xF9, or in 0x88-0xFE when ISHKSCS is true;
# CODEL must lie in 0x40-0x7F or 0xA1-0xFE.  Dies on an out-of-range byte,
# on an empty UNICODE, when UNICODE already has a reverse mapping (use
# dup() for those) and when the Big5 code is already mapped.
sub add {
    # Lexicals instead of local(): nothing outside this sub needs to see
    # the arguments through package variables.
    my ($codeh, $codel, $unicode, $ishkscs) = @_;
    my $code = $codeh*256+$codel;

    if ($ishkscs) {
        die sprintf("0x%04X",$code) if $codeh < 0x88 || $codeh > 0xFE;
    } else {
        die sprintf("0x%04X",$code) if $codeh < 161 || $codeh > 249;
    }
    die sprintf("0x%04X: trail byte out of range\n",$code)
        if $codel < 64 || ($codel >= 128 && $codel < 161) || $codel >= 255;
    die sprintf("0x%04X: no Unicode code point given\n",$code)
        unless $unicode;
    die sprintf("0x%04X->U+%04X is duplicated with 0x%04X. use dup()\n",$code,$unicode,$revmap{$unicode})
        if defined $revmap{$unicode};
    die sprintf("0x%04X->U+%04X is already mapped to U+%04X\n",$code,$unicode,$fwd{$codeh}{$codel})
        if defined $fwd{$codeh}{$codel};

    # The row hash and the bucket array spring into existence on first use.
    $fwd{$codeh}{$codel} = $unicode;
    push @{ $rev[$unicode % $revhash] }, "$unicode $code";
    $revmap{$unicode} = $code;
}
# dup(CODEH, CODEL, UNICODE [, ISHKSCS])
#
# Registers an additional Big5 code for a code point that already has a
# reverse mapping: only the forward entry $fwd{CODEH}{CODEL} = UNICODE is
# written; @rev and %revmap are left alone.  An existing forward entry
# for the same Big5 code is overwritten without complaint.
#
# The byte range checks are the same as in add().  Dies on an out-of-range
# byte, on an empty UNICODE, and when UNICODE has no reverse mapping yet
# (use add() for those).
sub dup {
    # Lexicals instead of local(); the reverse-hash bucket is not needed
    # here because the reverse tables are not modified.
    my ($codeh, $codel, $unicode, $ishkscs) = @_;
    my $code = $codeh*256+$codel;

    if ($ishkscs) {
        die sprintf("0x%04X",$code) if $codeh < 0x88 || $codeh > 0xFE;
    } else {
        die sprintf("0x%04X",$code) if $codeh < 161 || $codeh > 249;
    }
    die sprintf("0x%04X: trail byte out of range\n",$code)
        if $codel < 64 || ($codel >= 128 && $codel < 161) || $codel >= 255;
    die sprintf("0x%04X: no Unicode code point given\n",$code)
        unless $unicode;
    # "0x" prefix added so the message matches the ones produced by add().
    die sprintf("0x%04X->U+%04X is not duplicated. use add()\n",$code,$unicode)
        if ! defined $revmap{$unicode};

    $fwd{$codeh}{$codel} = $unicode;
}
# Add maps for CJK compatibility ideographs of Unicode.
# The %compat_* tables are not defined in this file (presumably they come
# from cjkcompat.pl).
add_cjkcompat(%compat_ksx1001);
####&add_cjkcompat(%compat_big5);
add_cjkcompat(%compat_ibm32);
add_cjkcompat(%compat_jisx0213);
####&add_cjkcompat(%compat_cns11643);

# add_cjkcompat(KEY => VALUE, ...)
#
# For every pair whose VALUE already has a reverse mapping in %revmap, adds
# a reverse-only entry "KEY CODE" to the @rev bucket of KEY, where CODE is
# the Big5 code VALUE maps to.  Pairs whose VALUE is unmapped are skipped.
# %fwd and %revmap are not modified.
#
# Keys are processed in ascending numeric order so that the order of
# entries inside a bucket, and hence the generated header, does not depend
# on Perl's hash ordering.
sub add_cjkcompat {
    my (%compat) = @_;
    foreach my $compat_uc (sort { $a <=> $b } keys %compat) {
        my $target_uc = $compat{$compat_uc};
        next unless defined $revmap{$target_uc};
        push @{ $rev[$compat_uc % $revhash] },
            "$compat_uc $revmap{$target_uc}";
    }
}
# File header of the generated C source.
print '
/*
** Copyright 2000 Double Precision, Inc.
** See COPYING for distribution information.
**
** $Id: big5.pl,v 1.4 2004/02/08 04:59:14 mrsam Exp $
** Non-hanzi and ETen / HKSCS extension support
** by Hatuka*nezumi - IKEDA Soji <nezumi@jca.apc.org>.
*/
#include "unicode.h"
';
# Forward (Big5 -> Unicode) tables: two arrays per lead byte, one for trail
# bytes 0x40-0x7F ("lo", 64 cells) and one for 0xA1-0xFE ("hi", 94 cells).
# Unmapped cells come out as 0.  A line break is emitted in front of every
# cell whose trail byte is a multiple of 16.
for my $lead (sort keys %fwd)
{
    my $row = $fwd{$lead};

    printf ("static const unicode_char big5_%02x_lo[64]={", $lead);
    print join(",",
        map { (($_ % 16) ? "" : "\n") . sprintf("%d", $row->{$_}) }
            64 .. 127);
    print "};\n";

    printf ("static const unicode_char big5_%02x_hi[94]={\n", $lead);
    print join(",",
        map { (($_ % 16) ? "" : "\n") . sprintf("%d", $row->{$_}) }
            161 .. 254);
    print "};\n";
}
# Reverse (Unicode -> Big5) tables.
#
# The buckets of @rev are flattened, in bucket order, into two parallel
# arrays: big5_revtable_uc holds the Unicode half of every "UNICODE CODE"
# entry and big5_revtable_octets the Big5 half.  big5_revtable_index has
# one cell per bucket giving the position of the bucket's first entry in
# those arrays.
print "static const unsigned big5_revhash_size=$revhash;
static const unicode_char big5_revtable_uc[]={\n";

my (@revtable_uc, @revtable_octets);
for my $bucket (0 .. $revhash - 1)
{
    $revindex[$bucket] = scalar @revtable_uc;
    # A bucket that never received an entry is undefined; treat it as
    # empty rather than dereferencing undef.
    for my $entry (@{ $rev[$bucket] || [] })
    {
        my ($uc, $octets) = split / /, $entry, 2;
        push @revtable_uc, $uc;
        push @revtable_octets, $octets;
    }
}

# Prints the given cells comma-separated, starting a new line in front of
# every 16th cell (but not in front of the very first one).
my $print_cells = sub {
    my @cells = @_;
    for my $n (0 .. $#cells)
    {
        print "," if $n > 0;
        print "\n" if $n && ($n % 16) == 0;
        print $cells[$n];
    }
};

$print_cells->(@revtable_uc);
print "};\nstatic const unsigned big5_revtable_octets[]={\n";
$print_cells->(@revtable_octets);
print "};\nstatic const unsigned big5_revtable_index[]={\n";
$print_cells->(@revindex[0 .. $revhash - 1]);
print "};\n";