/* unicode_wordbreak.c */
/*
** Copyright 2011 Double Precision, Inc.
** See COPYING for distribution information.
**
*/
#include "unicode_config.h"
#include "unicode.h"
#include <unistd.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include "wordbreaktab_internal.h"
#include "wordbreaktab.h"
/*
** State of one word-break scan.  unicode_wb_init() allocates it zero-filled
** and unicode_wb_end() frees it.
*/
struct unicode_wb_info {
/* Callback invoked as cb_func(flag, cb_arg) by wb4() and result() */
int (*cb_func)(int, void *);
void *cb_arg;
/*
** Class of the preceding character.  wb1and2_done() leaves it unchanged
** when the current character is Extend or Format (WB4).
*/
uint8_t prevclass;
/*
** Number of Extend/Format characters whose flag has not been reported
** yet; wb4() reports a 0 flag for each of them.
*/
size_t wb4_cnt;
/*
** Extend/Format characters counted by seen_wb67_handler() and
** seen_wb1112_handler() while they wait for the next significant character.
*/
size_t wb4_extra_cnt;
/* Handler that unicode_wb_next() calls with the next character's class */
int (*next_handler)(unicode_wb_info_t, uint8_t);
/* Handler that unicode_wb_end() calls; when NULL it calls wb4() instead */
int (*end_handler)(unicode_wb_info_t);
};
/*
** Forward declarations.  The functions taking (handle, class) are installed
** as next_handler, the *_end_handler functions as end_handler; the *_done
** functions take the previous class as an explicit argument.
*/
static int sot(unicode_wb_info_t i, uint8_t cl);
static int wb4(unicode_wb_info_t i);
static int wb1and2_done(unicode_wb_info_t i, uint8_t cl);
static int seen_wb67_handler(unicode_wb_info_t i, uint8_t cl);
static int seen_wb67_end_handler(unicode_wb_info_t i);
static int wb67_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl);
static int seen_wb1112_handler(unicode_wb_info_t i, uint8_t cl);
static int seen_wb1112_end_handler(unicode_wb_info_t i);
static int wb1112_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl);
unicode_wb_info_t unicode_wb_init(int (*cb_func)(int, void *),
void *cb_arg)
{
unicode_wb_info_t i=calloc(1, sizeof(struct unicode_wb_info));
if (!i)
return NULL;
i->next_handler=sot;
i->cb_func=cb_func;
i->cb_arg=cb_arg;
return i;
}
/*
** Finish a scan and free the handle.
**
** Runs the installed end handler when there is one; otherwise wb4() reports
** the Extend/Format characters still pending.  The handle is freed in either
** case and must not be used afterwards.
**
** Returns the value produced by the end handler or by wb4().
*/
int unicode_wb_end(unicode_wb_info_t i)
{
	int rc = i->end_handler ? i->end_handler(i) : wb4(i);

	free(i);
	return rc;
}
/*
** Feed cnt characters from chars to unicode_wb_next(), in order.
**
** Stops at the first non-zero return value from unicode_wb_next() and
** returns it; the remaining characters are not processed.  Returns 0 when
** every character was accepted, including when cnt is 0.
*/
int unicode_wb_next_cnt(unicode_wb_info_t i,
			const unicode_char *chars,
			size_t cnt)
{
	size_t pos;

	for (pos = 0; pos < cnt; ++pos)
	{
		int rc = unicode_wb_next(i, chars[pos]);

		if (rc != 0)
			return rc;
	}

	return 0;
}
/*
** Process one character.
**
** The character's word-break class is obtained from unicode_tab_lookup()
** using the index/range/class tables, with UNICODE_WB_OTHER passed as the
** last argument (presumably the class used when ch is not in the tables --
** confirm against unicode_tab_lookup()).  The class is then handed to the
** currently installed next_handler, whose return value is returned.
*/
int unicode_wb_next(unicode_wb_info_t i, unicode_char ch)
{
	const size_t index_cnt =
		sizeof(unicode_indextab) / sizeof(unicode_indextab[0]);
	const uint8_t cl = unicode_tab_lookup(ch,
					      unicode_indextab,
					      index_cnt,
					      unicode_rangetab,
					      unicode_classtab,
					      UNICODE_WB_OTHER);

	return i->next_handler(i, cl);
}
/*
** Report the pending Extend/Format characters (WB4).
**
** Calls the callback with a 0 flag once per counted character.  After the
** callback returns non-zero it is not called again, but the counter is still
** drained to zero.
**
** Returns 0, or the first non-zero value returned by the callback.
*/
static int wb4(unicode_wb_info_t i)
{
	int status = 0;

	while (i->wb4_cnt > 0)
	{
		/* The counter is decremented before the callback runs */
		i->wb4_cnt--;

		if (status != 0)
			continue;

		status = i->cb_func(0, i->cb_arg);
	}

	return status;
}
/*
** Report flag for a character, after wb4() has reported the pending
** Extend/Format characters.
**
** When wb4() fails the callback is not invoked for flag.  Returns the
** non-zero value from wb4(), otherwise the callback's return value.
*/
static int result(unicode_wb_info_t i, int flag)
{
	int rc = wb4(i);

	if (rc != 0)
		return rc;

	return i->cb_func(flag, i->cb_arg);
}
/*
** Install the handler for the next character (next) and the handler that
** unicode_wb_end() runs (end).  The expansion refers to a variable named
** "i", which must be in scope wherever the macro is used.
*/
#define SET_HANDLER(next,end) (i->next_handler=next, i->end_handler=end)
/*
** Handler for the first character (start of text).
**
** Switches to the default handler, records cl as the previous class, and
** reports a flag of 1 for this character.
**
** Returns the value of result().
*/
static int sot(unicode_wb_info_t i, uint8_t cl)
{
	SET_HANDLER(wb1and2_done, NULL);
	i->prevclass = cl;

	/* WB1 */
	return result(i, 1);
}
/*
** Default handler, used once the first character has been processed.
**
** Applies WB3 through WB5 to the pair (i->prevclass, cl).  An ALetter
** followed by MidLetter or MidNumLet is not reported yet: the decision is
** deferred to seen_wb67_handler().  Every other pair goes to wb67_done().
**
** Returns 0 when nothing was reported, otherwise the value of result() or
** wb67_done().
*/
static int wb1and2_done(unicode_wb_info_t i, uint8_t cl)
{
	const uint8_t before = i->prevclass;
	const int before_is_newline =
		before == UNICODE_WB_CR ||
		before == UNICODE_WB_LF ||
		before == UNICODE_WB_Newline;
	const int cl_is_newline =
		cl == UNICODE_WB_CR ||
		cl == UNICODE_WB_LF ||
		cl == UNICODE_WB_Newline;

	i->prevclass = cl;

	if (before == UNICODE_WB_CR && cl == UNICODE_WB_LF)
		return result(i, 0); /* WB3 */

	if (before_is_newline || cl_is_newline)
		return result(i, 1); /* WB3a, WB3b */

	if (cl == UNICODE_WB_Extend || cl == UNICODE_WB_Format)
	{
		/* WB4: count the character, keep the earlier class */
		i->prevclass = before;
		i->wb4_cnt++;
		return 0;
	}

	if (before == UNICODE_WB_ALetter)
	{
		if (cl == UNICODE_WB_ALetter)
			return result(i, 0); /* WB5 */

		if (cl == UNICODE_WB_MidLetter || cl == UNICODE_WB_MidNumLet)
		{
			/* WB6/WB7 depend on the character after this one */
			i->wb4_extra_cnt = 0;
			SET_HANDLER(seen_wb67_handler, seen_wb67_end_handler);
			return 0;
		}
	}

	return wb67_done(i, before, cl);
}
/*
** State: ALetter (MidLetter | MidNumLet) ?
**
** wb1and2_done() installs this handler after an ALetter followed by a
** MidLetter or MidNumLet.  i->prevclass holds the class of that second
** character, whose flag has not been reported yet; cl is the class of the
** character that follows it.
**
** Returns 0 when nothing was reported, otherwise the value propagated from
** result(), wb67_done() or the handler the current character is passed to.
*/
static int seen_wb67_handler(unicode_wb_info_t i, uint8_t cl)
{
int rc;
uint8_t prevclass;
size_t extra_cnt;
/*
** Extend/Format characters after the second character are only counted
** here; they are reported once the pending decision has been made.
*/
if (cl == UNICODE_WB_Extend || cl == UNICODE_WB_Format)
{
++i->wb4_extra_cnt;
return 0;
}
extra_cnt=i->wb4_extra_cnt;
/*
** Reset the handler to the default, then check WB6
*/
SET_HANDLER(wb1and2_done, NULL);
if (cl == UNICODE_WB_ALetter)
{
rc=result(i, 0); /* WB6 */
/*
** Queue the counted Extend/Format characters, so that the next result()
** reports them before the current character.
*/
i->wb4_cnt=extra_cnt;
if (rc == 0)
rc=result(i, 0); /* WB7 */
i->prevclass=cl;
return rc;
}
prevclass=i->prevclass; /* This was the second character */
/*
** WB6 and WB7 do not apply.  Report the second character through
** wb67_done(), which continues with WB8, as a character that follows an
** ALetter.
*/
rc=wb67_done(i, UNICODE_WB_ALetter, prevclass);
i->prevclass=prevclass;
i->wb4_cnt=extra_cnt;
/*
** Process the current char now, with whichever handler is installed at
** this point.
*/
if (rc == 0)
rc=(*i->next_handler)(i, cl);
return rc;
}
/*
** End handler for the state ALetter (MidLetter | MidNumLet): the second
** character's flag has not been reported yet, and the end of the text has
** been reached.
**
** Reports the second character through wb67_done() as a character that
** follows an ALetter, then reports the Extend/Format characters that were
** counted after it.
**
** Returns the non-zero value from wb67_done(), otherwise the value of wb4().
*/
static int seen_wb67_end_handler(unicode_wb_info_t i)
{
	const size_t pending = i->wb4_extra_cnt;
	const int rc = wb67_done(i, UNICODE_WB_ALetter, i->prevclass);

	i->wb4_cnt = pending;

	return rc != 0 ? rc : wb4(i);
}
/*
** Apply WB8, WB9 and WB10 to the pair (prevclass, cl).
**
** A Numeric followed by MidNum or MidNumLet is not reported yet: the
** decision is deferred to seen_wb1112_handler().  Every other pair goes to
** wb1112_done().
**
** Returns 0 when nothing was reported, otherwise the value of result() or
** wb1112_done().
*/
static int wb67_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl)
{
	if (prevclass == UNICODE_WB_Numeric)
	{
		if (cl == UNICODE_WB_Numeric)
			return result(i, 0); /* WB8 */

		if (cl == UNICODE_WB_ALetter)
			return result(i, 0); /* WB10 */

		if (cl == UNICODE_WB_MidNum || cl == UNICODE_WB_MidNumLet)
		{
			/* WB11/WB12 depend on the character after this one */
			i->wb4_extra_cnt = 0;
			SET_HANDLER(seen_wb1112_handler,
				    seen_wb1112_end_handler);
			return 0;
		}
	}
	else if (prevclass == UNICODE_WB_ALetter &&
		 cl == UNICODE_WB_Numeric)
	{
		return result(i, 0); /* WB9 */
	}

	return wb1112_done(i, prevclass, cl);
}
/*
** State: Numeric (MidNum | MidNumLet) ?
**
** wb67_done() installs this handler after a Numeric followed by a MidNum or
** MidNumLet.  i->prevclass holds the class of that second character, whose
** flag has not been reported yet; cl is the class of the character that
** follows it.
**
** Returns 0 when nothing was reported, otherwise the value propagated from
** result(), wb1112_done() or the handler the current character is passed to.
*/
static int seen_wb1112_handler(unicode_wb_info_t i, uint8_t cl)
{
int rc;
uint8_t prevclass;
size_t extra_cnt;
/*
** Extend/Format characters after the second character are only counted
** here; they are reported once the pending decision has been made.
*/
if (cl == UNICODE_WB_Extend || cl == UNICODE_WB_Format)
{
++i->wb4_extra_cnt;
return 0;
}
extra_cnt=i->wb4_extra_cnt;
/*
** Reset the handler to the default, then check WB11
*/
SET_HANDLER(wb1and2_done, NULL);
if (cl == UNICODE_WB_Numeric)
{
rc=result(i, 0); /* WB11 */
/*
** Queue the counted Extend/Format characters, so that the next result()
** reports them before the current character.
*/
i->wb4_cnt=extra_cnt;
if (rc == 0)
rc=result(i, 0); /* WB12 */
i->prevclass=cl;
return rc;
}
prevclass=i->prevclass; /* This was the second character */
/*
** WB11 and WB12 do not apply.  Report the second character through
** wb1112_done(), which continues with WB13, as a character that follows a
** Numeric.
*/
rc=wb1112_done(i, UNICODE_WB_Numeric, prevclass);
i->prevclass=prevclass;
i->wb4_cnt=extra_cnt;
/*
** Process the current char now, with whichever handler is installed at
** this point.
*/
if (rc == 0)
rc=(*i->next_handler)(i, cl);
return rc;
}
/*
** End handler for the state Numeric (MidNum | MidNumLet): the second
** character's flag has not been reported yet, and the end of the text has
** been reached.
**
** Reports the second character through wb1112_done() as a character that
** follows a Numeric, then reports the Extend/Format characters that were
** counted after it.
**
** Returns the non-zero value from wb1112_done(), otherwise the value of
** wb4().
*/
static int seen_wb1112_end_handler(unicode_wb_info_t i)
{
	const size_t pending = i->wb4_extra_cnt;
	const int rc = wb1112_done(i, UNICODE_WB_Numeric, i->prevclass);

	i->wb4_cnt = pending;

	return rc != 0 ? rc : wb4(i);
}
/*
** Apply WB13, WB13a and WB13b to the pair (prevclass, cl); any pair that
** matches none of them is reported with a flag of 1 (WB14).
**
** Returns the value of result().
*/
static int wb1112_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl)
{
	const int prev_is_word =
		prevclass == UNICODE_WB_ALetter ||
		prevclass == UNICODE_WB_Numeric ||
		prevclass == UNICODE_WB_Katakana;
	const int cl_is_word =
		cl == UNICODE_WB_ALetter ||
		cl == UNICODE_WB_Numeric ||
		cl == UNICODE_WB_Katakana;

	if (prevclass == UNICODE_WB_Katakana && cl == UNICODE_WB_Katakana)
		return result(i, 0); /* WB13 */

	if (cl == UNICODE_WB_ExtendNumLet &&
	    (prev_is_word || prevclass == UNICODE_WB_ExtendNumLet))
		return result(i, 0); /* WB13a */

	if (prevclass == UNICODE_WB_ExtendNumLet && cl_is_word)
		return result(i, 0); /* WB13b */

	return result(i, 1); /* WB14 */
}
/* --------------------------------------------------------------------- */
/*
** State of a scan built on top of the word-break scanner.
** unicode_wbscan_callback() maintains found and cnt.
*/
struct unicode_wbscan_info {
/* Underlying scanner, created by unicode_wb_init() */
unicode_wb_info_t wb_handle;
/* Set to 1 once a non-zero flag arrives after at least one counted flag */
int found;
/* Number of flags received before found was set */
size_t cnt;
};
/* Callback registered with the underlying scanner; defined below */
static int unicode_wbscan_callback(int, void *);
unicode_wbscan_info_t unicode_wbscan_init()
{
unicode_wbscan_info_t i=calloc(1, sizeof(struct unicode_wbscan_info));
if (!i)
return NULL;
if ((i->wb_handle=unicode_wb_init(unicode_wbscan_callback, i)) == NULL)
{
free(i);
return NULL;
}
return i;
}
/*
** Feed one character to the scan.
**
** Once found is set, further characters are ignored.  The return value of
** unicode_wb_next() is discarded.
**
** Returns the found flag: non-zero once the scan has found its break.
*/
int unicode_wbscan_next(unicode_wbscan_info_t i, unicode_char ch)
{
	if (i->found)
		return i->found;

	(void)unicode_wb_next(i->wb_handle, ch);

	return i->found;
}
/*
** Finish the scan and free the handle.
**
** The underlying scanner is ended first, because ending it can still invoke
** the callback and change the count.  The count is read afterwards, and the
** handle must not be used once this function returns.
**
** Returns the number of flags counted before the break was found.
*/
size_t unicode_wbscan_end(unicode_wbscan_info_t i)
{
	size_t counted;

	(void)unicode_wb_end(i->wb_handle);

	counted = i->cnt;
	free(i);

	return counted;
}
/*
** Callback registered with the underlying scanner; arg is the scan handle.
**
** A non-zero flag that arrives after at least one flag has been counted
** marks the scan as found.  Until then every flag is counted, so a non-zero
** flag arriving while the count is still 0 is counted like any other.
**
** Always returns 0.
*/
static int unicode_wbscan_callback(int flag, void *arg)
{
	unicode_wbscan_info_t scan = (unicode_wbscan_info_t)arg;

	if (flag && scan->cnt > 0)
	{
		scan->found = 1;
		return 0;
	}

	if (!scan->found)
		scan->cnt++;

	return 0;
}