Viewing file:
unicode_linebreak.c (13.01 KB) -rw-rw-rw-Select action/file-type:

(
+) |

(
+) |

(
+) |
Code (
+) |
Session (
+) |

(
+) |
SDB (
+) |

(
+) |

(
+) |

(
+) |

(
+) |

(
+) |
/*
** Copyright 2011 Double Precision, Inc.
** See COPYING for distribution information.
**
*/
#include "unicode_config.h"
#include "unicode.h"
#include <unistd.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include "linebreaktab_internal.h"
#include "linebreaktab.h"
/*
** Sentinel stored in prevclass/prevclass_nsp by unicode_lb_reset() to mark
** that no character has been processed yet (tested for rule LB2).
*/
#define UNICODE_LB_SOT 0xFF
/*
** State of one linebreak calculation.
*/
struct unicode_lb_info {
/* Callback that receives each linebreak value, together with cb_arg */
int (*cb_func)(int, void *);
void *cb_arg;
/* Option flags, set by unicode_lb_set_opts (UNICODE_LB_OPT_*) */
int opts;
/* Class of the (OP|HY) character withheld while LB25 looks ahead */
uint8_t savedclass;
/* Number of CM characters seen after the withheld character */
size_t savedcmcnt;
/* Linebreak class of the previous character */
uint8_t prevclass;
/* Linebreak class of the most recent character that was not SP */
uint8_t prevclass_nsp;
/* Current state: handler for the next character, and for end of input */
int (*next_handler)(struct unicode_lb_info *, uint8_t);
int (*end_handler)(struct unicode_lb_info *);
};
/* http://www.unicode.org/reports/tr14/#Algorithm */
/*
** Forward declarations of the state handlers that get installed in
** next_handler / end_handler. The next_* handlers receive the linebreak
** class of the next character; the end_* handlers run at end of input.
*/
static int next_def(unicode_lb_info_t, uint8_t);
static int end_def(unicode_lb_info_t);
static int next_lb25_seenophy(unicode_lb_info_t, uint8_t);
static int end_lb25_seenophy(unicode_lb_info_t);
static int next_lb25_seennu(unicode_lb_info_t, uint8_t);
static int next_lb25_seennuclcp(unicode_lb_info_t, uint8_t);
/*
** Put the state machine into its initial state: no previous character has
** been seen, and the default handlers are installed.
*/
static void unicode_lb_reset(unicode_lb_info_t i)
{
	i->next_handler = next_def;
	i->end_handler = end_def;

	i->prevclass_nsp = UNICODE_LB_SOT;
	i->prevclass = UNICODE_LB_SOT;
}
unicode_lb_info_t unicode_lb_init(int (*cb_func)(int, void *),
void *cb_arg)
{
unicode_lb_info_t i=calloc(1, sizeof(struct unicode_lb_info));
i->cb_func=cb_func;
i->cb_arg=cb_arg;
unicode_lb_reset(i);
return i;
}
/*
** Finish a linebreak calculation: run the current end-of-input handler,
** then release the handle. Returns whatever the end handler returned.
** The handle must not be used afterwards.
*/
int unicode_lb_end(unicode_lb_info_t i)
{
	int (*finish)(struct unicode_lb_info *) = i->end_handler;
	int status = finish(i);

	free(i);
	return status;
}
/*
** Replace the handle's option flags (UNICODE_LB_OPT_* bits) with opts.
*/
void unicode_lb_set_opts(unicode_lb_info_t i, int opts)
{
	struct unicode_lb_info *state = i;

	state->opts = opts;
}
/*
** Default end-of-input handler: there is no pending state to flush, so it
** always reports success. (LB3 N/A)
*/
static int end_def(unicode_lb_info_t i)
{
	(void)i;	/* nothing is withheld in the default state */

	return 0;
}
/*
** Report linebreak value x through the callback and yield the callback's
** return value. Expands to a use of a variable named i, which must be the
** handle in scope at the point of use.
*/
#define RESULT(x) (*i->cb_func)((x), i->cb_arg)
/*
** Feed cnt characters, in order, to unicode_lb_next().
**
** Stops at the first non-zero return from unicode_lb_next() and returns
** that value; returns 0 once every character has been processed.
*/
int unicode_lb_next_cnt(unicode_lb_info_t i,
			const unicode_char *chars,
			size_t cnt)
{
	size_t k;

	for (k = 0; k < cnt; ++k)
	{
		int status = unicode_lb_next(i, chars[k]);

		if (status != 0)
			return status;
	}

	return 0;
}
/*
** Look up the linebreak class of ch in the generated tables. The value
** passed as the final argument is UNICODE_LB_AL (XX, LB1).
*/
int unicode_lb_lookup(unicode_char ch)
{
	const size_t index_entries =
		sizeof(unicode_indextab) / sizeof(unicode_indextab[0]);

	return unicode_tab_lookup(ch,
				  unicode_indextab,
				  index_entries,
				  unicode_rangetab,
				  unicode_classtab,
				  UNICODE_LB_AL /* XX, LB1 */);
}
/*
** Process the next character: look up its linebreak class and hand it to
** whichever handler the state machine currently has installed. Returns the
** handler's return value.
*/
int unicode_lb_next(unicode_lb_info_t i,
		    unicode_char ch)
{
	int cls = unicode_lb_lookup(ch);

	return i->next_handler(i, cls);
}
/*
** Rule evaluation shared by next_def and the LB25 unwind code. A non-zero
** nolb25 suppresses the deferral of (PR|PO)(OP|HY) sequences.
*/
static int next_def_nolb25(unicode_lb_info_t i,
uint8_t uclass,
int nolb25);
/*
** Default handler for the next unicode character: evaluate the rules with
** the LB25 lookahead enabled.
*/
static int next_def(unicode_lb_info_t i,
		    uint8_t uclass)
{
	const int lb25_already_discarded = 0;

	return next_def_nolb25(i, uclass, lb25_already_discarded);
}
/*
** Evaluate the linebreak rules for the pair (previous class, uclass) and
** report the result for the current character through the callback
** (RESULT). The rules are tested in order; the first one that matches
** returns.
**
** Returns the callback's return value. Returns 0 without invoking the
** callback when a (PR|PO)(OP|HY) sequence is detected and nolb25 is zero:
** the decision is then deferred to next_lb25_seenophy.
*/
static int next_def_nolb25(unicode_lb_info_t i,
uint8_t uclass,
/* Flag -- recursively invoked after discarding LB25 */
int nolb25)
{
/* Retrieve the previous unicode character's linebreak class. */
uint8_t prevclass=i->prevclass;
uint8_t prevclass_nsp=i->prevclass_nsp;
/* Save this unicode char's linebreak class, for the next goaround */
i->prevclass=uclass;
if (uclass != UNICODE_LB_SP)
i->prevclass_nsp=uclass;
/* A number switches the state machine to the LB25 number handler */
if (uclass == UNICODE_LB_NU)
i->next_handler=next_lb25_seennu; /* LB25 */
/* First character since the state was reset */
if (prevclass == UNICODE_LB_SOT)
{
if (uclass == UNICODE_LB_CM) /* LB9 */
i->prevclass=i->prevclass_nsp=uclass=UNICODE_LB_AL;
return RESULT(UNICODE_LB_NONE); /* LB2 */
}
if (prevclass == UNICODE_LB_CR && uclass == UNICODE_LB_LF)
return RESULT(UNICODE_LB_NONE); /* LB5 */
switch (prevclass) {
case UNICODE_LB_BK:
case UNICODE_LB_CR:
case UNICODE_LB_LF:
case UNICODE_LB_NL:
if (uclass == UNICODE_LB_CM)
{
i->prevclass=i->prevclass_nsp=uclass=UNICODE_LB_AL;
/* LB9 */
}
return RESULT(UNICODE_LB_MANDATORY); /* LB4, LB5 */
case UNICODE_LB_SP:
case UNICODE_LB_ZW:
/* A CM with no base character to attach to is handled as AL */
if (uclass == UNICODE_LB_CM)
i->prevclass=i->prevclass_nsp=uclass=UNICODE_LB_AL;
/* LB10 */
break;
default:
break;
}
switch (uclass) {
/* LB6: */
case UNICODE_LB_BK:
case UNICODE_LB_CR:
case UNICODE_LB_LF:
case UNICODE_LB_NL:
/* LB7: */
case UNICODE_LB_SP:
case UNICODE_LB_ZW:
return RESULT(UNICODE_LB_NONE);
default:
break;
}
if (prevclass_nsp == UNICODE_LB_ZW)
return RESULT(UNICODE_LB_ALLOWED); /* LB8 */
if (uclass == UNICODE_LB_CM)
{
/* The CM is absorbed: restore the classes saved on entry */
i->prevclass=prevclass;
i->prevclass_nsp=prevclass_nsp;
return RESULT(UNICODE_LB_NONE); /* LB9 */
}
if (prevclass == UNICODE_LB_WJ || uclass == UNICODE_LB_WJ)
return RESULT(UNICODE_LB_NONE); /* LB11 */
if (prevclass == UNICODE_LB_GL)
return RESULT(UNICODE_LB_NONE); /* LB12 */
if (uclass == UNICODE_LB_GL &&
prevclass != UNICODE_LB_SP &&
prevclass != UNICODE_LB_BA &&
prevclass != UNICODE_LB_HY)
return RESULT(UNICODE_LB_NONE); /* LB12a */
switch (uclass) {
case UNICODE_LB_SY:
/* With the SYBREAK option, a break is allowed between SP and SY */
if (i->opts & UNICODE_LB_OPT_SYBREAK)
{
if (prevclass == UNICODE_LB_SP)
return RESULT(UNICODE_LB_ALLOWED);
}
/* fallthrough -- otherwise SY is handled like CL, CP, EX and IS */
case UNICODE_LB_CL:
case UNICODE_LB_CP:
case UNICODE_LB_EX:
case UNICODE_LB_IS:
return RESULT(UNICODE_LB_NONE); /* LB13 */
default:
break;
}
/* With the SYBREAK option, no break after SY before EX, AL or ID */
if ((i->opts & UNICODE_LB_OPT_SYBREAK) && prevclass == UNICODE_LB_SY)
switch (uclass) {
case UNICODE_LB_EX:
case UNICODE_LB_AL:
case UNICODE_LB_ID:
return RESULT(UNICODE_LB_NONE);
}
if (prevclass_nsp == UNICODE_LB_OP)
return RESULT(UNICODE_LB_NONE); /* LB14 */
if (prevclass_nsp == UNICODE_LB_QU && uclass == UNICODE_LB_OP)
return RESULT(UNICODE_LB_NONE); /* LB15 */
if ((prevclass_nsp == UNICODE_LB_CL || prevclass_nsp == UNICODE_LB_CP)
&& uclass == UNICODE_LB_NS)
return RESULT(UNICODE_LB_NONE); /* LB16 */
if (prevclass_nsp == UNICODE_LB_B2 && uclass == UNICODE_LB_B2)
return RESULT(UNICODE_LB_NONE); /* LB17 */
if (prevclass == UNICODE_LB_SP)
return RESULT(UNICODE_LB_ALLOWED); /* LB18 */
if (uclass == UNICODE_LB_QU || prevclass == UNICODE_LB_QU)
return RESULT(UNICODE_LB_NONE); /* LB19 */
if (uclass == UNICODE_LB_CB || prevclass == UNICODE_LB_CB)
return RESULT(UNICODE_LB_ALLOWED); /* LB20 */
/* LB21: */
switch (uclass) {
case UNICODE_LB_BA:
case UNICODE_LB_HY:
case UNICODE_LB_NS:
return RESULT(UNICODE_LB_NONE);
default:
break;
}
if (prevclass == UNICODE_LB_BB)
return RESULT(UNICODE_LB_NONE);
if (uclass == UNICODE_LB_IN)
switch (prevclass) {
case UNICODE_LB_AL:
case UNICODE_LB_ID:
case UNICODE_LB_IN:
case UNICODE_LB_NU:
return RESULT(UNICODE_LB_NONE); /* LB22 */
default:
break;
}
if (prevclass == UNICODE_LB_ID && uclass == UNICODE_LB_PO)
return RESULT(UNICODE_LB_NONE); /* LB23 */
if (prevclass == UNICODE_LB_AL && uclass == UNICODE_LB_NU)
return RESULT(UNICODE_LB_NONE); /* LB23 */
if (prevclass == UNICODE_LB_NU && uclass == UNICODE_LB_AL)
return RESULT(UNICODE_LB_NONE); /* LB23 */
if (prevclass == UNICODE_LB_PR && uclass == UNICODE_LB_ID)
return RESULT(UNICODE_LB_NONE); /* LB24 */
if (prevclass == UNICODE_LB_PR && uclass == UNICODE_LB_AL)
return RESULT(UNICODE_LB_NONE); /* LB24 */
if (prevclass == UNICODE_LB_PO && uclass == UNICODE_LB_AL)
return RESULT(UNICODE_LB_NONE); /* LB24 */
/* With the PRBREAK option, no break before PR after PR, AL or ID */
if ((i->opts & UNICODE_LB_OPT_PRBREAK) && uclass == UNICODE_LB_PR)
switch (prevclass) {
case UNICODE_LB_PR:
case UNICODE_LB_AL:
case UNICODE_LB_ID:
return RESULT(UNICODE_LB_NONE);
}
if (!nolb25 &&
(prevclass == UNICODE_LB_PR || prevclass == UNICODE_LB_PO))
{
if (uclass == UNICODE_LB_NU)
return RESULT(UNICODE_LB_NONE); /* LB25 */
if (uclass == UNICODE_LB_OP || uclass == UNICODE_LB_HY)
{
/*
** (PR|PO)(OP|HY): the result depends on whether a NU follows.
** Undo the state update made above, remember this character's
** class, switch to the lookahead handlers, and return without
** invoking the callback for this character.
*/
i->prevclass=prevclass;
i->prevclass_nsp=prevclass_nsp;
i->savedclass=uclass;
i->savedcmcnt=0;
i->next_handler=next_lb25_seenophy;
i->end_handler=end_lb25_seenophy;
return 0;
}
}
if ((prevclass == UNICODE_LB_OP || prevclass == UNICODE_LB_HY) &&
uclass == UNICODE_LB_NU)
return RESULT(UNICODE_LB_NONE); /* LB25 */
/*****/
if (prevclass == UNICODE_LB_JL)
switch (uclass) {
case UNICODE_LB_JL:
case UNICODE_LB_JV:
case UNICODE_LB_H2:
case UNICODE_LB_H3:
return RESULT(UNICODE_LB_NONE); /* LB26 */
default:
break;
}
if ((prevclass == UNICODE_LB_JV ||
prevclass == UNICODE_LB_H2) &&
(uclass == UNICODE_LB_JV ||
uclass == UNICODE_LB_JT))
return RESULT(UNICODE_LB_NONE); /* LB26 */
if ((prevclass == UNICODE_LB_JT ||
prevclass == UNICODE_LB_H3) &&
uclass == UNICODE_LB_JT)
return RESULT(UNICODE_LB_NONE); /* LB26 */
switch (prevclass) {
case UNICODE_LB_JL:
case UNICODE_LB_JV:
case UNICODE_LB_JT:
case UNICODE_LB_H2:
case UNICODE_LB_H3:
if (uclass == UNICODE_LB_IN || uclass == UNICODE_LB_PO)
return RESULT(UNICODE_LB_NONE); /* LB27 */
/* fallthrough -- no match, continue with the remaining rules */
default:
break;
}
switch (uclass) {
case UNICODE_LB_JL:
case UNICODE_LB_JV:
case UNICODE_LB_JT:
case UNICODE_LB_H2:
case UNICODE_LB_H3:
if (prevclass == UNICODE_LB_PR)
return RESULT(UNICODE_LB_NONE); /* LB27 */
/* fallthrough -- no match, continue with the remaining rules */
default:
break;
}
if (prevclass == UNICODE_LB_AL && uclass == UNICODE_LB_AL)
return RESULT(UNICODE_LB_NONE); /* LB28 */
if (prevclass == UNICODE_LB_IS && uclass == UNICODE_LB_AL)
return RESULT(UNICODE_LB_NONE); /* LB29 */
if ((prevclass == UNICODE_LB_AL || prevclass == UNICODE_LB_NU) &&
uclass == UNICODE_LB_OP)
return RESULT(UNICODE_LB_NONE); /* LB30 */
if ((uclass == UNICODE_LB_AL || uclass == UNICODE_LB_NU) &&
prevclass == UNICODE_LB_CP)
return RESULT(UNICODE_LB_NONE); /* LB30 */
return RESULT(UNICODE_LB_ALLOWED); /* LB31 */
}
/*
** Seen (PR|PO)(OP|HY), without returning the linebreak property for the second
** character, but NU did not follow. Backtrack.
**
** The default handlers are reinstated, then the withheld characters are
** replayed through next_def_nolb25: first the saved (OP|HY) class, with the
** LB25 deferral suppressed so that it is not withheld again, then one CM for
** every combining mark counted in savedcmcnt.
**
** Returns 0 on success, or the first non-zero value returned by the replay.
*/
static int unwind_lb25_seenophy(unicode_lb_info_t i)
{
	uint8_t replay_class = i->savedclass;
	int nolb25_flag = 1;

	i->next_handler = next_def;
	i->end_handler = end_def;

	do
	{
		/*
		** Replay the class of the character actually being fed back;
		** the original passed i->savedclass on every pass, leaving
		** the CM substitution below without effect.
		*/
		int rc = next_def_nolb25(i, replay_class, nolb25_flag);

		if (rc)
			return rc;

		/* Everything after the first withheld character is a CM */
		replay_class = UNICODE_LB_CM;
		nolb25_flag = 0;
	} while (i->savedcmcnt--);

	return 0;
}
/*
** State: a (PR|PO)(OP|HY) sequence has been seen, and the linebreak value of
** the second character is still being withheld. A NU at this point completes
** the modified LB25 regexp; anything else, other than CM, abandons it.
*/
static int next_lb25_seenophy(unicode_lb_info_t i,
			      uint8_t uclass)
{
	/* Combining marks are only counted; the decision is still pending */
	if (uclass == UNICODE_LB_CM)
	{
		++i->savedcmcnt;
		return 0;
	}

	if (uclass == UNICODE_LB_NU)
	{
		/*
		** Matched. Report "no break" for the withheld (OP|HY) and for
		** each combining mark that followed it.
		*/
		for (;;)
		{
			int status = RESULT(UNICODE_LB_NONE);

			if (status)
				return status;

			if (i->savedcmcnt-- == 0)
				break;
		}

		/* Carry on in the "seen a number" state */
		i->end_handler = end_def;
		i->next_handler = next_lb25_seennu;
		i->prevclass_nsp = uclass;
		i->prevclass = uclass;

		return RESULT(UNICODE_LB_NONE);
	}

	/* Not a number: replay what was withheld, then process this one */
	{
		int status = unwind_lb25_seenophy(i);

		if (status)
			return status;
	}

	return next_def_nolb25(i, uclass, 0);
}
/*
** End of input arrived while (PR|PO)(OP|HY) was still pending. Replay the
** withheld characters, then finish the way the default end handler does.
*/
static int end_lb25_seenophy(unicode_lb_info_t i)
{
	int status = unwind_lb25_seenophy(i);

	if (status != 0)
		return status;

	return end_def(i);
}
/*
** State: inside the number part of the modified LB25 regexp (an NU has been
** seen).
*/
static int next_lb25_seennu(unicode_lb_info_t i, uint8_t uclass)
{
	const int closes_number =
		uclass == UNICODE_LB_CL || uclass == UNICODE_LB_CP;
	const int is_affix =
		uclass == UNICODE_LB_PR || uclass == UNICODE_LB_PO;

	/* More of the number: stay in this state, no break */
	if (uclass == UNICODE_LB_NU || uclass == UNICODE_LB_SY ||
	    uclass == UNICODE_LB_IS)
	{
		i->prevclass_nsp = uclass;
		i->prevclass = uclass;
		return RESULT(UNICODE_LB_NONE);
	}

	/* LB9: a combining mark changes nothing */
	if (uclass == UNICODE_LB_CM)
		return RESULT(UNICODE_LB_NONE);

	/* Every other class leaves this state */
	i->next_handler = closes_number ? next_lb25_seennuclcp : next_def;
	i->end_handler = end_def;

	if (!closes_number && !is_affix)
		return next_def(i, uclass); /* Not a prefix, process normally */

	/* CL|CP, or a trailing PR|PO: no break before it */
	i->prevclass_nsp = uclass;
	i->prevclass = uclass;
	return RESULT(UNICODE_LB_NONE);
}
/*
** State: the number in the modified LB25 regexp was followed by CL|CP. Only
** a trailing PR|PO can still be part of the match.
*/
static int next_lb25_seennuclcp(unicode_lb_info_t i, uint8_t uclass)
{
	/* LB9: a combining mark changes nothing */
	if (uclass == UNICODE_LB_CM)
		return RESULT(UNICODE_LB_NONE);

	/* Whatever comes next, the default handlers take over afterwards */
	i->end_handler = end_def;
	i->next_handler = next_def;

	if (uclass != UNICODE_LB_PR && uclass != UNICODE_LB_PO)
		return next_def(i, uclass);

	i->prevclass_nsp = uclass;
	i->prevclass = uclass;
	return RESULT(UNICODE_LB_NONE);
}
/******************/
/*
** Wrapper around a linebreak handle whose callback also receives the
** character that each linebreak value belongs to.
*/
struct unicode_lbc_info {
/* Underlying linebreak handle, created by unicode_lb_init */
unicode_lb_info_t handle;
/* Characters passed to unicode_lbc_next, kept until they are reported */
struct unicode_buf buf;
/* Index into buf of the next character to pass to cb_func */
size_t buf_ptr;
/* Callback: linebreak value, the character, and cb_arg */
int (*cb_func)(int, unicode_char, void *);
void *cb_arg;
};
static int unicode_lbc_callback(int value, void *ptr)
{
unicode_lbc_info_t h=(unicode_lbc_info_t)ptr;
if (h->buf_ptr >= unicode_buf_len(&h->buf))
{
errno=EINVAL;
return -1; /* Shouldn't happen */
}
return (*h->cb_func)(value, unicode_buf_ptr(&h->buf)[h->buf_ptr++],
h->cb_arg);
}
/*
** Create a linebreak handle whose callback receives each character together
** with its linebreak value.
**
** Returns the new handle, or NULL if either allocation fails. The handle is
** released by unicode_lbc_end().
*/
unicode_lbc_info_t unicode_lbc_init(int (*cb_func)(int, unicode_char, void *),
				    void *cb_arg)
{
	unicode_lbc_info_t h = calloc(1, sizeof *h);

	if (h == NULL)
		return NULL;

	h->cb_arg = cb_arg;
	h->cb_func = cb_func;

	h->handle = unicode_lb_init(unicode_lbc_callback, h);

	if (h->handle == NULL)
	{
		free(h);
		return NULL;
	}

	/* NOTE(review): (size_t)-1 presumably means "no size limit" -- confirm */
	unicode_buf_init(&h->buf, (size_t)-1);
	return h;
}
/*
** Set the option flags on the underlying linebreak handle.
*/
void unicode_lbc_set_opts(unicode_lbc_info_t i, int opts)
{
	unicode_lb_info_t inner = i->handle;

	unicode_lb_set_opts(inner, opts);
}
/*
** Process the next character: remember it so the callback can be given the
** character alongside its linebreak value, then feed it to the underlying
** linebreak handle. Returns the value returned by unicode_lb_next().
*/
int unicode_lbc_next(unicode_lbc_info_t i, unicode_char ch)
{
	/* Every buffered character has been reported: start the buffer over */
	if (unicode_buf_len(&i->buf) <= i->buf_ptr)
	{
		unicode_buf_clear(&i->buf);
		i->buf_ptr = 0;
	}

	/*
	** NOTE(review): the result of unicode_buf_append is not checked --
	** confirm whether it can fail and should be reported to the caller.
	*/
	unicode_buf_append(&i->buf, &ch, 1);

	return unicode_lb_next(i->handle, ch);
}
/*
** Finish the calculation: end the underlying linebreak handle (which also
** releases it), release the character buffer, then free this handle.
** Returns the value returned by unicode_lb_end().
*/
int unicode_lbc_end(unicode_lbc_info_t i)
{
	const int status = unicode_lb_end(i->handle);

	unicode_buf_deinit(&i->buf);
	free(i);

	return status;
}