Viewing file:
re.C (14.35 KB) -rw-rw-rw-Select action/file-type:

(
+) |

(
+) |

(
+) |
Code (
+) |
Session (
+) |

(
+) |
SDB (
+) |

(
+) |

(
+) |

(
+) |

(
+) |

(
+) |
#include "config.h"
#include "re.h"
#include "mio.h"
#include "regexpnode.h"
#include "rematch.h"
#include "funcs.h"
#include "buffer.h"
#include <ctype.h>
//////////////////////////////////////////////////////////////////////////////
//
// Create sets for the [:is....:] codes.
//
static void mk_alnum(unsigned char *p)
{
register unsigned i;
for (i=0; i<256; i++)
if (isalnum(i))
p[i/8] |= 1 << (i % 8);
}
static void mk_alpha(unsigned char *p)
{
register unsigned i;
for (i=0; i<256; i++)
if (isalpha(i))
p[i/8] |= 1 << (i % 8);
}
static void mk_cntrl(unsigned char *p)
{
register unsigned i;
for (i=0; i<256; i++)
if (iscntrl(i))
p[i/8] |= 1 << (i % 8);
}
static void mk_digit(unsigned char *p)
{
register unsigned i;
for (i=0; i<256; i++)
if (isdigit(i))
p[i/8] |= 1 << (i % 8);
}
static void mk_graph(unsigned char *p)
{
register unsigned i;
for (i=0; i<256; i++)
if (isgraph(i))
p[i/8] |= 1 << (i % 8);
}
static void mk_lower(unsigned char *p)
{
register unsigned i;
for (i=0; i<256; i++)
if (islower(i))
p[i/8] |= 1 << (i % 8);
}
static void mk_print(unsigned char *p)
{
register unsigned i;
for (i=0; i<256; i++)
if (isprint(i))
p[i/8] |= 1 << (i % 8);
}
static void mk_punct(unsigned char *p)
{
register unsigned i;
for (i=0; i<256; i++)
if (ispunct(i))
p[i/8] |= 1 << (i % 8);
}
static void mk_space(unsigned char *p)
{
register unsigned i;
for (i=0; i<256; i++)
if (isspace(i))
p[i/8] |= 1 << (i % 8);
}
static void mk_upper(unsigned char *p)
{
register unsigned i;
for (i=0; i<256; i++)
if (isupper(i))
p[i/8] |= 1 << (i % 8);
}
static void mk_xdigit(unsigned char *p)
{
register unsigned i;
for (i=0; i<256; i++)
if (isxdigit(i))
p[i/8] |= 1 << (i % 8);
}
static void mk_wbreak(unsigned char *p)
{
register unsigned i;
for (i=0; i<256; i++)
if (!isalnum(i) && i != '_')
p[i/8] |= 1 << (i % 8);
}
static const char *const is_setname[]={
":alnum:",
":alpha:",
":cntrl:",
":digit:",
":graph:",
":lower:",
":print:",
":punct:",
":space:",
":upper:",
":xdigit:",
":wbreak:"};
static void (*is_setfunc[])(unsigned char *)={
mk_alnum,
mk_alpha,
mk_cntrl,
mk_digit,
mk_graph,
mk_lower,
mk_print,
mk_punct,
mk_space,
mk_upper,
mk_xdigit,
mk_wbreak};
Re::Re() : chainedre(0), prevre(0), nodes(0), first(0), isCaret(0)
{
}
Re::~Re()
{
init();
}
void Re::init()
{
if (chainedre) delete chainedre;
chainedre=0;
RegExpNode *n;
while ((n=nodes) != 0)
{
nodes=n->next;
delete n;
}
}
inline RegExpNode *Re::allocnode()
{
RegExpNode *n;
if ((n=new RegExpNode(nextid++)) == 0)
outofmem();
n->next=nodes; nodes=n; return(n);
}
int Re::Compile(const char *ptr, int caseflag, int &errindex)
{
if (*ptr == '^')
{
if (CompileS(ptr+1, caseflag, errindex)) return (-1);
isCaret=1;
return (0);
}
if (CompileS("[.\n]*", 1, errindex) < 0) return (-1);
if ((chainedre=new Re) == 0)
outofmem();
isDummy=1;
chainedre->prevre=this;
return (chainedre->CompileS(ptr, caseflag, errindex));
}
int Re::CompileS(const char *ptr, int caseflag, int &errindex)
{
expr=ptr;
origexpr=expr;
init();
nextid=0;
first=0;
isCaret=0;
isDummy=0;
casesensitive=caseflag;
matchFull=0;
int rc=0;
try
{
RegExpNode **p=CompileOrClause(&first);
if (*expr == '!')
{
int dummy;
++expr;
if ((chainedre=new Re) == 0)
outofmem();
if ( chainedre->CompileS(expr, caseflag, dummy) < 0)
{
expr += dummy;
throw -1;
}
chainedre->prevre=this;
if (VerboseLevel() > 7)
merr.write("\n*** CHAINED TO ***\n");
} else if (curchar()) throw -1;
final=*p=allocnode();
final->thechar=REFINAL;
}
catch (int n)
{
init();
errindex=expr-origexpr;
rc= n;
}
if (rc == 0 && VerboseLevel() > 7)
{
RegExpNode *n;
Buffer b;
if (first)
{
b="Start node: ";
b.append( (unsigned long)first->id );
b += "\n\n";
b += '\0';
merr.write(b);
}
for (n=nodes; n; n=n->next)
{
b="Node ";
b.append( (unsigned long)n->id );
b += ": ";
switch (n->thechar) {
case RENULL:
b += "null";
break;
case RESET:
b += "[set] ";
{
int i,j=0;
for (i=0; i<256; i=j)
{
j=i+1;
if ((n->reset[i/8] &
(1 << (i % 8))) == 0)
continue;
for (j=i; j<256; j++)
if ((n->reset[j/8] &
(1 << (j % 8)))
== 0)
break;
if (i < ' ' || i > 127)
{
b += '#';
b.append((unsigned long)
i);
}
else
{
if (i == '#'
|| i == '-'
|| i == '\\')
b += '\\';
b += (char)i;
}
if (i+1 == j) continue;
b += ('-');
--j;
}
}
break;
case REFINAL:
b += "final";
break;
default:
if (n->thechar >= ' ' && n->thechar < 127)
{
b += '\'';
b += (char)n->thechar;
b += '\'';
}
else
{
b += "chr(";
b.append((unsigned long)n->thechar);
b += ')';
}
}
b += '\n';
b += '\0';
merr.write( b );
if (n->next1)
{
b=" transition to ";
b.append((unsigned long)n->next1->id);
b += '\n';
b += '\0';
merr.write(b);
}
if (n->next2)
{
b=" transition to ";
b.append((unsigned long)n->next2->id);
b += '\n';
b += '\0';
merr.write(b);
}
merr.write("\n");
}
}
return (rc);
}
RegExpNode **Re::CompileOrClause(RegExpNode **ptr)
{
RegExpNode **finish=CompileAtomString(ptr);
if ( curchar() != '|') return (finish);
RegExpNode *realfinish=allocnode();
realfinish->thechar=RENULL;
*finish=realfinish;
while ( curchar() == '|' )
{
nextchar();
RegExpNode *newstart=allocnode();
newstart->thechar=RENULL;
newstart->next1= *ptr;
*ptr=newstart;
finish=CompileAtomString(&newstart->next2);
*finish=realfinish;
}
return (&realfinish->next1);
}
RegExpNode **Re::CompileAtomString(RegExpNode **ptr)
{
int c;
for (;;)
{
c=curchar();
if (c == 0 || c == '|' || c == ')' || c == '!')
break;
ptr=CompileElement(ptr);
}
return (ptr);
}
RegExpNode **Re::CompileElement(RegExpNode **start)
{
RegExpNode **finish;
if (curchar() != '$')
{
finish=CompileAtom(start);
}
else
{
nextchar();
if (curchar() == 0)
{
matchFull=1;
return (start);
}
(*start)=allocnode();
(*start)->thechar='$';
finish= & (*start)->next1;
}
switch (curchar()) {
case '+':
(*finish)=allocnode();
(*finish)->thechar=RENULL;
(*finish)->next1=(*start);
finish= &(*finish)->next2;
nextchar();
break;
case '*':
(*finish)=allocnode();
(*finish)->thechar=RENULL;
(*finish)->next1=(*start);
(*start)=(*finish);
finish= &(*finish)->next2;
nextchar();
break;
case '?':
{
RegExpNode *newstart=allocnode();
newstart->thechar=RENULL;
(*finish)=allocnode();
(*finish)->thechar=RENULL;
newstart->next1= *start;
newstart->next2= *finish;
*start=newstart;
finish= &(*finish)->next1;
nextchar();
}
break;
}
return (finish);
}
RegExpNode **Re::CompileAtom(RegExpNode **ptr)
{
int c=curchar();
if (c == '(') // Subexpression
{
nextchar();
ptr=CompileOrClause(ptr);
if ( curchar() != ')') throw -1;
nextchar();
return (ptr);
}
(*ptr)=allocnode();
if (c == '[' || c == '.')
{
int i, complement=0;
if ( ((*ptr)->reset=new unsigned char[256/8]) == 0)
outofmem();
for (i=0; i<256/8; i++)
(*ptr)->reset[i]=0;
if ( c == '.' )
{
(*ptr)->reset[ '\n' / 8 ] |= 1 << ('\n' % 8);
complement=1;
}
else
{
nextchar();
if ( curchar() == '^')
{
complement=1;
nextchar();
}
is_sets(*ptr);
}
nextchar();
if (complement)
for (i=0; i<256/8; i++)
(*ptr)->reset[i] ^= ~0;
c=RESET;
}
else c=parsechar();
(*ptr)->thechar=c;
return (&(*ptr)->next1);
}
void Re::is_sets(RegExpNode *p)
{
Buffer buf;
int c=curchar();
int call_parsechar=1;
if (c == ':')
{
do
{
buf += c;
nextchar();
} while ( (c=curchar()) >= 0 && isalpha(c));
if (c == ':')
{
buf += c;
nextchar();
c=curchar();
if (c == ']')
{
buf += '\0';
const char *q=(const char *)buf;
unsigned i;
for (i=0; i<sizeof(is_setname)/
sizeof(is_setname[0]); i++)
{
if (strcmp(is_setname[i], q) == 0)
{
(*is_setfunc[i])(p->reset);
return;
}
}
}
}
int i=0;
for (i=0; i<buf.Length(); i++)
{
c=(int)(unsigned char)((const char*)buf)[i];
p->reset[ c / 8 ] |= 1 << (c % 8);
}
// In case the next character is '-', leave 'c' the way it
// is.
call_parsechar=0;
if (curchar() == ']') return;
}
do
{
int c2;
if (c == 0) throw -1;
if (c == '.')
{
for (c2=0; c2 < 256/8; c2++)
if (c2 != '\n' / 8)
p->reset[c2]= ~0;
else
p->reset[c2] |= ~(1 << ('\n' % 8));
if (call_parsechar)
nextchar();
call_parsechar=1;
continue;
}
if (call_parsechar)
c=parsechar();
c2=c;
call_parsechar=1;
if (curchar() == '-')
{
nextchar();
c2=parsechar();
}
while ( c <= c2 )
{
p->reset[ c / 8 ] |= 1 << (c % 8);
++c;
}
} while ((c=curchar()) != ']');
}
int Re::parsechar()
{
int c;
c=curchar();
if (c == 0) throw -1;
nextchar();
if (c != '\\') return (c);
c=curchar();
if (c == 0)
throw -1;
else if (c >= '0' && c <= '7')
{
unsigned char uc=0;
while ( c >= '0' && c <= '7' )
{
uc = uc * 8 + (c-'0');
nextchar();
c=curchar();
}
c=uc;
}
else
{
c=backslash_char(c);
nextchar();
}
return (c);
}
/////////////////////////////////////////////////////////////////////////////
int Re::Match(ReMatch &string)
{
matched= -1;
matchedpos=0;
charsmatched=0;
state1.init(nextid);
state2.init(nextid);
curstate= &state1;
nextstate= &state2;
curstate->nodes[0]=first;
curstate->numnodes=1;
curstate->nodenums[first->id]=0;
final_id=final->id;
if (VerboseLevel() > 8)
{
merr.write("*** MATCH START ***\n");
}
for (;;)
{
// Compute null closure
unsigned n;
for (n=0; n<curstate->numnodes; n++)
{
RegExpNode *p=curstate->nodes[n];
if (p->thechar != RENULL) continue;
RegExpNode *q=p->next1;
if (q && curstate->nodenums[q->id] != charsmatched)
{
curstate->nodes[curstate->numnodes++]=q;
curstate->nodenums[q->id]=charsmatched;
if (VerboseLevel() > 8)
{
Buffer b;
b=" Transition to state ";
b.append((unsigned long)q->id);
b += '\n';
b += '\0';
merr.write(b);
}
}
q=p->next2;
if (q && curstate->nodenums[q->id] != charsmatched)
{
curstate->nodes[curstate->numnodes++]=q;
curstate->nodenums[q->id]=charsmatched;
if (VerboseLevel() > 8)
{
Buffer b;
b=" Transition to state ";
b.append((unsigned long)q->id);
b += '\n';
b += '\0';
merr.write(b);
}
}
}
int nextChar;
if (curstate->nodenums[final_id] == charsmatched)
{
off_t pos=string.GetCurrentPos();
if (VerboseLevel() > 8)
merr.write("**Final node.\n");
if (chainedre)
{
unsigned long saved_matched_chainedre=
chainedre->charsmatched;
// On subsequent passes, charsmatched gets
// reset. If, previously, we had a match,
// don't forget # of characters matched!
if (VerboseLevel() > 8)
merr.write(
"**Final node - checking subexpr.\n");
if (chainedre->Match(string) == 0)
{
if (VerboseLevel() > 8)
{
Buffer buf;
buf="**Subexpr matched after ";
buf.append( (unsigned long)
charsmatched);
buf += " characters.\n";
buf += '\0';
merr.write(buf);
}
matched=0;
matchedpos=charsmatched;
if (isDummy) // Don't need to
// look for max matches
// for the dummy block
{
return (0);
}
}
else
{
if (VerboseLevel() > 8)
merr.write(
"**Subexpr didn't match.\n");
chainedre->charsmatched=
saved_matched_chainedre;
}
string.SetCurrentPos(pos);
nextChar=string.NextChar();
}
else
{
if (!matchFull) // We don't need to match full
// string.
{
if (VerboseLevel() > 8)
{
Buffer buf;
buf="Matched ";
buf.append( (unsigned long)
charsmatched);
buf += " characters.\n";
buf += '\0';
merr.write(buf);
}
matched=0;
matchedpos=charsmatched;
}
nextChar=string.NextChar();
if ( nextChar < 0)
{
if (VerboseLevel() > 8)
{
Buffer buf;
buf="Matched ";
buf.append( (unsigned long)
charsmatched);
buf += " characters.\n";
buf += '\0';
merr.write(buf);
}
return (0); // Matched everything
}
}
}
else nextChar=string.NextChar();
if (nextChar < 0)
{
if (VerboseLevel() > 8)
merr.write(
"Failed - End of matching string.\n");
charsmatched=matchedpos;
return (matched);
}
if (curstate->numnodes == 0) // No sense to continue
{
if (VerboseLevel() > 8)
merr.write(
"Failed - out of states.\n");
charsmatched=matchedpos;
return (matched);
}
if (VerboseLevel() > 8)
{
Buffer b;
b="Matching character: ";
if (nextChar <= ' ' || nextChar > 127)
{
b += '#';
b.append((unsigned long)nextChar);
}
else b += (char)nextChar;
b += '\n';
b += '\0';
merr.write(b);
}
++charsmatched;
if (!casesensitive)
nextChar=tolower(nextChar);
nextstate->numnodes=0;
for (n=0; n<curstate->numnodes; n++)
{
RegExpNode *p=curstate->nodes[n];
if (p->thechar == RESET)
{
if ((p->reset[nextChar / 8] &
(1 << (nextChar % 8))) == 0)
{
if (casesensitive) continue;
int uchar=toupper(nextChar);
if ((p->reset[uchar / 8] &
(1 << (uchar % 8))) == 0)
continue;
}
}
else
{
if (p->thechar != nextChar)
{
if (casesensitive) continue;
int uchar=toupper(nextChar);
if (p->thechar != uchar)
continue;
}
}
RegExpNode *q=p->next1;
if (q && nextstate->nodenums[q->id] != charsmatched)
{
nextstate->nodes[nextstate->numnodes++]=q;
nextstate->nodenums[q->id]=charsmatched;
if (VerboseLevel() > 8)
{
Buffer b;
b=" Transition to state ";
b.append((unsigned long)q->id);
b += '\n';
b += '\0';
merr.write(b);
}
}
q=p->next2;
if (q && nextstate->nodenums[q->id] != charsmatched)
{
nextstate->nodes[nextstate->numnodes++]=q;
nextstate->nodenums[q->id]=charsmatched;
if (VerboseLevel() > 8)
{
Buffer b;
b=" Transition to state ";
b.append((unsigned long)q->id);
b += '\n';
b += '\0';
merr.write(b);
}
}
}
ReEval *swap=curstate; curstate=nextstate; nextstate=swap;
}
}