aboutsummaryrefslogtreecommitdiffstats
path: root/b.c
diff options
context:
space:
mode:
authorWarner Losh <imp@FreeBSD.org>2019-06-02 04:23:56 +0000
committerWarner Losh <imp@FreeBSD.org>2019-06-02 04:23:56 +0000
commit03ee4d05f1d963d60451e04ce505e4da116300db (patch)
tree9dcabc6cffafcb6b8195148feb519d575b0bf6ac /b.c
parent3a4488f93f2dc8fe9de757a418b74aa0aa4f9ed1 (diff)
downloadsrc-vendor/one-true-awk.tar.gz
src-vendor/one-true-awk.zip
Import latest one-true-awk from upstreamvendor/one-true-awk/4189ef5dvendor/one-true-awk
Import git hash 4189ef5d from https://github.com/onetrueawk/awk.git as there's not been a release in a while. Upstream one-true-awk woke-up! Time to catch up. This may also revert FreeBSD changes that we'd placed in the vendor branch in anticipation of their inclusion in upstream. That's not yet the case, and these will be resolved in the merge. See FIXES for a complete list of bugs fixed (starting with the Jun 7, 2018 entry).
Notes
Notes: svn path=/vendor/one-true-awk/dist/; revision=348505 svn path=/vendor/one-true-awk/4189ef5d/; revision=348506; tag=vendor/one-true-awk/4189ef5d
Diffstat (limited to 'b.c')
-rw-r--r--b.c291
1 files changed, 271 insertions, 20 deletions
diff --git a/b.c b/b.c
index 5ccb4b1e5d0f..37ea0a5bb2a7 100644
--- a/b.c
+++ b/b.c
@@ -27,6 +27,7 @@ THIS SOFTWARE.
#define DEBUG
#include <ctype.h>
+#include <limits.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
@@ -65,6 +66,11 @@ int rlxval;
static uschar *rlxstr;
static uschar *prestr; /* current position in current re */
static uschar *lastre; /* origin of last re */
+static uschar *lastatom; /* origin of last Atom */
+static uschar *starttok;
+static uschar *basestr; /* starts with original, replaced during
+ repetition processing */
+static uschar *firstbasestr;
static int setcnt;
static int poscnt;
@@ -82,11 +88,11 @@ fa *makedfa(const char *s, int anchor) /* returns dfa for reg expr s */
fa *pfa;
static int now = 1;
- if (setvec == NULL) { /* first time through any RE */
+ if (setvec == 0) { /* first time through any RE */
maxsetvec = MAXLIN;
setvec = (int *) malloc(maxsetvec * sizeof(int));
tmpset = (int *) malloc(maxsetvec * sizeof(int));
- if (setvec == NULL || tmpset == NULL)
+ if (setvec == 0 || tmpset == 0)
overflo("out of space initializing makedfa");
}
@@ -124,6 +130,8 @@ fa *mkdfa(const char *s, int anchor) /* does the real work of making a dfa */
Node *p, *p1;
fa *f;
+ firstbasestr = (uschar *) s;
+ basestr = firstbasestr;
p = reparse(s);
p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
/* put ALL STAR in front of reg. exp. */
@@ -137,7 +145,7 @@ fa *mkdfa(const char *s, int anchor) /* does the real work of making a dfa */
f->accept = poscnt-1; /* penter has computed number of positions in re */
cfoll(f, p1); /* set up follow sets */
freetr(p1);
- if ((f->posns[0] = (int *) calloc(*(f->re[0].lfollow), sizeof(int))) == NULL)
+ if ((f->posns[0] = (int *) calloc(1, *(f->re[0].lfollow)*sizeof(int))) == NULL)
overflo("out of space in makedfa");
if ((f->posns[1] = (int *) calloc(1, sizeof(int))) == NULL)
overflo("out of space in makedfa");
@@ -145,6 +153,10 @@ fa *mkdfa(const char *s, int anchor) /* does the real work of making a dfa */
f->initstat = makeinit(f, anchor);
f->anchor = anchor;
f->restr = (uschar *) tostring(s);
+ if (firstbasestr != basestr) {
+ if (basestr)
+ xfree(basestr);
+ }
return f;
}
@@ -157,7 +169,7 @@ int makeinit(fa *f, int anchor)
f->reset = 0;
k = *(f->re[0].lfollow);
xfree(f->posns[2]);
- if ((f->posns[2] = (int *) calloc(k+1, sizeof(int))) == NULL)
+ if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL)
overflo("out of space in makeinit");
for (i=0; i <= k; i++) {
(f->posns[2])[i] = (f->re[0].lfollow)[i];
@@ -290,11 +302,11 @@ char *cclenter(const char *argp) /* add a character class */
int i, c, c2;
uschar *p = (uschar *) argp;
uschar *op, *bp;
- static uschar *buf = NULL;
+ static uschar *buf = 0;
static int bufsz = 100;
op = p;
- if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
+ if (buf == 0 && (buf = (uschar *) malloc(bufsz)) == NULL)
FATAL("out of space for character class [%.10s...] 1", p);
bp = buf;
for (i = 0; (c = *p++) != 0; ) {
@@ -350,14 +362,14 @@ void cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfo
maxsetvec *= 4;
setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
- if (setvec == NULL || tmpset == NULL)
+ if (setvec == 0 || tmpset == 0)
overflo("out of space in cfoll()");
}
for (i = 0; i <= f->accept; i++)
setvec[i] = 0;
setcnt = 0;
follow(v); /* computes setvec and setcnt */
- if ((p = (int *) calloc(setcnt+1, sizeof(int))) == NULL)
+ if ((p = (int *) calloc(1, (setcnt+1)*sizeof(int))) == NULL)
overflo("out of space building follow set");
f->re[info(v)].lfollow = p;
*p = setcnt;
@@ -391,7 +403,7 @@ int first(Node *p) /* collects initially active leaves of p into setvec */
maxsetvec *= 4;
setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
- if (setvec == NULL || tmpset == NULL)
+ if (setvec == 0 || tmpset == 0)
overflo("out of space in first()");
}
if (type(p) == EMPTYRE) {
@@ -531,7 +543,7 @@ int pmatch(fa *f, const char *p0) /* longest match, for sub */
for (i = 2; i <= f->curstat; i++)
xfree(f->posns[i]);
k = *f->posns[0];
- if ((f->posns[2] = (int *) calloc(k+1, sizeof(int))) == NULL)
+ if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL)
overflo("out of space in pmatch");
for (i = 0; i <= k; i++)
(f->posns[2])[i] = (f->posns[0])[i];
@@ -588,7 +600,7 @@ int nematch(fa *f, const char *p0) /* non-empty match, for sub */
for (i = 2; i <= f->curstat; i++)
xfree(f->posns[i]);
k = *f->posns[0];
- if ((f->posns[2] = (int *) calloc(k+1, sizeof(int))) == NULL)
+ if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL)
overflo("out of state space");
for (i = 0; i <= k; i++)
(f->posns[2])[i] = (f->posns[0])[i];
@@ -628,9 +640,11 @@ Node *regexp(void) /* top-level parse of reg expr */
Node *primary(void)
{
Node *np;
+ int savelastatom;
switch (rtok) {
case CHAR:
+ lastatom = starttok;
np = op2(CHAR, NIL, itonp(rlxval));
rtok = relex();
return (unary(np));
@@ -639,16 +653,19 @@ Node *primary(void)
return (unary(op2(ALL, NIL, NIL)));
case EMPTYRE:
rtok = relex();
- return (unary(op2(ALL, NIL, NIL)));
+ return (unary(op2(EMPTYRE, NIL, NIL)));
case DOT:
+ lastatom = starttok;
rtok = relex();
return (unary(op2(DOT, NIL, NIL)));
case CCL:
np = op2(CCL, NIL, (Node*) cclenter((char *) rlxstr));
+ lastatom = starttok;
rtok = relex();
return (unary(np));
case NCCL:
np = op2(NCCL, NIL, (Node *) cclenter((char *) rlxstr));
+ lastatom = starttok;
rtok = relex();
return (unary(np));
case '^':
@@ -658,6 +675,8 @@ Node *primary(void)
rtok = relex();
return (unary(op2(CHAR, NIL, NIL)));
case '(':
+ lastatom = starttok;
+ savelastatom = starttok - basestr; /* Retain over recursion */
rtok = relex();
if (rtok == ')') { /* special pleading for () */
rtok = relex();
@@ -665,6 +684,7 @@ Node *primary(void)
}
np = regexp();
if (rtok == ')') {
+ lastatom = basestr + savelastatom; /* Restore */
rtok = relex();
return (unary(np));
}
@@ -679,8 +699,12 @@ Node *primary(void)
Node *concat(Node *np)
{
switch (rtok) {
- case CHAR: case DOT: case ALL: case EMPTYRE: case CCL: case NCCL: case '$': case '(':
+ case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
return (concat(op2(CAT, np, primary())));
+ case EMPTYRE:
+ rtok = relex();
+ return (concat(op2(CAT, op2(CCL, NIL, (Node *) tostring("")),
+ primary())));
}
return (np);
}
@@ -749,7 +773,7 @@ struct charclass {
{ "alnum", 5, isalnum },
{ "alpha", 5, isalpha },
#ifndef HAS_ISBLANK
- { "blank", 5, isspace }, /* was isblank */
+ { "blank", 5, xisblank },
#else
{ "blank", 5, isblank },
#endif
@@ -765,16 +789,132 @@ struct charclass {
{ NULL, 0, NULL },
};
+#define REPEAT_SIMPLE 0
+#define REPEAT_PLUS_APPENDED 1
+#define REPEAT_WITH_Q 2
+#define REPEAT_ZERO 3
+
+static int
+replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
+ int atomlen, int firstnum, int secondnum, int special_case)
+{
+ int i, j;
+ uschar *buf = 0;
+ int ret = 1;
+ int init_q = (firstnum==0); /* first added char will be ? */
+ int n_q_reps = secondnum-firstnum; /* m>n, so reduce until {1,m-n} left */
+ int prefix_length = reptok - basestr; /* prefix includes first rep */
+ int suffix_length = strlen((char *) reptok) - reptoklen; /* string after rep specifier */
+ int size = prefix_length + suffix_length;
+
+ if (firstnum > 1) { /* add room for reps 2 through firstnum */
+ size += atomlen*(firstnum-1);
+ }
+
+ /* Adjust size of buffer for special cases */
+ if (special_case == REPEAT_PLUS_APPENDED) {
+ size++; /* for the final + */
+ } else if (special_case == REPEAT_WITH_Q) {
+ size += init_q + (atomlen+1)* n_q_reps;
+ } else if (special_case == REPEAT_ZERO) {
+ size += 2; /* just a null ERE: () */
+ }
+ if ((buf = (uschar *) malloc(size+1)) == NULL)
+ FATAL("out of space in reg expr %.10s..", lastre);
+ memcpy(buf, basestr, prefix_length); /* copy prefix */
+ j = prefix_length;
+ if (special_case == REPEAT_ZERO) {
+ j -= atomlen;
+ buf[j++] = '(';
+ buf[j++] = ')';
+ }
+ for (i=1; i < firstnum; i++) { /* copy x reps */
+ memcpy(&buf[j], atom, atomlen);
+ j += atomlen;
+ }
+ if (special_case == REPEAT_PLUS_APPENDED) {
+ buf[j++] = '+';
+ } else if (special_case == REPEAT_WITH_Q) {
+ if (init_q) buf[j++] = '?';
+ for (i=0; i < n_q_reps; i++) { /* copy x? reps */
+ memcpy(&buf[j], atom, atomlen);
+ j += atomlen;
+ buf[j++] = '?';
+ }
+ }
+ memcpy(&buf[j], reptok+reptoklen, suffix_length);
+ if (special_case == REPEAT_ZERO) {
+ buf[j+suffix_length] = '\0';
+ } else {
+ buf[size] = '\0';
+ }
+ /* free old basestr */
+ if (firstbasestr != basestr) {
+ if (basestr)
+ xfree(basestr);
+ }
+ basestr = buf;
+ prestr = buf + prefix_length;
+ if (special_case == REPEAT_ZERO) {
+ prestr -= atomlen;
+ ret++;
+ }
+ return ret;
+}
+
+static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
+ int atomlen, int firstnum, int secondnum)
+{
+ /*
+ In general, the repetition specifier or "bound" is replaced here
+ by an equivalent ERE string, repeating the immediately previous atom
+ and appending ? and + as needed. Note that the first copy of the
+ atom is left in place, except in the special_case of a zero-repeat
+ (i.e., {0}).
+ */
+ if (secondnum < 0) { /* means {n,} -> repeat n-1 times followed by PLUS */
+ if (firstnum < 2) {
+ /* 0 or 1: should be handled before you get here */
+ FATAL("internal error");
+ } else {
+ return replace_repeat(reptok, reptoklen, atom, atomlen,
+ firstnum, secondnum, REPEAT_PLUS_APPENDED);
+ }
+ } else if (firstnum == secondnum) { /* {n} or {n,n} -> simply repeat n-1 times */
+ if (firstnum == 0) { /* {0} or {0,0} */
+ /* This case is unusual because the resulting
+ replacement string might actually be SMALLER than
+ the original ERE */
+ return replace_repeat(reptok, reptoklen, atom, atomlen,
+ firstnum, secondnum, REPEAT_ZERO);
+ } else { /* (firstnum >= 1) */
+ return replace_repeat(reptok, reptoklen, atom, atomlen,
+ firstnum, secondnum, REPEAT_SIMPLE);
+ }
+ } else if (firstnum < secondnum) { /* {n,m} -> repeat n-1 times then alternate */
+ /* x{n,m} => xx...x{1, m-n+1} => xx...x?x?x?..x? */
+ return replace_repeat(reptok, reptoklen, atom, atomlen,
+ firstnum, secondnum, REPEAT_WITH_Q);
+ } else { /* Error - shouldn't be here (n>m) */
+ FATAL("internal error");
+ }
+ return 0;
+}
int relex(void) /* lexical analyzer for reparse */
{
int c, n;
int cflag;
- static uschar *buf = NULL;
+ static uschar *buf = 0;
static int bufsz = 100;
uschar *bp;
struct charclass *cc;
int i;
+ int num, m, commafound, digitfound;
+ const uschar *startreptok;
+
+rescan:
+ starttok = prestr;
switch (c = *prestr++) {
case '|': return OR;
@@ -795,7 +935,7 @@ int relex(void) /* lexical analyzer for reparse */
rlxval = c;
return CHAR;
case '[':
- if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
+ if (buf == 0 && (buf = (uschar *) malloc(bufsz)) == NULL)
FATAL("out of space in reg expr %.10s..", lastre);
bp = buf;
if (*prestr == '^') {
@@ -823,7 +963,15 @@ int relex(void) /* lexical analyzer for reparse */
if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
prestr[2 + cc->cc_namelen] == ']') {
prestr += cc->cc_namelen + 3;
- for (i = 0; i < NCHARS; i++) {
+ /*
+ * BUG: We begin at 1, instead of 0, since we
+ * would otherwise prematurely terminate the
+ * string for classes like [[:cntrl:]]. This
+ * means that we can't match the NUL character,
+ * not without first adapting the entire
+ * program to track each string's length.
+ */
+ for (i = 1; i <= UCHAR_MAX; i++) {
if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, "relex2"))
FATAL("out of space for reg expr %.10s...", lastre);
if (cc->cc_func(i)) {
@@ -833,6 +981,40 @@ int relex(void) /* lexical analyzer for reparse */
}
} else
*bp++ = c;
+ } else if (c == '[' && *prestr == '.') {
+ char collate_char;
+ prestr++;
+ collate_char = *prestr++;
+ if (*prestr == '.' && prestr[1] == ']') {
+ prestr += 2;
+ /* Found it: map via locale TBD: for
+ now, simply return this char. This
+ is sufficient to pass conformance
+ test awk.ex 156
+ */
+ if (*prestr == ']') {
+ prestr++;
+ rlxval = collate_char;
+ return CHAR;
+ }
+ }
+ } else if (c == '[' && *prestr == '=') {
+ char equiv_char;
+ prestr++;
+ equiv_char = *prestr++;
+ if (*prestr == '=' && prestr[1] == ']') {
+ prestr += 2;
+ /* Found it: map via locale TBD: for now
+ simply return this char. This is
+ sufficient to pass conformance test
+ awk.ex 156
+ */
+ if (*prestr == ']') {
+ prestr++;
+ rlxval = equiv_char;
+ return CHAR;
+ }
+ }
} else if (c == '\0') {
FATAL("nonterminated character class %.20s", lastre);
} else if (bp == buf) { /* 1st char is special */
@@ -847,6 +1029,75 @@ int relex(void) /* lexical analyzer for reparse */
} else
*bp++ = c;
}
+ break;
+ case '{':
+ if (isdigit(*(prestr))) {
+ num = 0; /* Process as a repetition */
+ n = -1; m = -1;
+ commafound = 0;
+ digitfound = 0;
+ startreptok = prestr-1;
+ /* Remember start of previous atom here ? */
+ } else { /* just a { char, not a repetition */
+ rlxval = c;
+ return CHAR;
+ }
+ for (; ; ) {
+ if ((c = *prestr++) == '}') {
+ if (commafound) {
+ if (digitfound) { /* {n,m} */
+ m = num;
+ if (m<n)
+ FATAL("illegal repetition expression: class %.20s",
+ lastre);
+ if ((n==0) && (m==1)) {
+ return QUEST;
+ }
+ } else { /* {n,} */
+ if (n==0) return STAR;
+ if (n==1) return PLUS;
+ }
+ } else {
+ if (digitfound) { /* {n} same as {n,n} */
+ n = num;
+ m = num;
+ } else { /* {} */
+ FATAL("illegal repetition expression: class %.20s",
+ lastre);
+ }
+ }
+ if (repeat(starttok, prestr-starttok, lastatom,
+ startreptok - lastatom, n, m) > 0) {
+ if ((n==0) && (m==0)) {
+ return EMPTYRE;
+ }
+ /* must rescan input for next token */
+ goto rescan;
+ }
+ /* Failed to replace: eat up {...} characters
+ and treat like just PLUS */
+ return PLUS;
+ } else if (c == '\0') {
+ FATAL("nonterminated character class %.20s",
+ lastre);
+ } else if (isdigit(c)) {
+ num = 10 * num + c - '0';
+ digitfound = 1;
+ } else if (c == ',') {
+ if (commafound)
+ FATAL("illegal repetition expression: class %.20s",
+ lastre);
+ /* looking for {n,} or {n,m} */
+ commafound = 1;
+ n = num;
+ digitfound = 0; /* reset */
+ num = 0;
+ } else {
+ FATAL("illegal repetition expression: class %.20s",
+ lastre);
+ }
+ }
+ break;
}
}
@@ -860,7 +1111,7 @@ int cgoto(fa *f, int s, int c)
maxsetvec *= 4;
setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
- if (setvec == NULL || tmpset == NULL)
+ if (setvec == 0 || tmpset == 0)
overflo("out of space in cgoto()");
}
for (i = 0; i <= f->accept; i++)
@@ -882,7 +1133,7 @@ int cgoto(fa *f, int s, int c)
maxsetvec *= 4;
setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
- if (setvec == NULL || tmpset == NULL)
+ if (setvec == 0 || tmpset == 0)
overflo("cgoto overflow");
}
if (setvec[q[j]] == 0) {
@@ -925,7 +1176,7 @@ int cgoto(fa *f, int s, int c)
for (i = 0; i < NCHARS; i++)
f->gototab[f->curstat][i] = 0;
xfree(f->posns[f->curstat]);
- if ((p = (int *) calloc(setcnt+1, sizeof(int))) == NULL)
+ if ((p = (int *) calloc(1, (setcnt+1)*sizeof(int))) == NULL)
overflo("out of space in cgoto");
f->posns[f->curstat] = p;