aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Ruslan Ermilov <ru@FreeBSD.org> 2003-07-30 06:47:03 +0000
committer Ruslan Ermilov <ru@FreeBSD.org> 2003-07-30 06:47:03 +0000
commit 88b8d48716b2c520d992a423aa9c57e19b09baef (patch)
tree cab265f99f6731de1c61b382e08586a6f6d799bc
parent fc6b1dfe9528e808b3a12b6c207857ebddb5057b (diff)
download src-88b8d48716b2c520d992a423aa9c57e19b09baef.tar.gz
src-88b8d48716b2c520d992a423aa9c57e19b09baef.zip
Vendor import of bwk's 29-Jul-2003 release.
Notes
Notes: svn path=/vendor/one-true-awk/dist/; revision=118194
-rw-r--r-- contrib/one-true-awk/FIXES 46
-rw-r--r-- contrib/one-true-awk/b.c 33
-rw-r--r-- contrib/one-true-awk/lex.c 2
-rw-r--r-- contrib/one-true-awk/main.c 7
-rw-r--r-- contrib/one-true-awk/run.c 2
5 files changed, 60 insertions, 30 deletions
diff --git a/contrib/one-true-awk/FIXES b/contrib/one-true-awk/FIXES
index bf9381b63098..296a2c941c99 100644
--- a/contrib/one-true-awk/FIXES
+++ b/contrib/one-true-awk/FIXES
@@ -25,6 +25,52 @@ THIS SOFTWARE.
This file lists all bug fixes, changes, etc., made since the AWK book
was sent to the printers in August, 1987.
+Jul 29, 2003:
+ fixed (i think) the long-standing botch that included the beginning of
+ line state ^ for RE's in the set of valid characters; this led to a
+ variety of odd problems, including failure to properly match certain
+ regular expressions in non-US locales. thanks to ruslan for keeping
+ at this one.
+
+Jul 28, 2003:
+ n-th try at getting internationalization right, with thanks to volker
+ kiefel, arnold robbins and ruslan ermilov for advice, though they
+ should not be blamed for the outcome. according to posix, "." is the
+ radix character in programs and command line arguments regardless of
+ the locale; otherwise, the locale should prevail for input and output
+ of numbers. so it's intended to work that way.
+
+ i have rescinded the attempt to use strcoll in expanding shorthands in
+ regular expressions (cclenter). its properties are much too
+ surprising; for example [a-c] matches aAbBc in locale en_US but abBcC
+ in locale fr_CA. i can see how this might arise by implementation
+ but i cannot explain it to a human user. (this behavior can be seen
+ in gawk as well; we're leaning on the same library.)
+
+ the issue appears to be that strcoll is meant for sorting, where
+ merging upper and lower case may make sense (though note that unix
+ sort does not do this by default either). it is not appropriate
+ for regular expressions, where the goal is to match specific
+ patterns of characters. in any case, the notations [:lower:], etc.,
+ are available in awk, and they are more likely to work correctly in
+ most locales.
+
+ a moratorium is hereby declared on internationalization changes.
+ i apologize to friends and colleagues in other parts of the world.
+ i would truly like to get this "right", but i don't know what
+ that is, and i do not want to keep making changes until it's clear.
+
+Jul 4, 2003:
+ fixed bug that permitted non-terminated RE, as in "awk /x".
+
+Jun 1, 2003:
+ subtle change to split: if source is empty, number of elems
+ is always 0 and the array is not set.
+
+Mar 21, 2003:
+ added some parens to isblank, in another attempt to make things
+ internationally portable.
+
Mar 14, 2003:
the internationalization changes, somewhat modified, are now
reinstated. in theory awk will now do character comparisons
diff --git a/contrib/one-true-awk/b.c b/contrib/one-true-awk/b.c
index df3aaa956e1d..0f949be572ce 100644
--- a/contrib/one-true-awk/b.c
+++ b/contrib/one-true-awk/b.c
@@ -33,7 +33,7 @@ THIS SOFTWARE.
#include "awk.h"
#include "ytab.h"
-#define HAT (NCHARS-2) /* matches ^ in regular expr */
+#define HAT (NCHARS+2) /* matches ^ in regular expr */
/* NCHARS is 2**n */
#define MAXLIN 22
@@ -282,24 +282,9 @@ int quoted(char **pp) /* pick up next thing after a \\ */
return c;
}
-static int collate_range_cmp(int a, int b)
-{
- int r;
- static char s[2][2];
-
- if ((uschar)a == (uschar)b)
- return 0;
- s[0][0] = a;
- s[1][0] = b;
- if ((r = strcoll(s[0], s[1])) == 0)
- r = (uschar)a - (uschar)b;
- return r;
-}
-
char *cclenter(const char *argp) /* add a character class */
{
int i, c, c2;
- int j;
uschar *p = (uschar *) argp;
uschar *op, *bp;
static uschar *buf = 0;
@@ -318,18 +303,15 @@ char *cclenter(const char *argp) /* add a character class */
c2 = *p++;
if (c2 == '\\')
c2 = quoted((char **) &p);
- if (collate_range_cmp(c, c2) > 0) { /* empty; ignore */
+ if (c > c2) { /* empty; ignore */
bp--;
i--;
continue;
}
- for (j = 0; j < NCHARS; j++) {
- if ((collate_range_cmp(c, j) > 0) ||
- collate_range_cmp(j, c2) > 0)
- continue;
+ while (c < c2) {
if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, 0))
FATAL("out of space for character class [%.10s...] 2", p);
- *bp++ = j;
+ *bp++ = ++c;
i++;
}
continue;
@@ -718,11 +700,14 @@ Node *unary(Node *np)
* system i use, it's defined here. if some other locale has a richer
* definition of "blank", define HAS_ISBLANK and provide your own
* version.
+ * the parentheses here are an attempt to find a path through the maze
+ * of macro definition and/or function and/or version provided. thanks
+ * to nelson beebe for the suggestion; let's see if it works everywhere.
*/
#ifndef HAS_ISBLANK
-int isblank(int c)
+int (isblank)(int c)
{
return c==' ' || c=='\t';
}
@@ -839,8 +824,6 @@ int cgoto(fa *f, int s, int c)
int i, j, k;
int *p, *q;
- if (c < 0 || c > 255)
- FATAL("can't happen: neg char %d in cgoto", c);
while (f->accept >= maxsetvec) { /* guessing here! */
maxsetvec *= 4;
setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
diff --git a/contrib/one-true-awk/lex.c b/contrib/one-true-awk/lex.c
index e4b1fd374412..39f5d4d305c2 100644
--- a/contrib/one-true-awk/lex.c
+++ b/contrib/one-true-awk/lex.c
@@ -529,6 +529,8 @@ int regexpr(void)
}
}
*bp = 0;
+ if (c == 0)
+ SYNTAX("non-terminated regular expression %.10s...", buf);
yylval.s = tostring(buf);
unput('/');
RET(REGEXPR);
diff --git a/contrib/one-true-awk/main.c b/contrib/one-true-awk/main.c
index df855ddbfd1a..6e6604e0cba4 100644
--- a/contrib/one-true-awk/main.c
+++ b/contrib/one-true-awk/main.c
@@ -22,7 +22,7 @@ ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
THIS SOFTWARE.
****************************************************************/
-const char *version = "version 20030314";
+const char *version = "version 20030729";
#define DEBUG
#include <stdio.h>
@@ -55,10 +55,8 @@ int main(int argc, char *argv[])
{
const char *fs = NULL;
- setlocale(LC_ALL, "");
- setlocale(LC_COLLATE, "");
setlocale(LC_CTYPE, "");
- setlocale(LC_MESSAGES, "");
+ setlocale(LC_NUMERIC, "C"); /* for parsing cmdline & prog */
cmdname = argv[0];
if (argc == 1) {
fprintf(stderr, "Usage: %s [-f programfile | 'program'] [-Ffieldsep] [-v var=value] [files]\n", cmdname);
@@ -147,6 +145,7 @@ int main(int argc, char *argv[])
if (!safe)
envinit(environ);
yyparse();
+ setlocale(LC_NUMERIC, ""); /* back to whatever it is locally */
if (fs)
*FS = qstring(fs, '\0');
dprintf( ("errorflag=%d\n", errorflag) );
diff --git a/contrib/one-true-awk/run.c b/contrib/one-true-awk/run.c
index 617ac7d822fd..066cb011cb3a 100644
--- a/contrib/one-true-awk/run.c
+++ b/contrib/one-true-awk/run.c
@@ -1221,7 +1221,7 @@ Cell *split(Node **a, int nnn) /* split(a[0], a[1], a[2]); a[3] is type */
ap->sval = (char *) makesymtab(NSYMTAB);
n = 0;
- if ((*s != '\0' && strlen(fs) > 1) || arg3type == REGEXPR) { /* reg expr */
+ if (*s != '\0' && (strlen(fs) > 1 || arg3type == REGEXPR)) { /* reg expr */
fa *pfa;
if (arg3type == REGEXPR) { /* it's ready already */
pfa = (fa *) a[2];