From 112a84adaae2e69c27b00ace46a5456e8f120359 Mon Sep 17 00:00:00 2001 From: xiechengliang Date: Thu, 25 Nov 2021 20:31:24 +0800 Subject: [PATCH] fix CVE-2021-42378 CVE-2021-42379 CVE-2021-42380 CVE-2021-42381 CVE-2021-42382 CVE-2021-42383 CVE-2021-42384 CVE-2021-42385 and CVE-2021-42386 Signed-off-by: xiechengliang --- backport-fix-awk-cve.patch | 7363 ++++++++++++++++++++++++++++++++++++ busybox.spec | 9 +- 2 files changed, 7371 insertions(+), 1 deletion(-) create mode 100644 backport-fix-awk-cve.patch diff --git a/backport-fix-awk-cve.patch b/backport-fix-awk-cve.patch new file mode 100644 index 0000000..743f576 --- /dev/null +++ b/backport-fix-awk-cve.patch @@ -0,0 +1,7363 @@ +From aec213c228426fbad3cd9d4038dffecaf92947bf Mon Sep 17 00:00:00 2001 +From: Ron Yorston +Date: Wed, 27 Jan 2021 11:19:14 +0000 +Subject: [PATCH 01/61] awk: allow printf('%c') to output NUL, closes 13486 + +Treat the output of printf as binary rather than a null-terminated +string so that NUL characters can be output. + +This is considered to be a GNU extension, though it's also available +in mawk and FreeBSD's awk. 
+ +function old new delta +evaluate 3487 3504 +17 +awk_printf 504 519 +15 +------------------------------------------------------------------------------ +(add/remove: 0/0 grow/shrink: 2/0 up/down: 32/0) Total: 32 bytes + +Signed-off-by: Ron Yorston +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 18 +++++++++++++++--- + testsuite/awk.tests | 5 +++++ + 2 files changed, 20 insertions(+), 3 deletions(-) + +diff --git a/editors/awk.c b/editors/awk.c +index 2c15f9e4e..b4f6a3741 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -2155,7 +2155,10 @@ static int fmt_num(char *b, int size, const char *format, double n, int int_as_i + } + + /* formatted output into an allocated buffer, return ptr to buffer */ +-static char *awk_printf(node *n) ++#if !ENABLE_FEATURE_AWK_GNU_EXTENSIONS ++# define awk_printf(a, b) awk_printf(a) ++#endif ++static char *awk_printf(node *n, int *len) + { + char *b = NULL; + char *fmt, *s, *f; +@@ -2209,6 +2212,10 @@ static char *awk_printf(node *n) + nvfree(v); + b = xrealloc(b, i + 1); + b[i] = '\0'; ++#if ENABLE_FEATURE_AWK_GNU_EXTENSIONS ++ if (len) ++ *len = i; ++#endif + return b; + } + +@@ -2666,6 +2673,7 @@ static var *evaluate(node *op, var *res) + case XC( OC_PRINT ): + case XC( OC_PRINTF ): { + FILE *F = stdout; ++ IF_FEATURE_AWK_GNU_EXTENSIONS(int len;) + + if (op->r.n) { + rstream *rsm = newfile(R.s); +@@ -2703,8 +2711,12 @@ static var *evaluate(node *op, var *res) + fputs(getvar_s(intvar[ORS]), F); + + } else { /* OC_PRINTF */ +- char *s = awk_printf(op1); ++ char *s = awk_printf(op1, &len); ++#if ENABLE_FEATURE_AWK_GNU_EXTENSIONS ++ fwrite(s, len, 1, F); ++#else + fputs(s, F); ++#endif + free(s); + } + fflush(F); +@@ -2978,7 +2990,7 @@ static var *evaluate(node *op, var *res) + break; + + case XC( OC_SPRINTF ): +- setvar_p(res, awk_printf(op1)); ++ setvar_p(res, awk_printf(op1, NULL)); + break; + + case XC( OC_UNARY ): { +diff --git a/testsuite/awk.tests b/testsuite/awk.tests +index 92c83d719..cf9b722dc 100755 +--- 
a/testsuite/awk.tests ++++ b/testsuite/awk.tests +@@ -383,6 +383,11 @@ testing "awk errors on missing delete arg" \ + "awk -e '{delete}' 2>&1" "awk: cmd. line:1: Too few arguments\n" "" "" + SKIP= + ++optional FEATURE_AWK_GNU_EXTENSIONS ++testing "awk printf('%c') can output NUL" \ ++ "awk '{printf(\"hello%c null\n\", 0)}'" "hello\0 null\n" "" "\n" ++SKIP= ++ + # testing "description" "command" "result" "infile" "stdin" + testing 'awk negative field access' \ + 'awk 2>&1 -- '\''{ $(-1) }'\' \ +-- +2.27.0 + + +From 9dcd2d5cc91bde2d6cdd038ed23408057d6f6429 Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Wed, 16 Jun 2021 09:18:08 +0200 +Subject: [PATCH 02/61] awk: fix use-after-free in "$BIGNUM1 $BIGGERNUM2" + concat op + +Second reference to a field reallocs/moves Fields[] array, but first ref +still tries to use the element where it was before move. + +function old new delta +fsrealloc 94 106 +12 + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 85 ++++++++++++++++++++++++++++++++++++++++++--------- + 1 file changed, 71 insertions(+), 14 deletions(-) + +diff --git a/editors/awk.c b/editors/awk.c +index b4f6a3741..48836298c 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -1745,12 +1745,22 @@ static char* qrealloc(char *b, int n, int *size) + /* resize field storage space */ + static void fsrealloc(int size) + { +- int i; ++ int i, newsize; + + if (size >= maxfields) { ++ /* Sanity cap, easier than catering for overflows */ ++ if (size > 0xffffff) ++ bb_die_memory_exhausted(); ++ + i = maxfields; + maxfields = size + 16; +- Fields = xrealloc(Fields, maxfields * sizeof(Fields[0])); ++ ++ newsize = maxfields * sizeof(Fields[0]); ++ debug_printf_eval("fsrealloc: xrealloc(%p, %u)\n", Fields, newsize); ++ Fields = xrealloc(Fields, newsize); ++ debug_printf_eval("fsrealloc: Fields=%p..%p\n", Fields, (char*)Fields + newsize - 1); ++ /* ^^^ did Fields[] move? 
debug aid for L.v getting "upstaged" by R.v in evaluate() */ ++ + for (; i < maxfields; i++) { + Fields[i].type = VF_SPECIAL; + Fields[i].string = NULL; +@@ -2614,20 +2624,30 @@ static var *evaluate(node *op, var *res) + /* execute inevitable things */ + if (opinfo & OF_RES1) + L.v = evaluate(op1, v1); +- if (opinfo & OF_RES2) +- R.v = evaluate(op->r.n, v1+1); + if (opinfo & OF_STR1) { + L.s = getvar_s(L.v); + debug_printf_eval("L.s:'%s'\n", L.s); + } +- if (opinfo & OF_STR2) { +- R.s = getvar_s(R.v); +- debug_printf_eval("R.s:'%s'\n", R.s); +- } + if (opinfo & OF_NUM1) { + L_d = getvar_i(L.v); + debug_printf_eval("L_d:%f\n", L_d); + } ++ /* NB: Must get string/numeric values of L (done above) ++ * _before_ evaluate()'ing R.v: if both L and R are $NNNs, ++ * and right one is large, then L.v points to Fields[NNN1], ++ * second evaluate() reallocates and moves (!) Fields[], ++ * R.v points to Fields[NNN2] but L.v now points to freed mem! ++ * (Seen trying to evaluate "$444 $44444") ++ */ ++ if (opinfo & OF_RES2) { ++ R.v = evaluate(op->r.n, v1+1); ++ //TODO: L.v may be invalid now, set L.v to NULL to catch bugs? ++ //L.v = NULL; ++ } ++ if (opinfo & OF_STR2) { ++ R.s = getvar_s(R.v); ++ debug_printf_eval("R.s:'%s'\n", R.s); ++ } + + debug_printf_eval("switch(0x%x)\n", XC(opinfo & OPCLSMASK)); + switch (XC(opinfo & OPCLSMASK)) { +@@ -2636,6 +2656,7 @@ static var *evaluate(node *op, var *res) + + /* test pattern */ + case XC( OC_TEST ): ++ debug_printf_eval("TEST\n"); + if ((op1->info & OPCLSMASK) == OC_COMMA) { + /* it's range pattern */ + if ((opinfo & OF_CHECKED) || ptest(op1->l.n)) { +@@ -2653,25 +2674,32 @@ static var *evaluate(node *op, var *res) + + /* just evaluate an expression, also used as unconditional jump */ + case XC( OC_EXEC ): ++ debug_printf_eval("EXEC\n"); + break; + + /* branch, used in if-else and various loops */ + case XC( OC_BR ): ++ debug_printf_eval("BR\n"); + op = istrue(L.v) ? 
op->a.n : op->r.n; + break; + + /* initialize for-in loop */ + case XC( OC_WALKINIT ): ++ debug_printf_eval("WALKINIT\n"); + hashwalk_init(L.v, iamarray(R.v)); + break; + + /* get next array item */ + case XC( OC_WALKNEXT ): ++ debug_printf_eval("WALKNEXT\n"); + op = hashwalk_next(L.v) ? op->a.n : op->r.n; + break; + + case XC( OC_PRINT ): +- case XC( OC_PRINTF ): { ++ debug_printf_eval("PRINT /\n"); ++ case XC( OC_PRINTF ): ++ debug_printf_eval("PRINTF\n"); ++ { + FILE *F = stdout; + IF_FEATURE_AWK_GNU_EXTENSIONS(int len;) + +@@ -2726,22 +2754,28 @@ static var *evaluate(node *op, var *res) + /* case XC( OC_DELETE ): - moved to happen before arg evaluation */ + + case XC( OC_NEWSOURCE ): ++ debug_printf_eval("NEWSOURCE\n"); + g_progname = op->l.new_progname; + break; + + case XC( OC_RETURN ): ++ debug_printf_eval("RETURN\n"); + copyvar(res, L.v); + break; + + case XC( OC_NEXTFILE ): ++ debug_printf_eval("NEXTFILE\n"); + nextfile = TRUE; + case XC( OC_NEXT ): ++ debug_printf_eval("NEXT\n"); + nextrec = TRUE; + case XC( OC_DONE ): ++ debug_printf_eval("DONE\n"); + clrvar(res); + break; + + case XC( OC_EXIT ): ++ debug_printf_eval("EXIT\n"); + awk_exit(L_d); + + /* -- recursive node type -- */ +@@ -2761,15 +2795,18 @@ static var *evaluate(node *op, var *res) + break; + + case XC( OC_IN ): ++ debug_printf_eval("IN\n"); + setvar_i(res, hash_search(iamarray(R.v), L.s) ? 1 : 0); + break; + + case XC( OC_REGEXP ): ++ debug_printf_eval("REGEXP\n"); + op1 = op; + L.s = getvar_s(intvar[F0]); + goto re_cont; + + case XC( OC_MATCH ): ++ debug_printf_eval("MATCH\n"); + op1 = op->r.n; + re_cont: + { +@@ -2795,6 +2832,7 @@ static var *evaluate(node *op, var *res) + break; + + case XC( OC_TERNARY ): ++ debug_printf_eval("TERNARY\n"); + if ((op->r.n->info & OPCLSMASK) != OC_COLON) + syntax_error(EMSG_POSSIBLE_ERROR); + res = evaluate(istrue(L.v) ? 
op->r.n->l.n : op->r.n->r.n, res); +@@ -2803,6 +2841,7 @@ static var *evaluate(node *op, var *res) + case XC( OC_FUNC ): { + var *vbeg, *v; + const char *sv_progname; ++ debug_printf_eval("FUNC\n"); + + /* The body might be empty, still has to eval the args */ + if (!op->r.n->info && !op->r.f->body.first) +@@ -2832,7 +2871,10 @@ static var *evaluate(node *op, var *res) + } + + case XC( OC_GETLINE ): +- case XC( OC_PGETLINE ): { ++ debug_printf_eval("GETLINE /\n"); ++ case XC( OC_PGETLINE ): ++ debug_printf_eval("PGETLINE\n"); ++ { + rstream *rsm; + int i; + +@@ -2873,6 +2915,7 @@ static var *evaluate(node *op, var *res) + /* simple builtins */ + case XC( OC_FBLTIN ): { + double R_d = R_d; /* for compiler */ ++ debug_printf_eval("FBLTIN\n"); + + switch (opn) { + case F_in: +@@ -2986,14 +3029,18 @@ static var *evaluate(node *op, var *res) + } + + case XC( OC_BUILTIN ): ++ debug_printf_eval("BUILTIN\n"); + res = exec_builtin(op, res); + break; + + case XC( OC_SPRINTF ): ++ debug_printf_eval("SPRINTF\n"); + setvar_p(res, awk_printf(op1, NULL)); + break; + +- case XC( OC_UNARY ): { ++ case XC( OC_UNARY ): ++ debug_printf_eval("UNARY\n"); ++ { + double Ld, R_d; + + Ld = R_d = getvar_i(R.v); +@@ -3023,7 +3070,9 @@ static var *evaluate(node *op, var *res) + break; + } + +- case XC( OC_FIELD ): { ++ case XC( OC_FIELD ): ++ debug_printf_eval("FIELD\n"); ++ { + int i = (int)getvar_i(R.v); + if (i < 0) + syntax_error(EMSG_NEGATIVE_FIELD); +@@ -3040,8 +3089,10 @@ static var *evaluate(node *op, var *res) + + /* concatenation (" ") and index joining (",") */ + case XC( OC_CONCAT ): ++ debug_printf_eval("CONCAT /\n"); + case XC( OC_COMMA ): { + const char *sep = ""; ++ debug_printf_eval("COMMA\n"); + if ((opinfo & OPCLSMASK) == OC_COMMA) + sep = getvar_s(intvar[SUBSEP]); + setvar_p(res, xasprintf("%s%s%s", L.s, sep, R.s)); +@@ -3049,17 +3100,22 @@ static var *evaluate(node *op, var *res) + } + + case XC( OC_LAND ): ++ debug_printf_eval("LAND\n"); + setvar_i(res, istrue(L.v) ? 
ptest(op->r.n) : 0); + break; + + case XC( OC_LOR ): ++ debug_printf_eval("LOR\n"); + setvar_i(res, istrue(L.v) ? 1 : ptest(op->r.n)); + break; + + case XC( OC_BINARY ): +- case XC( OC_REPLACE ): { ++ debug_printf_eval("BINARY /\n"); ++ case XC( OC_REPLACE ): ++ debug_printf_eval("REPLACE\n"); ++ { + double R_d = getvar_i(R.v); +- debug_printf_eval("BINARY/REPLACE: R_d:%f opn:%c\n", R_d, opn); ++ debug_printf_eval("R_d:%f opn:%c\n", R_d, opn); + switch (opn) { + case '+': + L_d += R_d; +@@ -3095,6 +3151,7 @@ static var *evaluate(node *op, var *res) + case XC( OC_COMPARE ): { + int i = i; /* for compiler */ + double Ld; ++ debug_printf_eval("COMPARE\n"); + + if (is_numeric(L.v) && is_numeric(R.v)) { + Ld = getvar_i(L.v) - getvar_i(R.v); +-- +2.27.0 + + +From 1d5e5492dd8368ee3870bcd390754aa7c3f8956c Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Fri, 18 Jun 2021 16:35:27 +0200 +Subject: [PATCH 03/61] awk: after preinc/dec, only allow variable, field ref, + array ref, or another preinc/dec + +Accepting nonsense like "--4", and even "-- -4" is confusing. + +function old new delta +parse_expr 917 938 +21 + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 87 ++++++++++++++++++++++++++++++++++++++++----------- + 1 file changed, 69 insertions(+), 18 deletions(-) + +diff --git a/editors/awk.c b/editors/awk.c +index 48836298c..2563722f9 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -66,6 +66,8 @@ + #endif + #ifndef debug_printf_parse + # define debug_printf_parse(...) (fprintf(stderr, __VA_ARGS__)) ++#else ++# define debug_parse_print_tc(...) 
((void)0) + #endif + + +@@ -210,13 +212,13 @@ typedef struct tsplitter_s { + #define TC_SEQTERM (1 << 1) /* ) */ + #define TC_REGEXP (1 << 2) /* /.../ */ + #define TC_OUTRDR (1 << 3) /* | > >> */ +-#define TC_UOPPOST (1 << 4) /* unary postfix operator */ +-#define TC_UOPPRE1 (1 << 5) /* unary prefix operator */ ++#define TC_UOPPOST (1 << 4) /* unary postfix operator ++ -- */ ++#define TC_UOPPRE1 (1 << 5) /* unary prefix operator ++ -- $ */ + #define TC_BINOPX (1 << 6) /* two-opnd operator */ + #define TC_IN (1 << 7) + #define TC_COMMA (1 << 8) + #define TC_PIPE (1 << 9) /* input redirection pipe */ +-#define TC_UOPPRE2 (1 << 10) /* unary prefix operator */ ++#define TC_UOPPRE2 (1 << 10) /* unary prefix operator + - ! */ + #define TC_ARRTERM (1 << 11) /* ] */ + #define TC_GRPSTART (1 << 12) /* { */ + #define TC_GRPTERM (1 << 13) /* } */ +@@ -243,14 +245,51 @@ typedef struct tsplitter_s { + #define TC_STRING (1 << 29) + #define TC_NUMBER (1 << 30) + +-#define TC_UOPPRE (TC_UOPPRE1 | TC_UOPPRE2) ++#ifndef debug_parse_print_tc ++#define debug_parse_print_tc(n) do { \ ++if ((n) & TC_SEQSTART) debug_printf_parse(" SEQSTART"); \ ++if ((n) & TC_SEQTERM ) debug_printf_parse(" SEQTERM" ); \ ++if ((n) & TC_REGEXP ) debug_printf_parse(" REGEXP" ); \ ++if ((n) & TC_OUTRDR ) debug_printf_parse(" OUTRDR" ); \ ++if ((n) & TC_UOPPOST ) debug_printf_parse(" UOPPOST" ); \ ++if ((n) & TC_UOPPRE1 ) debug_printf_parse(" UOPPRE1" ); \ ++if ((n) & TC_BINOPX ) debug_printf_parse(" BINOPX" ); \ ++if ((n) & TC_IN ) debug_printf_parse(" IN" ); \ ++if ((n) & TC_COMMA ) debug_printf_parse(" COMMA" ); \ ++if ((n) & TC_PIPE ) debug_printf_parse(" PIPE" ); \ ++if ((n) & TC_UOPPRE2 ) debug_printf_parse(" UOPPRE2" ); \ ++if ((n) & TC_ARRTERM ) debug_printf_parse(" ARRTERM" ); \ ++if ((n) & TC_GRPSTART) debug_printf_parse(" GRPSTART"); \ ++if ((n) & TC_GRPTERM ) debug_printf_parse(" GRPTERM" ); \ ++if ((n) & TC_SEMICOL ) debug_printf_parse(" SEMICOL" ); \ ++if ((n) & TC_NEWLINE ) debug_printf_parse(" 
NEWLINE" ); \ ++if ((n) & TC_STATX ) debug_printf_parse(" STATX" ); \ ++if ((n) & TC_WHILE ) debug_printf_parse(" WHILE" ); \ ++if ((n) & TC_ELSE ) debug_printf_parse(" ELSE" ); \ ++if ((n) & TC_BUILTIN ) debug_printf_parse(" BUILTIN" ); \ ++if ((n) & TC_LENGTH ) debug_printf_parse(" LENGTH" ); \ ++if ((n) & TC_GETLINE ) debug_printf_parse(" GETLINE" ); \ ++if ((n) & TC_FUNCDECL) debug_printf_parse(" FUNCDECL"); \ ++if ((n) & TC_BEGIN ) debug_printf_parse(" BEGIN" ); \ ++if ((n) & TC_END ) debug_printf_parse(" END" ); \ ++if ((n) & TC_EOF ) debug_printf_parse(" EOF" ); \ ++if ((n) & TC_VARIABLE) debug_printf_parse(" VARIABLE"); \ ++if ((n) & TC_ARRAY ) debug_printf_parse(" ARRAY" ); \ ++if ((n) & TC_FUNCTION) debug_printf_parse(" FUNCTION"); \ ++if ((n) & TC_STRING ) debug_printf_parse(" STRING" ); \ ++if ((n) & TC_NUMBER ) debug_printf_parse(" NUMBER" ); \ ++} while (0) ++#endif + + /* combined token classes */ ++#define TC_UOPPRE (TC_UOPPRE1 | TC_UOPPRE2) ++ + #define TC_BINOP (TC_BINOPX | TC_COMMA | TC_PIPE | TC_IN) + //#define TC_UNARYOP (TC_UOPPRE | TC_UOPPOST) + #define TC_OPERAND (TC_VARIABLE | TC_ARRAY | TC_FUNCTION \ + | TC_BUILTIN | TC_LENGTH | TC_GETLINE \ + | TC_SEQSTART | TC_STRING | TC_NUMBER) ++#define TC_LVALUE (TC_VARIABLE | TC_ARRAY) + + #define TC_STATEMNT (TC_STATX | TC_WHILE) + #define TC_OPTERM (TC_SEMICOL | TC_NEWLINE) +@@ -284,7 +323,6 @@ typedef struct tsplitter_s { + #define OF_CHECKED 0x200000 + #define OF_REQUIRED 0x400000 + +- + /* combined operator flags */ + #define xx 0 + #define xV OF_RES2 +@@ -313,10 +351,8 @@ typedef struct tsplitter_s { + #define PRIMASK2 0x7E000000 + + /* Operation classes */ +- + #define SHIFT_TIL_THIS 0x0600 + #define RECUR_FROM_THIS 0x1000 +- + enum { + OC_DELETE = 0x0100, OC_EXEC = 0x0200, OC_NEWSOURCE = 0x0300, + OC_PRINT = 0x0400, OC_PRINTF = 0x0500, OC_WALKINIT = 0x0600, +@@ -411,7 +447,9 @@ static const uint32_t tokeninfo[] ALIGN4 = { + OC_REGEXP, + xS|'a', xS|'w', xS|'|', + OC_UNARY|xV|P(9)|'p', 
OC_UNARY|xV|P(9)|'m', +- OC_UNARY|xV|P(9)|'P', OC_UNARY|xV|P(9)|'M', OC_FIELD|xV|P(5), ++#define TI_PREINC (OC_UNARY|xV|P(9)|'P') ++#define TI_PREDEC (OC_UNARY|xV|P(9)|'M') ++ TI_PREINC, TI_PREDEC, OC_FIELD|xV|P(5), + OC_COMPARE|VV|P(39)|5, OC_MOVE|VV|P(74), OC_REPLACE|NV|P(74)|'+', OC_REPLACE|NV|P(74)|'-', + OC_REPLACE|NV|P(74)|'*', OC_REPLACE|NV|P(74)|'/', OC_REPLACE|NV|P(74)|'%', OC_REPLACE|NV|P(74)|'&', + OC_BINARY|NV|P(29)|'+', OC_BINARY|NV|P(29)|'-', OC_REPLACE|NV|P(74)|'&', OC_BINARY|NV|P(15)|'&', +@@ -1070,6 +1108,10 @@ static uint32_t next_token(uint32_t expected) + uint32_t tc; + const uint32_t *ti; + ++ debug_printf_parse("%s() expected(%x):", __func__, expected); ++ debug_parse_print_tc(expected); ++ debug_printf_parse("\n"); ++ + if (t_rollback) { + debug_printf_parse("%s: using rolled-back token\n", __func__); + t_rollback = FALSE; +@@ -1226,7 +1268,9 @@ static uint32_t next_token(uint32_t expected) + EMSG_UNEXP_EOS : EMSG_UNEXP_TOKEN); + } + +- debug_printf_parse("%s: returning, ltclass:%x t_double:%f\n", __func__, ltclass, t_double); ++ debug_printf_parse("%s: returning, t_double:%f ltclass:", __func__, t_double); ++ debug_parse_print_tc(ltclass); ++ debug_printf_parse("\n"); + return ltclass; + #undef concat_inserted + #undef save_tclass +@@ -1266,7 +1310,7 @@ static node *condition(void) + + /* parse expression terminated by given argument, return ptr + * to built subtree. 
Terminator is eaten by parse_expr */ +-static node *parse_expr(uint32_t iexp) ++static node *parse_expr(uint32_t term_tc) + { + node sn; + node *cn = &sn; +@@ -1274,13 +1318,15 @@ static node *parse_expr(uint32_t iexp) + uint32_t tc, xtc; + var *v; + +- debug_printf_parse("%s(%x)\n", __func__, iexp); ++ debug_printf_parse("%s() term_tc(%x):", __func__, term_tc); ++ debug_parse_print_tc(term_tc); ++ debug_printf_parse("\n"); + + sn.info = PRIMASK; + sn.r.n = sn.a.n = glptr = NULL; +- xtc = TC_OPERAND | TC_UOPPRE | TC_REGEXP | iexp; ++ xtc = TC_OPERAND | TC_UOPPRE | TC_REGEXP | term_tc; + +- while (!((tc = next_token(xtc)) & iexp)) { ++ while (!((tc = next_token(xtc)) & term_tc)) { + + if (glptr && (t_info == (OC_COMPARE | VV | P(39) | 2))) { + /* input redirection (<) attached to glptr node */ +@@ -1313,25 +1359,28 @@ static node *parse_expr(uint32_t iexp) + next_token(TC_GETLINE); + /* give maximum priority to this pipe */ + cn->info &= ~PRIMASK; +- xtc = TC_OPERAND | TC_UOPPRE | TC_BINOP | iexp; ++ xtc = TC_OPERAND | TC_UOPPRE | TC_BINOP | term_tc; + } + } else { + cn->r.n = vn; +- xtc = TC_OPERAND | TC_UOPPRE | TC_BINOP | iexp; ++ xtc = TC_OPERAND | TC_UOPPRE | TC_BINOP | term_tc; + } + vn->a.n = cn; + + } else { +- debug_printf_parse("%s: other\n", __func__); ++ debug_printf_parse("%s: other, t_info:%x\n", __func__, t_info); + /* for operands and prefix-unary operators, attach them + * to last node */ + vn = cn; + cn = vn->r.n = new_node(t_info); + cn->a.n = vn; ++ + xtc = TC_OPERAND | TC_UOPPRE | TC_REGEXP; ++ if (t_info == TI_PREINC || t_info == TI_PREDEC) ++ xtc = TC_LVALUE | TC_UOPPRE1; + if (tc & (TC_OPERAND | TC_REGEXP)) { + debug_printf_parse("%s: TC_OPERAND | TC_REGEXP\n", __func__); +- xtc = TC_UOPPRE | TC_UOPPOST | TC_BINOP | TC_OPERAND | iexp; ++ xtc = TC_UOPPRE | TC_UOPPOST | TC_BINOP | TC_OPERAND | term_tc; + /* one should be very careful with switch on tclass - + * only simple tclasses should be used! 
*/ + switch (tc) { +@@ -1388,7 +1437,7 @@ static node *parse_expr(uint32_t iexp) + case TC_GETLINE: + debug_printf_parse("%s: TC_GETLINE\n", __func__); + glptr = cn; +- xtc = TC_OPERAND | TC_UOPPRE | TC_BINOP | iexp; ++ xtc = TC_OPERAND | TC_UOPPRE | TC_BINOP | term_tc; + break; + + case TC_BUILTIN: +@@ -1603,6 +1652,8 @@ static void parse_program(char *p) + func *f; + var *v; + ++ debug_printf_parse("%s()\n", __func__); ++ + g_pos = p; + t_lineno = 1; + while ((tclass = next_token(TC_EOF | TC_OPSEQ | TC_GRPSTART | +-- +2.27.0 + + +From 3d0acb8934f496021a63471ef9e29c87520612a0 Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Sun, 20 Jun 2021 22:52:29 +0200 +Subject: [PATCH 04/61] awk: make code clearer, no actual code changes + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/editors/awk.c b/editors/awk.c +index 2563722f9..5f1d670a4 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -455,7 +455,8 @@ static const uint32_t tokeninfo[] ALIGN4 = { + OC_BINARY|NV|P(29)|'+', OC_BINARY|NV|P(29)|'-', OC_REPLACE|NV|P(74)|'&', OC_BINARY|NV|P(15)|'&', + OC_BINARY|NV|P(25)|'/', OC_BINARY|NV|P(25)|'%', OC_BINARY|NV|P(15)|'&', OC_BINARY|NV|P(25)|'*', + OC_COMPARE|VV|P(39)|4, OC_COMPARE|VV|P(39)|3, OC_COMPARE|VV|P(39)|0, OC_COMPARE|VV|P(39)|1, +- OC_COMPARE|VV|P(39)|2, OC_MATCH|Sx|P(45)|'!', OC_MATCH|Sx|P(45)|'~', OC_LAND|Vx|P(55), ++#define TI_LESS (OC_COMPARE|VV|P(39)|2) ++ TI_LESS, OC_MATCH|Sx|P(45)|'!', OC_MATCH|Sx|P(45)|'~', OC_LAND|Vx|P(55), + OC_LOR|Vx|P(59), OC_TERNARY|Vx|P(64)|'?', OC_COLON|xx|P(67)|':', + OC_IN|SV|P(49), /* TC_IN */ + OC_COMMA|SS|P(80), +@@ -1328,7 +1329,7 @@ static node *parse_expr(uint32_t term_tc) + + while (!((tc = next_token(xtc)) & term_tc)) { + +- if (glptr && (t_info == (OC_COMPARE | VV | P(39) | 2))) { ++ if (glptr && (t_info == TI_LESS)) { + /* input redirection (<) attached to glptr node */ + debug_printf_parse("%s: input redir\n", __func__); + cn = 
glptr->l.n = new_node(OC_CONCAT | SS | P(37)); +-- +2.27.0 + + +From 3c18df6595f8efc0229d7afc948b8ef38fb6f1aa Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Fri, 25 Jun 2021 19:38:27 +0200 +Subject: [PATCH 05/61] awk: more efficient -f FILE, document what "some trick + in next_token" is + +function old new delta +awk_main 890 898 +8 + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 33 ++++++++++++++++++++++++--------- + 1 file changed, 24 insertions(+), 9 deletions(-) + +diff --git a/editors/awk.c b/editors/awk.c +index 5f1d670a4..1b23c17d2 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -1217,6 +1217,8 @@ static uint32_t next_token(uint32_t expected) + if (!isalnum_(*p)) + syntax_error(EMSG_UNEXP_TOKEN); /* no */ + /* yes */ ++/* "move name one char back" trick: we need a byte for NUL terminator */ ++/* NB: this results in argv[i][-1] being used (!!!) in e.g. "awk -e 'NAME'" case */ + t_string = --p; + while (isalnum_(*++p)) { + p[-1] = *p; +@@ -3345,7 +3347,7 @@ int awk_main(int argc UNUSED_PARAM, char **argv) + #if ENABLE_FEATURE_AWK_GNU_EXTENSIONS + llist_t *list_e = NULL; + #endif +- int i, j; ++ int i; + var *v; + var tv; + char **envp; +@@ -3417,30 +3419,43 @@ int awk_main(int argc UNUSED_PARAM, char **argv) + bb_show_usage(); + } + while (list_f) { +- char *s = NULL; +- FILE *from_file; ++ int fd; ++ char *s; + + g_progname = llist_pop(&list_f); +- from_file = xfopen_stdin(g_progname); +- /* one byte is reserved for some trick in next_token */ +- for (i = j = 1; j > 0; i += j) { +- s = xrealloc(s, i + 4096); +- j = fread(s + i, 1, 4094, from_file); ++ fd = xopen_stdin(g_progname); ++ /* 1st byte is reserved for "move name one char back" trick in next_token */ ++ i = 1; ++ s = NULL; ++ for (;;) { ++ int sz; ++ s = xrealloc(s, i + 1000); ++ sz = safe_read(fd, s + i, 1000); ++ if (sz <= 0) ++ break; ++ i += sz; + } ++ s = xrealloc(s, i + 1); /* trim unused 999 bytes */ + s[i] = '\0'; +- fclose(from_file); ++ close(fd); + parse_program(s + 1); 
+ free(s); + } + g_progname = "cmd. line"; + #if ENABLE_FEATURE_AWK_GNU_EXTENSIONS + while (list_e) { ++ /* NB: "move name one char back" trick in next_token ++ * can use argv[i][-1] here. ++ */ + parse_program(llist_pop(&list_e)); + } + #endif + if (!(opt & (OPT_f | OPT_e))) { + if (!*argv) + bb_show_usage(); ++ /* NB: "move name one char back" trick in next_token ++ * can use argv[i][-1] here. ++ */ + parse_program(*argv++); + } + +-- +2.27.0 + + +From f8243879801f8d9d9fffbde592aee4264aa30d71 Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Fri, 25 Jun 2021 19:41:05 +0200 +Subject: [PATCH 06/61] awk: move locals deeper into scopes where they are + used, no logic changes + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 62 ++++++++++++++++++++++++++------------------------- + 1 file changed, 32 insertions(+), 30 deletions(-) + +diff --git a/editors/awk.c b/editors/awk.c +index 1b23c17d2..86076d7b6 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -3254,20 +3254,19 @@ static var *evaluate(node *op, var *res) + + static int awk_exit(int r) + { +- var tv; + unsigned i; +- hash_item *hi; +- +- zero_out_var(&tv); + + if (!exiting) { ++ var tv; + exiting = TRUE; + nextrec = FALSE; ++ zero_out_var(&tv); + evaluate(endseq.first, &tv); + } + + /* waiting for children */ + for (i = 0; i < fdhash->csize; i++) { ++ hash_item *hi; + hi = fdhash->items[i]; + while (hi) { + if (hi->data.rs.F && hi->data.rs.is_pipe) +@@ -3348,11 +3347,7 @@ int awk_main(int argc UNUSED_PARAM, char **argv) + llist_t *list_e = NULL; + #endif + int i; +- var *v; + var tv; +- char **envp; +- char *vnames = (char *)vNames; /* cheat */ +- char *vvalues = (char *)vValues; + + INIT_G(); + +@@ -3361,8 +3356,6 @@ int awk_main(int argc UNUSED_PARAM, char **argv) + if (ENABLE_LOCALE_SUPPORT) + setlocale(LC_NUMERIC, "C"); + +- zero_out_var(&tv); +- + /* allocate global buffer */ + g_buf = xmalloc(MAXVARFMT + 1); + +@@ -3372,16 +3365,21 @@ int awk_main(int argc UNUSED_PARAM, char **argv) + fnhash 
= hash_init(); + + /* initialize variables */ +- for (i = 0; *vnames; i++) { +- intvar[i] = v = newvar(nextword(&vnames)); +- if (*vvalues != '\377') +- setvar_s(v, nextword(&vvalues)); +- else +- setvar_i(v, 0); +- +- if (*vnames == '*') { +- v->type |= VF_SPECIAL; +- vnames++; ++ { ++ char *vnames = (char *)vNames; /* cheat */ ++ char *vvalues = (char *)vValues; ++ for (i = 0; *vnames; i++) { ++ var *v; ++ intvar[i] = v = newvar(nextword(&vnames)); ++ if (*vvalues != '\377') ++ setvar_s(v, nextword(&vvalues)); ++ else ++ setvar_i(v, 0); ++ ++ if (*vnames == '*') { ++ v->type |= VF_SPECIAL; ++ vnames++; ++ } + } + } + +@@ -3393,16 +3391,19 @@ int awk_main(int argc UNUSED_PARAM, char **argv) + newfile("/dev/stderr")->F = stderr; + + /* Huh, people report that sometimes environ is NULL. Oh well. */ +- if (environ) for (envp = environ; *envp; envp++) { +- /* environ is writable, thus we don't strdup it needlessly */ +- char *s = *envp; +- char *s1 = strchr(s, '='); +- if (s1) { +- *s1 = '\0'; +- /* Both findvar and setvar_u take const char* +- * as 2nd arg -> environment is not trashed */ +- setvar_u(findvar(iamarray(intvar[ENVIRON]), s), s1 + 1); +- *s1 = '='; ++ if (environ) { ++ char **envp; ++ for (envp = environ; *envp; envp++) { ++ /* environ is writable, thus we don't strdup it needlessly */ ++ char *s = *envp; ++ char *s1 = strchr(s, '='); ++ if (s1) { ++ *s1 = '\0'; ++ /* Both findvar and setvar_u take const char* ++ * as 2nd arg -> environment is not trashed */ ++ setvar_u(findvar(iamarray(intvar[ENVIRON]), s), s1 + 1); ++ *s1 = '='; ++ } + } + } + opt = getopt32(argv, OPTSTR_AWK, &opt_F, &list_v, &list_f, IF_FEATURE_AWK_GNU_EXTENSIONS(&list_e,) NULL); +@@ -3466,6 +3467,7 @@ int awk_main(int argc UNUSED_PARAM, char **argv) + setari_u(intvar[ARGV], ++i, *argv++); + setvar_i(intvar[ARGC], i + 1); + ++ zero_out_var(&tv); + evaluate(beginseq.first, &tv); + if (!mainseq.first && !endseq.first) + awk_exit(EXIT_SUCCESS); +-- +2.27.0 + + +From 
b52a50128d64e1f601e17507ffc118c180ef7b3d Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Tue, 29 Jun 2021 01:03:42 +0200 +Subject: [PATCH 07/61] awk: remove redundant check + +function old new delta +next_token 785 784 -1 +parse_program 337 328 -9 +------------------------------------------------------------------------------ +(add/remove: 0/0 grow/shrink: 0/2 up/down: 0/-10) Total: -10 bytes + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 32 ++++++++++++++++++-------------- + 1 file changed, 18 insertions(+), 14 deletions(-) + +diff --git a/editors/awk.c b/editors/awk.c +index 86076d7b6..9826a57c6 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -1093,8 +1093,9 @@ static void nvfree(var *v) + + /* ------- awk program text parsing ------- */ + +-/* Parse next token pointed by global pos, place results into global ttt. +- * If token isn't expected, give away. Return token class ++/* Parse next token pointed by global pos, place results into global t_XYZ variables. ++ * If token isn't expected, print error message and die. ++ * Return token class (also store it in t_tclass). + */ + static uint32_t next_token(uint32_t expected) + { +@@ -1248,33 +1249,35 @@ static uint32_t next_token(uint32_t expected) + goto readnext; + + /* insert concatenation operator when needed */ +- debug_printf_parse("%s: %x %x %x concat_inserted?\n", __func__, +- (ltclass & TC_CONCAT1), (tc & TC_CONCAT2), (expected & TC_BINOP)); ++ debug_printf_parse("%s: concat_inserted if all nonzero: %x %x %x %x\n", __func__, ++ (ltclass & TC_CONCAT1), (tc & TC_CONCAT2), (expected & TC_BINOP), ++ !(ltclass == TC_LENGTH && tc == TC_SEQSTART)); + if ((ltclass & TC_CONCAT1) && (tc & TC_CONCAT2) && (expected & TC_BINOP) + && !(ltclass == TC_LENGTH && tc == TC_SEQSTART) /* but not for "length(..." 
*/ + ) { + concat_inserted = TRUE; + save_tclass = tc; + save_info = t_info; +- tc = TC_BINOP; ++ tc = TC_BINOPX; + t_info = OC_CONCAT | SS | P(35); + } + +- debug_printf_parse("%s: t_tclass=tc=%x\n", __func__, t_tclass); + t_tclass = tc; ++ debug_printf_parse("%s: t_tclass=tc=%x\n", __func__, tc); + } +- ltclass = t_tclass; +- + /* Are we ready for this? */ +- if (!(ltclass & expected)) { ++ if (!(t_tclass & expected)) { + syntax_error((ltclass & (TC_NEWLINE | TC_EOF)) ? + EMSG_UNEXP_EOS : EMSG_UNEXP_TOKEN); + } + +- debug_printf_parse("%s: returning, t_double:%f ltclass:", __func__, t_double); +- debug_parse_print_tc(ltclass); ++ debug_printf_parse("%s: returning, t_double:%f t_tclass:", __func__, t_double); ++ debug_parse_print_tc(t_tclass); + debug_printf_parse("\n"); +- return ltclass; ++ ++ ltclass = t_tclass; ++ ++ return t_tclass; + #undef concat_inserted + #undef save_tclass + #undef save_info +@@ -1700,8 +1703,9 @@ static void parse_program(char *p) + /* Arg followed either by end of arg list or 1 comma */ + if (next_token(TC_COMMA | TC_SEQTERM) & TC_SEQTERM) + break; +- if (t_tclass != TC_COMMA) +- syntax_error(EMSG_UNEXP_TOKEN); ++//Impossible: next_token() above would error out and die ++// if (t_tclass != TC_COMMA) ++// syntax_error(EMSG_UNEXP_TOKEN); + } + seq = &f->body; + chain_group(); +-- +2.27.0 + + +From 96368c3613c1b01c42b7b382d01142a07c919f60 Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Tue, 29 Jun 2021 01:09:08 +0200 +Subject: [PATCH 08/61] awk: make ltclass ("last token class") local to + next_token() + +function old new delta +next_token 784 790 +6 +next_input_file 219 216 -3 +------------------------------------------------------------------------------ +(add/remove: 0/0 grow/shrink: 1/1 up/down: 6/-3) Total: 3 bytes + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 24 ++++++++++-------------- + 1 file changed, 10 insertions(+), 14 deletions(-) + +diff --git a/editors/awk.c b/editors/awk.c +index 9826a57c6..418bda160 100644 
+--- a/editors/awk.c ++++ b/editors/awk.c +@@ -556,7 +556,6 @@ struct globals2 { + + uint32_t next_token__save_tclass; + uint32_t next_token__save_info; +- uint32_t next_token__ltclass; + smallint next_token__concat_inserted; + + smallint next_input_file__files_happen; +@@ -615,7 +614,7 @@ struct globals2 { + #define rsplitter (G.rsplitter ) + #define INIT_G() do { \ + SET_PTR_TO_GLOBALS((char*)xzalloc(sizeof(G1)+sizeof(G)) + sizeof(G1)); \ +- G.next_token__ltclass = TC_OPTERM; \ ++ t_tclass = TC_OPTERM; \ + G.evaluate__seed = 1; \ + } while (0) + +@@ -1102,13 +1101,13 @@ static uint32_t next_token(uint32_t expected) + #define concat_inserted (G.next_token__concat_inserted) + #define save_tclass (G.next_token__save_tclass) + #define save_info (G.next_token__save_info) +-/* Initialized to TC_OPTERM: */ +-#define ltclass (G.next_token__ltclass) + + char *p, *s; + const char *tl; +- uint32_t tc; + const uint32_t *ti; ++ uint32_t tc, last_token_class; ++ ++ last_token_class = t_tclass; /* t_tclass is initialized to TC_OPTERM */ + + debug_printf_parse("%s() expected(%x):", __func__, expected); + debug_parse_print_tc(expected); +@@ -1245,15 +1244,15 @@ static uint32_t next_token(uint32_t expected) + g_pos = p; + + /* skipping newlines in some cases */ +- if ((ltclass & TC_NOTERM) && (tc & TC_NEWLINE)) ++ if ((last_token_class & TC_NOTERM) && (tc & TC_NEWLINE)) + goto readnext; + + /* insert concatenation operator when needed */ + debug_printf_parse("%s: concat_inserted if all nonzero: %x %x %x %x\n", __func__, +- (ltclass & TC_CONCAT1), (tc & TC_CONCAT2), (expected & TC_BINOP), +- !(ltclass == TC_LENGTH && tc == TC_SEQSTART)); +- if ((ltclass & TC_CONCAT1) && (tc & TC_CONCAT2) && (expected & TC_BINOP) +- && !(ltclass == TC_LENGTH && tc == TC_SEQSTART) /* but not for "length(..." 
*/ ++ (last_token_class & TC_CONCAT1), (tc & TC_CONCAT2), (expected & TC_BINOP), ++ !(last_token_class == TC_LENGTH && tc == TC_SEQSTART)); ++ if ((last_token_class & TC_CONCAT1) && (tc & TC_CONCAT2) && (expected & TC_BINOP) ++ && !(last_token_class == TC_LENGTH && tc == TC_SEQSTART) /* but not for "length(..." */ + ) { + concat_inserted = TRUE; + save_tclass = tc; +@@ -1267,7 +1266,7 @@ static uint32_t next_token(uint32_t expected) + } + /* Are we ready for this? */ + if (!(t_tclass & expected)) { +- syntax_error((ltclass & (TC_NEWLINE | TC_EOF)) ? ++ syntax_error((last_token_class & (TC_NEWLINE | TC_EOF)) ? + EMSG_UNEXP_EOS : EMSG_UNEXP_TOKEN); + } + +@@ -1275,13 +1274,10 @@ static uint32_t next_token(uint32_t expected) + debug_parse_print_tc(t_tclass); + debug_printf_parse("\n"); + +- ltclass = t_tclass; +- + return t_tclass; + #undef concat_inserted + #undef save_tclass + #undef save_info +-#undef ltclass + } + + static void rollback_token(void) +-- +2.27.0 + + +From 8b51ddd054a3454171440035ed7f125483e9697c Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Tue, 29 Jun 2021 01:23:37 +0200 +Subject: [PATCH 09/61] awk: use TS_foo for combined token classes. 
No code + changes + +Confusion with "simple" classes was the cause of a bug fixed by previous commit + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 128 +++++++++++++++++++++++++------------------------- + 1 file changed, 64 insertions(+), 64 deletions(-) + +diff --git a/editors/awk.c b/editors/awk.c +index 418bda160..764a3dd49 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -281,39 +281,39 @@ if ((n) & TC_NUMBER ) debug_printf_parse(" NUMBER" ); \ + } while (0) + #endif + +-/* combined token classes */ +-#define TC_UOPPRE (TC_UOPPRE1 | TC_UOPPRE2) ++/* combined token classes ("token [class] sets") */ ++#define TS_UOPPRE (TC_UOPPRE1 | TC_UOPPRE2) + +-#define TC_BINOP (TC_BINOPX | TC_COMMA | TC_PIPE | TC_IN) +-//#define TC_UNARYOP (TC_UOPPRE | TC_UOPPOST) +-#define TC_OPERAND (TC_VARIABLE | TC_ARRAY | TC_FUNCTION \ +- | TC_BUILTIN | TC_LENGTH | TC_GETLINE \ +- | TC_SEQSTART | TC_STRING | TC_NUMBER) +-#define TC_LVALUE (TC_VARIABLE | TC_ARRAY) ++#define TS_BINOP (TC_BINOPX | TC_COMMA | TC_PIPE | TC_IN) ++//#define TS_UNARYOP (TS_UOPPRE | TC_UOPPOST) ++#define TS_OPERAND (TC_VARIABLE | TC_ARRAY | TC_FUNCTION \ ++ | TC_BUILTIN | TC_LENGTH | TC_GETLINE \ ++ | TC_SEQSTART | TC_STRING | TC_NUMBER) + +-#define TC_STATEMNT (TC_STATX | TC_WHILE) +-#define TC_OPTERM (TC_SEMICOL | TC_NEWLINE) ++#define TS_LVALUE (TC_VARIABLE | TC_ARRAY) ++#define TS_STATEMNT (TC_STATX | TC_WHILE) ++#define TS_OPTERM (TC_SEMICOL | TC_NEWLINE) + + /* word tokens, cannot mean something else if not expected */ +-#define TC_WORD (TC_IN | TC_STATEMNT | TC_ELSE \ +- | TC_BUILTIN | TC_LENGTH | TC_GETLINE \ +- | TC_FUNCDECL | TC_BEGIN | TC_END) ++#define TS_WORD (TC_IN | TS_STATEMNT | TC_ELSE \ ++ | TC_BUILTIN | TC_LENGTH | TC_GETLINE \ ++ | TC_FUNCDECL | TC_BEGIN | TC_END) + + /* discard newlines after these */ +-#define TC_NOTERM (TC_COMMA | TC_GRPSTART | TC_GRPTERM \ +- | TC_BINOP | TC_OPTERM) ++#define TS_NOTERM (TC_COMMA | TC_GRPSTART | TC_GRPTERM \ ++ | TS_BINOP | TS_OPTERM) + + /* 
what can expression begin with */ +-#define TC_OPSEQ (TC_OPERAND | TC_UOPPRE | TC_REGEXP) ++#define TS_OPSEQ (TS_OPERAND | TS_UOPPRE | TC_REGEXP) + /* what can group begin with */ +-#define TC_GRPSEQ (TC_OPSEQ | TC_OPTERM | TC_STATEMNT | TC_GRPSTART) ++#define TS_GRPSEQ (TS_OPSEQ | TS_OPTERM | TS_STATEMNT | TC_GRPSTART) + +-/* if previous token class is CONCAT1 and next is CONCAT2, concatenation */ ++/* if previous token class is CONCAT_L and next is CONCAT_R, concatenation */ + /* operator is inserted between them */ +-#define TC_CONCAT1 (TC_VARIABLE | TC_ARRTERM | TC_SEQTERM \ ++#define TS_CONCAT_L (TC_VARIABLE | TC_ARRTERM | TC_SEQTERM \ + | TC_STRING | TC_NUMBER | TC_UOPPOST \ + | TC_LENGTH) +-#define TC_CONCAT2 (TC_OPERAND | TC_UOPPRE) ++#define TS_CONCAT_R (TS_OPERAND | TS_UOPPRE) + + #define OF_RES1 0x010000 + #define OF_RES2 0x020000 +@@ -614,7 +614,7 @@ struct globals2 { + #define rsplitter (G.rsplitter ) + #define INIT_G() do { \ + SET_PTR_TO_GLOBALS((char*)xzalloc(sizeof(G1)+sizeof(G)) + sizeof(G1)); \ +- t_tclass = TC_OPTERM; \ ++ t_tclass = TS_OPTERM; \ + G.evaluate__seed = 1; \ + } while (0) + +@@ -1107,7 +1107,7 @@ static uint32_t next_token(uint32_t expected) + const uint32_t *ti; + uint32_t tc, last_token_class; + +- last_token_class = t_tclass; /* t_tclass is initialized to TC_OPTERM */ ++ last_token_class = t_tclass; /* t_tclass is initialized to TS_OPTERM */ + + debug_printf_parse("%s() expected(%x):", __func__, expected); + debug_parse_print_tc(expected); +@@ -1198,9 +1198,9 @@ static uint32_t next_token(uint32_t expected) + * token matches, + * and it's not a longer word, + */ +- if ((tc & (expected | TC_WORD | TC_NEWLINE)) ++ if ((tc & (expected | TS_WORD | TC_NEWLINE)) + && strncmp(p, tl, l) == 0 +- && !((tc & TC_WORD) && isalnum_(p[l])) ++ && !((tc & TS_WORD) && isalnum_(p[l])) + ) { + /* then this is what we are looking for */ + t_info = *ti; +@@ -1244,14 +1244,14 @@ static uint32_t next_token(uint32_t expected) + g_pos = p; + + /* 
skipping newlines in some cases */ +- if ((last_token_class & TC_NOTERM) && (tc & TC_NEWLINE)) ++ if ((last_token_class & TS_NOTERM) && (tc & TC_NEWLINE)) + goto readnext; + + /* insert concatenation operator when needed */ + debug_printf_parse("%s: concat_inserted if all nonzero: %x %x %x %x\n", __func__, +- (last_token_class & TC_CONCAT1), (tc & TC_CONCAT2), (expected & TC_BINOP), ++ (last_token_class & TS_CONCAT_L), (tc & TS_CONCAT_R), (expected & TS_BINOP), + !(last_token_class == TC_LENGTH && tc == TC_SEQSTART)); +- if ((last_token_class & TC_CONCAT1) && (tc & TC_CONCAT2) && (expected & TC_BINOP) ++ if ((last_token_class & TS_CONCAT_L) && (tc & TS_CONCAT_R) && (expected & TS_BINOP) + && !(last_token_class == TC_LENGTH && tc == TC_SEQSTART) /* but not for "length(..." */ + ) { + concat_inserted = TRUE; +@@ -1317,7 +1317,7 @@ static node *parse_expr(uint32_t term_tc) + node sn; + node *cn = &sn; + node *vn, *glptr; +- uint32_t tc, xtc; ++ uint32_t tc, expected_tc; + var *v; + + debug_printf_parse("%s() term_tc(%x):", __func__, term_tc); +@@ -1326,20 +1326,20 @@ static node *parse_expr(uint32_t term_tc) + + sn.info = PRIMASK; + sn.r.n = sn.a.n = glptr = NULL; +- xtc = TC_OPERAND | TC_UOPPRE | TC_REGEXP | term_tc; ++ expected_tc = TS_OPERAND | TS_UOPPRE | TC_REGEXP | term_tc; + +- while (!((tc = next_token(xtc)) & term_tc)) { ++ while (!((tc = next_token(expected_tc)) & term_tc)) { + + if (glptr && (t_info == TI_LESS)) { + /* input redirection (<) attached to glptr node */ + debug_printf_parse("%s: input redir\n", __func__); + cn = glptr->l.n = new_node(OC_CONCAT | SS | P(37)); + cn->a.n = glptr; +- xtc = TC_OPERAND | TC_UOPPRE; ++ expected_tc = TS_OPERAND | TS_UOPPRE; + glptr = NULL; + +- } else if (tc & (TC_BINOP | TC_UOPPOST)) { +- debug_printf_parse("%s: TC_BINOP | TC_UOPPOST tc:%x\n", __func__, tc); ++ } else if (tc & (TS_BINOP | TC_UOPPOST)) { ++ debug_printf_parse("%s: TS_BINOP | TC_UOPPOST tc:%x\n", __func__, tc); + /* for binary and postfix-unary 
operators, jump back over + * previous operators with higher priority */ + vn = cn; +@@ -1353,19 +1353,19 @@ static node *parse_expr(uint32_t term_tc) + t_info += P(6); + cn = vn->a.n->r.n = new_node(t_info); + cn->a.n = vn->a.n; +- if (tc & TC_BINOP) { ++ if (tc & TS_BINOP) { + cn->l.n = vn; +- xtc = TC_OPERAND | TC_UOPPRE | TC_REGEXP; ++ expected_tc = TS_OPERAND | TS_UOPPRE | TC_REGEXP; + if ((t_info & OPCLSMASK) == OC_PGETLINE) { + /* it's a pipe */ + next_token(TC_GETLINE); + /* give maximum priority to this pipe */ + cn->info &= ~PRIMASK; +- xtc = TC_OPERAND | TC_UOPPRE | TC_BINOP | term_tc; ++ expected_tc = TS_OPERAND | TS_UOPPRE | TS_BINOP | term_tc; + } + } else { + cn->r.n = vn; +- xtc = TC_OPERAND | TC_UOPPRE | TC_BINOP | term_tc; ++ expected_tc = TS_OPERAND | TS_UOPPRE | TS_BINOP | term_tc; + } + vn->a.n = cn; + +@@ -1377,14 +1377,14 @@ static node *parse_expr(uint32_t term_tc) + cn = vn->r.n = new_node(t_info); + cn->a.n = vn; + +- xtc = TC_OPERAND | TC_UOPPRE | TC_REGEXP; ++ expected_tc = TS_OPERAND | TS_UOPPRE | TC_REGEXP; + if (t_info == TI_PREINC || t_info == TI_PREDEC) +- xtc = TC_LVALUE | TC_UOPPRE1; +- if (tc & (TC_OPERAND | TC_REGEXP)) { +- debug_printf_parse("%s: TC_OPERAND | TC_REGEXP\n", __func__); +- xtc = TC_UOPPRE | TC_UOPPOST | TC_BINOP | TC_OPERAND | term_tc; ++ expected_tc = TS_LVALUE | TC_UOPPRE1; ++ if (tc & (TS_OPERAND | TC_REGEXP)) { ++ debug_printf_parse("%s: TS_OPERAND | TC_REGEXP\n", __func__); ++ expected_tc = TS_UOPPRE | TC_UOPPOST | TS_BINOP | TS_OPERAND | term_tc; + /* one should be very careful with switch on tclass - +- * only simple tclasses should be used! 
*/ ++ * only simple tclasses should be used (TC_xyz, not TS_xyz) */ + switch (tc) { + case TC_VARIABLE: + case TC_ARRAY: +@@ -1412,7 +1412,7 @@ static node *parse_expr(uint32_t term_tc) + setvar_i(v, t_double); + else { + setvar_s(v, t_string); +- xtc &= ~TC_UOPPOST; /* "str"++ is not allowed */ ++ expected_tc &= ~TC_UOPPOST; /* "str"++ is not allowed */ + } + break; + +@@ -1439,7 +1439,7 @@ static node *parse_expr(uint32_t term_tc) + case TC_GETLINE: + debug_printf_parse("%s: TC_GETLINE\n", __func__); + glptr = cn; +- xtc = TC_OPERAND | TC_UOPPRE | TC_BINOP | term_tc; ++ expected_tc = TS_OPERAND | TS_UOPPRE | TS_BINOP | term_tc; + break; + + case TC_BUILTIN: +@@ -1450,7 +1450,7 @@ static node *parse_expr(uint32_t term_tc) + case TC_LENGTH: + debug_printf_parse("%s: TC_LENGTH\n", __func__); + next_token(TC_SEQSTART /* length(...) */ +- | TC_OPTERM /* length; (or newline)*/ ++ | TS_OPTERM /* length; (or newline)*/ + | TC_GRPTERM /* length } */ + | TC_BINOPX /* length NUM */ + | TC_COMMA /* print length, 1 */ +@@ -1464,7 +1464,7 @@ static node *parse_expr(uint32_t term_tc) + } + } + } +- } ++ } /* while() */ + + debug_printf_parse("%s() returns %p\n", __func__, sn.r.n); + return sn.r.n; +@@ -1497,7 +1497,7 @@ static void chain_expr(uint32_t info) + + n = chain_node(info); + +- n->l.n = parse_expr(TC_OPTERM | TC_GRPTERM); ++ n->l.n = parse_expr(TS_OPTERM | TC_GRPTERM); + if ((info & OF_REQUIRED) && !n->l.n) + syntax_error(EMSG_TOO_FEW_ARGS); + +@@ -1535,12 +1535,12 @@ static void chain_group(void) + node *n, *n2, *n3; + + do { +- c = next_token(TC_GRPSEQ); ++ c = next_token(TS_GRPSEQ); + } while (c & TC_NEWLINE); + + if (c & TC_GRPSTART) { + debug_printf_parse("%s: TC_GRPSTART\n", __func__); +- while (next_token(TC_GRPSEQ | TC_GRPTERM) != TC_GRPTERM) { ++ while (next_token(TS_GRPSEQ | TC_GRPTERM) != TC_GRPTERM) { + debug_printf_parse("%s: !TC_GRPTERM\n", __func__); + if (t_tclass & TC_NEWLINE) + continue; +@@ -1548,13 +1548,13 @@ static void chain_group(void) + 
chain_group(); + } + debug_printf_parse("%s: TC_GRPTERM\n", __func__); +- } else if (c & (TC_OPSEQ | TC_OPTERM)) { +- debug_printf_parse("%s: TC_OPSEQ | TC_OPTERM\n", __func__); ++ } else if (c & (TS_OPSEQ | TS_OPTERM)) { ++ debug_printf_parse("%s: TS_OPSEQ | TS_OPTERM\n", __func__); + rollback_token(); + chain_expr(OC_EXEC | Vx); + } else { +- /* TC_STATEMNT */ +- debug_printf_parse("%s: TC_STATEMNT(?)\n", __func__); ++ /* TS_STATEMNT */ ++ debug_printf_parse("%s: TS_STATEMNT(?)\n", __func__); + switch (t_info & OPCLSMASK) { + case ST_IF: + debug_printf_parse("%s: ST_IF\n", __func__); +@@ -1563,7 +1563,7 @@ static void chain_group(void) + chain_group(); + n2 = chain_node(OC_EXEC); + n->r.n = seq->last; +- if (next_token(TC_GRPSEQ | TC_GRPTERM | TC_ELSE) == TC_ELSE) { ++ if (next_token(TS_GRPSEQ | TC_GRPTERM | TC_ELSE) == TC_ELSE) { + chain_group(); + n2->a.n = seq->last; + } else { +@@ -1616,10 +1616,10 @@ static void chain_group(void) + case OC_PRINTF: + debug_printf_parse("%s: OC_PRINT[F]\n", __func__); + n = chain_node(t_info); +- n->l.n = parse_expr(TC_OPTERM | TC_OUTRDR | TC_GRPTERM); ++ n->l.n = parse_expr(TS_OPTERM | TC_OUTRDR | TC_GRPTERM); + if (t_tclass & TC_OUTRDR) { + n->info |= t_info; +- n->r.n = parse_expr(TC_OPTERM | TC_GRPTERM); ++ n->r.n = parse_expr(TS_OPTERM | TC_GRPTERM); + } + if (t_tclass & TC_GRPTERM) + rollback_token(); +@@ -1658,11 +1658,11 @@ static void parse_program(char *p) + + g_pos = p; + t_lineno = 1; +- while ((tclass = next_token(TC_EOF | TC_OPSEQ | TC_GRPSTART | +- TC_OPTERM | TC_BEGIN | TC_END | TC_FUNCDECL)) != TC_EOF) { ++ while ((tclass = next_token(TC_EOF | TS_OPSEQ | TC_GRPSTART | ++ TS_OPTERM | TC_BEGIN | TC_END | TC_FUNCDECL)) != TC_EOF) { + +- if (tclass & TC_OPTERM) { +- debug_printf_parse("%s: TC_OPTERM\n", __func__); ++ if (tclass & TS_OPTERM) { ++ debug_printf_parse("%s: TS_OPTERM\n", __func__); + continue; + } + +@@ -1706,11 +1706,11 @@ static void parse_program(char *p) + seq = &f->body; + chain_group(); + 
clear_array(ahash); +- } else if (tclass & TC_OPSEQ) { +- debug_printf_parse("%s: TC_OPSEQ\n", __func__); ++ } else if (tclass & TS_OPSEQ) { ++ debug_printf_parse("%s: TS_OPSEQ\n", __func__); + rollback_token(); + cn = chain_node(OC_TEST); +- cn->l.n = parse_expr(TC_OPTERM | TC_EOF | TC_GRPSTART); ++ cn->l.n = parse_expr(TS_OPTERM | TC_EOF | TC_GRPSTART); + if (t_tclass & TC_GRPSTART) { + debug_printf_parse("%s: TC_GRPSTART\n", __func__); + rollback_token(); +-- +2.27.0 + + +From 01cbacb45972e14aa3072bf539c391dd03ed3955 Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Tue, 29 Jun 2021 01:30:49 +0200 +Subject: [PATCH 10/61] awk: deindent code block, no code changes + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 177 +++++++++++++++++++++++++------------------------- + 1 file changed, 90 insertions(+), 87 deletions(-) + +diff --git a/editors/awk.c b/editors/awk.c +index 764a3dd49..9a3b63df6 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -1337,8 +1337,9 @@ static node *parse_expr(uint32_t term_tc) + cn->a.n = glptr; + expected_tc = TS_OPERAND | TS_UOPPRE; + glptr = NULL; +- +- } else if (tc & (TS_BINOP | TC_UOPPOST)) { ++ continue; ++ } ++ if (tc & (TS_BINOP | TC_UOPPOST)) { + debug_printf_parse("%s: TS_BINOP | TC_UOPPOST tc:%x\n", __func__, tc); + /* for binary and postfix-unary operators, jump back over + * previous operators with higher priority */ +@@ -1368,101 +1369,103 @@ static node *parse_expr(uint32_t term_tc) + expected_tc = TS_OPERAND | TS_UOPPRE | TS_BINOP | term_tc; + } + vn->a.n = cn; ++ continue; ++ } + +- } else { +- debug_printf_parse("%s: other, t_info:%x\n", __func__, t_info); +- /* for operands and prefix-unary operators, attach them +- * to last node */ +- vn = cn; +- cn = vn->r.n = new_node(t_info); +- cn->a.n = vn; ++ debug_printf_parse("%s: other, t_info:%x\n", __func__, t_info); ++ /* for operands and prefix-unary operators, attach them ++ * to last node */ ++ vn = cn; ++ cn = vn->r.n = new_node(t_info); ++ cn->a.n = vn; + 
+- expected_tc = TS_OPERAND | TS_UOPPRE | TC_REGEXP; +- if (t_info == TI_PREINC || t_info == TI_PREDEC) +- expected_tc = TS_LVALUE | TC_UOPPRE1; +- if (tc & (TS_OPERAND | TC_REGEXP)) { +- debug_printf_parse("%s: TS_OPERAND | TC_REGEXP\n", __func__); +- expected_tc = TS_UOPPRE | TC_UOPPOST | TS_BINOP | TS_OPERAND | term_tc; +- /* one should be very careful with switch on tclass - +- * only simple tclasses should be used (TC_xyz, not TS_xyz) */ +- switch (tc) { +- case TC_VARIABLE: +- case TC_ARRAY: +- debug_printf_parse("%s: TC_VARIABLE | TC_ARRAY\n", __func__); +- cn->info = OC_VAR; +- v = hash_search(ahash, t_string); +- if (v != NULL) { +- cn->info = OC_FNARG; +- cn->l.aidx = v->x.aidx; +- } else { +- cn->l.v = newvar(t_string); +- } +- if (tc & TC_ARRAY) { +- cn->info |= xS; +- cn->r.n = parse_expr(TC_ARRTERM); +- } +- break; ++ expected_tc = TS_OPERAND | TS_UOPPRE | TC_REGEXP; ++ if (t_info == TI_PREINC || t_info == TI_PREDEC) ++ expected_tc = TS_LVALUE | TC_UOPPRE1; + +- case TC_NUMBER: +- case TC_STRING: +- debug_printf_parse("%s: TC_NUMBER | TC_STRING\n", __func__); +- cn->info = OC_VAR; +- v = cn->l.v = xzalloc(sizeof(var)); +- if (tc & TC_NUMBER) +- setvar_i(v, t_double); +- else { +- setvar_s(v, t_string); +- expected_tc &= ~TC_UOPPOST; /* "str"++ is not allowed */ +- } +- break; ++ if (!(tc & (TS_OPERAND | TC_REGEXP))) ++ continue; + +- case TC_REGEXP: +- debug_printf_parse("%s: TC_REGEXP\n", __func__); +- mk_re_node(t_string, cn, xzalloc(sizeof(regex_t)*2)); +- break; ++ debug_printf_parse("%s: TS_OPERAND | TC_REGEXP\n", __func__); ++ expected_tc = TS_UOPPRE | TC_UOPPOST | TS_BINOP | TS_OPERAND | term_tc; ++ /* one should be very careful with switch on tclass - ++ * only simple tclasses should be used (TC_xyz, not TS_xyz) */ ++ switch (tc) { ++ case TC_VARIABLE: ++ case TC_ARRAY: ++ debug_printf_parse("%s: TC_VARIABLE | TC_ARRAY\n", __func__); ++ cn->info = OC_VAR; ++ v = hash_search(ahash, t_string); ++ if (v != NULL) { ++ cn->info = OC_FNARG; ++ 
cn->l.aidx = v->x.aidx; ++ } else { ++ cn->l.v = newvar(t_string); ++ } ++ if (tc & TC_ARRAY) { ++ cn->info |= xS; ++ cn->r.n = parse_expr(TC_ARRTERM); ++ } ++ break; + +- case TC_FUNCTION: +- debug_printf_parse("%s: TC_FUNCTION\n", __func__); +- cn->info = OC_FUNC; +- cn->r.f = newfunc(t_string); +- cn->l.n = condition(); +- break; ++ case TC_NUMBER: ++ case TC_STRING: ++ debug_printf_parse("%s: TC_NUMBER | TC_STRING\n", __func__); ++ cn->info = OC_VAR; ++ v = cn->l.v = xzalloc(sizeof(var)); ++ if (tc & TC_NUMBER) ++ setvar_i(v, t_double); ++ else { ++ setvar_s(v, t_string); ++ expected_tc &= ~TC_UOPPOST; /* "str"++ is not allowed */ ++ } ++ break; + +- case TC_SEQSTART: +- debug_printf_parse("%s: TC_SEQSTART\n", __func__); +- cn = vn->r.n = parse_expr(TC_SEQTERM); +- if (!cn) +- syntax_error("Empty sequence"); +- cn->a.n = vn; +- break; ++ case TC_REGEXP: ++ debug_printf_parse("%s: TC_REGEXP\n", __func__); ++ mk_re_node(t_string, cn, xzalloc(sizeof(regex_t)*2)); ++ break; + +- case TC_GETLINE: +- debug_printf_parse("%s: TC_GETLINE\n", __func__); +- glptr = cn; +- expected_tc = TS_OPERAND | TS_UOPPRE | TS_BINOP | term_tc; +- break; ++ case TC_FUNCTION: ++ debug_printf_parse("%s: TC_FUNCTION\n", __func__); ++ cn->info = OC_FUNC; ++ cn->r.f = newfunc(t_string); ++ cn->l.n = condition(); ++ break; + +- case TC_BUILTIN: +- debug_printf_parse("%s: TC_BUILTIN\n", __func__); +- cn->l.n = condition(); +- break; ++ case TC_SEQSTART: ++ debug_printf_parse("%s: TC_SEQSTART\n", __func__); ++ cn = vn->r.n = parse_expr(TC_SEQTERM); ++ if (!cn) ++ syntax_error("Empty sequence"); ++ cn->a.n = vn; ++ break; + +- case TC_LENGTH: +- debug_printf_parse("%s: TC_LENGTH\n", __func__); +- next_token(TC_SEQSTART /* length(...) */ +- | TS_OPTERM /* length; (or newline)*/ +- | TC_GRPTERM /* length } */ +- | TC_BINOPX /* length NUM */ +- | TC_COMMA /* print length, 1 */ +- ); +- rollback_token(); +- if (t_tclass & TC_SEQSTART) { +- /* It was a "(" token. 
Handle just like TC_BUILTIN */ +- cn->l.n = condition(); +- } +- break; +- } ++ case TC_GETLINE: ++ debug_printf_parse("%s: TC_GETLINE\n", __func__); ++ glptr = cn; ++ expected_tc = TS_OPERAND | TS_UOPPRE | TS_BINOP | term_tc; ++ break; ++ ++ case TC_BUILTIN: ++ debug_printf_parse("%s: TC_BUILTIN\n", __func__); ++ cn->l.n = condition(); ++ break; ++ ++ case TC_LENGTH: ++ debug_printf_parse("%s: TC_LENGTH\n", __func__); ++ next_token(TC_SEQSTART /* length(...) */ ++ | TS_OPTERM /* length; (or newline)*/ ++ | TC_GRPTERM /* length } */ ++ | TC_BINOPX /* length NUM */ ++ | TC_COMMA /* print length, 1 */ ++ ); ++ rollback_token(); ++ if (t_tclass & TC_SEQSTART) { ++ /* It was a "(" token. Handle just like TC_BUILTIN */ ++ cn->l.n = condition(); + } ++ break; + } + } /* while() */ + +-- +2.27.0 + + +From acea2fffaa696b855d5189a8a1cd7591fac8891d Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Tue, 29 Jun 2021 01:50:47 +0200 +Subject: [PATCH 11/61] awk: rename TC_SEQSTART/END to L/RPAREN, no code + changes + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 94 +++++++++++++++++++++++++-------------------------- + 1 file changed, 47 insertions(+), 47 deletions(-) + +diff --git a/editors/awk.c b/editors/awk.c +index 9a3b63df6..d31b97d86 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -207,48 +207,48 @@ typedef struct tsplitter_s { + } tsplitter; + + /* simple token classes */ +-/* Order and hex values are very important!!! See next_token() */ +-#define TC_SEQSTART (1 << 0) /* ( */ +-#define TC_SEQTERM (1 << 1) /* ) */ ++/* order and hex values are very important!!! 
See next_token() */ ++#define TC_LPAREN (1 << 0) /* ( */ ++#define TC_RPAREN (1 << 1) /* ) */ + #define TC_REGEXP (1 << 2) /* /.../ */ + #define TC_OUTRDR (1 << 3) /* | > >> */ + #define TC_UOPPOST (1 << 4) /* unary postfix operator ++ -- */ + #define TC_UOPPRE1 (1 << 5) /* unary prefix operator ++ -- $ */ + #define TC_BINOPX (1 << 6) /* two-opnd operator */ +-#define TC_IN (1 << 7) +-#define TC_COMMA (1 << 8) +-#define TC_PIPE (1 << 9) /* input redirection pipe */ ++#define TC_IN (1 << 7) /* 'in' */ ++#define TC_COMMA (1 << 8) /* , */ ++#define TC_PIPE (1 << 9) /* input redirection pipe | */ + #define TC_UOPPRE2 (1 << 10) /* unary prefix operator + - ! */ + #define TC_ARRTERM (1 << 11) /* ] */ + #define TC_GRPSTART (1 << 12) /* { */ + #define TC_GRPTERM (1 << 13) /* } */ +-#define TC_SEMICOL (1 << 14) ++#define TC_SEMICOL (1 << 14) /* ; */ + #define TC_NEWLINE (1 << 15) + #define TC_STATX (1 << 16) /* ctl statement (for, next...) */ +-#define TC_WHILE (1 << 17) +-#define TC_ELSE (1 << 18) ++#define TC_WHILE (1 << 17) /* 'while' */ ++#define TC_ELSE (1 << 18) /* 'else' */ + #define TC_BUILTIN (1 << 19) + /* This costs ~50 bytes of code. + * A separate class to support deprecated "length" form. If we don't need that + * (i.e. 
if we demand that only "length()" with () is valid), then TC_LENGTH + * can be merged with TC_BUILTIN: + */ +-#define TC_LENGTH (1 << 20) +-#define TC_GETLINE (1 << 21) ++#define TC_LENGTH (1 << 20) /* 'length' */ ++#define TC_GETLINE (1 << 21) /* 'getline' */ + #define TC_FUNCDECL (1 << 22) /* 'function' 'func' */ +-#define TC_BEGIN (1 << 23) +-#define TC_END (1 << 24) ++#define TC_BEGIN (1 << 23) /* 'BEGIN' */ ++#define TC_END (1 << 24) /* 'END' */ + #define TC_EOF (1 << 25) +-#define TC_VARIABLE (1 << 26) +-#define TC_ARRAY (1 << 27) +-#define TC_FUNCTION (1 << 28) +-#define TC_STRING (1 << 29) ++#define TC_VARIABLE (1 << 26) /* name */ ++#define TC_ARRAY (1 << 27) /* name[ */ ++#define TC_FUNCTION (1 << 28) /* name( - but unlike TC_ARRAY, parser does not consume '(' */ ++#define TC_STRING (1 << 29) /* "..." */ + #define TC_NUMBER (1 << 30) + + #ifndef debug_parse_print_tc + #define debug_parse_print_tc(n) do { \ +-if ((n) & TC_SEQSTART) debug_printf_parse(" SEQSTART"); \ +-if ((n) & TC_SEQTERM ) debug_printf_parse(" SEQTERM" ); \ ++if ((n) & TC_LPAREN ) debug_printf_parse(" LPAREN" ); \ ++if ((n) & TC_RPAREN ) debug_printf_parse(" RPAREN" ); \ + if ((n) & TC_REGEXP ) debug_printf_parse(" REGEXP" ); \ + if ((n) & TC_OUTRDR ) debug_printf_parse(" OUTRDR" ); \ + if ((n) & TC_UOPPOST ) debug_printf_parse(" UOPPOST" ); \ +@@ -288,7 +288,7 @@ if ((n) & TC_NUMBER ) debug_printf_parse(" NUMBER" ); \ + //#define TS_UNARYOP (TS_UOPPRE | TC_UOPPOST) + #define TS_OPERAND (TC_VARIABLE | TC_ARRAY | TC_FUNCTION \ + | TC_BUILTIN | TC_LENGTH | TC_GETLINE \ +- | TC_SEQSTART | TC_STRING | TC_NUMBER) ++ | TC_LPAREN | TC_STRING | TC_NUMBER) + + #define TS_LVALUE (TC_VARIABLE | TC_ARRAY) + #define TS_STATEMNT (TC_STATX | TC_WHILE) +@@ -310,7 +310,7 @@ if ((n) & TC_NUMBER ) debug_printf_parse(" NUMBER" ); \ + + /* if previous token class is CONCAT_L and next is CONCAT_R, concatenation */ + /* operator is inserted between them */ +-#define TS_CONCAT_L (TC_VARIABLE | TC_ARRTERM | 
TC_SEQTERM \ ++#define TS_CONCAT_L (TC_VARIABLE | TC_ARRTERM | TC_RPAREN \ + | TC_STRING | TC_NUMBER | TC_UOPPOST \ + | TC_LENGTH) + #define TS_CONCAT_R (TS_OPERAND | TS_UOPPRE) +@@ -394,8 +394,8 @@ enum { + #define NTCC '\377' + + static const char tokenlist[] ALIGN1 = +- "\1(" NTC /* TC_SEQSTART */ +- "\1)" NTC /* TC_SEQTERM */ ++ "\1(" NTC /* TC_LPAREN */ ++ "\1)" NTC /* TC_RPAREN */ + "\1/" NTC /* TC_REGEXP */ + "\2>>" "\1>" "\1|" NTC /* TC_OUTRDR */ + "\2++" "\2--" NTC /* TC_UOPPOST */ +@@ -1250,9 +1250,9 @@ static uint32_t next_token(uint32_t expected) + /* insert concatenation operator when needed */ + debug_printf_parse("%s: concat_inserted if all nonzero: %x %x %x %x\n", __func__, + (last_token_class & TS_CONCAT_L), (tc & TS_CONCAT_R), (expected & TS_BINOP), +- !(last_token_class == TC_LENGTH && tc == TC_SEQSTART)); ++ !(last_token_class == TC_LENGTH && tc == TC_LPAREN)); + if ((last_token_class & TS_CONCAT_L) && (tc & TS_CONCAT_R) && (expected & TS_BINOP) +- && !(last_token_class == TC_LENGTH && tc == TC_SEQSTART) /* but not for "length(..." */ ++ && !(last_token_class == TC_LENGTH && tc == TC_LPAREN) /* but not for "length(..." 
*/ + ) { + concat_inserted = TRUE; + save_tclass = tc; +@@ -1304,10 +1304,10 @@ static void mk_re_node(const char *s, node *n, regex_t *re) + xregcomp(re + 1, s, REG_EXTENDED | REG_ICASE); + } + +-static node *condition(void) ++static node *parse_lrparen_list(void) + { +- next_token(TC_SEQSTART); +- return parse_expr(TC_SEQTERM); ++ next_token(TC_LPAREN); ++ return parse_expr(TC_RPAREN); + } + + /* parse expression terminated by given argument, return ptr +@@ -1430,12 +1430,12 @@ static node *parse_expr(uint32_t term_tc) + debug_printf_parse("%s: TC_FUNCTION\n", __func__); + cn->info = OC_FUNC; + cn->r.f = newfunc(t_string); +- cn->l.n = condition(); ++ cn->l.n = parse_lrparen_list(); + break; + +- case TC_SEQSTART: +- debug_printf_parse("%s: TC_SEQSTART\n", __func__); +- cn = vn->r.n = parse_expr(TC_SEQTERM); ++ case TC_LPAREN: ++ debug_printf_parse("%s: TC_LPAREN\n", __func__); ++ cn = vn->r.n = parse_expr(TC_RPAREN); + if (!cn) + syntax_error("Empty sequence"); + cn->a.n = vn; +@@ -1449,21 +1449,21 @@ static node *parse_expr(uint32_t term_tc) + + case TC_BUILTIN: + debug_printf_parse("%s: TC_BUILTIN\n", __func__); +- cn->l.n = condition(); ++ cn->l.n = parse_lrparen_list(); + break; + + case TC_LENGTH: + debug_printf_parse("%s: TC_LENGTH\n", __func__); +- next_token(TC_SEQSTART /* length(...) */ ++ next_token(TC_LPAREN /* length(...) */ + | TS_OPTERM /* length; (or newline)*/ + | TC_GRPTERM /* length } */ + | TC_BINOPX /* length NUM */ + | TC_COMMA /* print length, 1 */ + ); + rollback_token(); +- if (t_tclass & TC_SEQSTART) { ++ if (t_tclass & TC_LPAREN) { + /* It was a "(" token. 
Handle just like TC_BUILTIN */ +- cn->l.n = condition(); ++ cn->l.n = parse_lrparen_list(); + } + break; + } +@@ -1562,7 +1562,7 @@ static void chain_group(void) + case ST_IF: + debug_printf_parse("%s: ST_IF\n", __func__); + n = chain_node(OC_BR | Vx); +- n->l.n = condition(); ++ n->l.n = parse_lrparen_list(); + chain_group(); + n2 = chain_node(OC_EXEC); + n->r.n = seq->last; +@@ -1576,7 +1576,7 @@ static void chain_group(void) + + case ST_WHILE: + debug_printf_parse("%s: ST_WHILE\n", __func__); +- n2 = condition(); ++ n2 = parse_lrparen_list(); + n = chain_loop(NULL); + n->l.n = n2; + break; +@@ -1587,14 +1587,14 @@ static void chain_group(void) + n = chain_loop(NULL); + n2->a.n = n->a.n; + next_token(TC_WHILE); +- n->l.n = condition(); ++ n->l.n = parse_lrparen_list(); + break; + + case ST_FOR: + debug_printf_parse("%s: ST_FOR\n", __func__); +- next_token(TC_SEQSTART); +- n2 = parse_expr(TC_SEMICOL | TC_SEQTERM); +- if (t_tclass & TC_SEQTERM) { /* for-in */ ++ next_token(TC_LPAREN); ++ n2 = parse_expr(TC_SEMICOL | TC_RPAREN); ++ if (t_tclass & TC_RPAREN) { /* for-in */ + if (!n2 || (n2->info & OPCLSMASK) != OC_IN) + syntax_error(EMSG_UNEXP_TOKEN); + n = chain_node(OC_WALKINIT | VV); +@@ -1607,7 +1607,7 @@ static void chain_group(void) + n = chain_node(OC_EXEC | Vx); + n->l.n = n2; + n2 = parse_expr(TC_SEMICOL); +- n3 = parse_expr(TC_SEQTERM); ++ n3 = parse_expr(TC_RPAREN); + n = chain_loop(n3); + n->l.n = n2; + if (!n2) +@@ -1686,13 +1686,13 @@ static void parse_program(char *p) + f->body.first = NULL; + f->nargs = 0; + /* Match func arg list: a comma sep list of >= 0 args, and a close paren */ +- while (next_token(TC_VARIABLE | TC_SEQTERM | TC_COMMA)) { ++ while (next_token(TC_VARIABLE | TC_RPAREN | TC_COMMA)) { + /* Either an empty arg list, or trailing comma from prev iter + * must be followed by an arg */ +- if (f->nargs == 0 && t_tclass == TC_SEQTERM) ++ if (f->nargs == 0 && t_tclass == TC_RPAREN) + break; + +- /* TC_SEQSTART/TC_COMMA must be followed by 
TC_VARIABLE */ ++ /* TC_LPAREN/TC_COMMA must be followed by TC_VARIABLE */ + if (t_tclass != TC_VARIABLE) + syntax_error(EMSG_UNEXP_TOKEN); + +@@ -1700,7 +1700,7 @@ static void parse_program(char *p) + v->x.aidx = f->nargs++; + + /* Arg followed either by end of arg list or 1 comma */ +- if (next_token(TC_COMMA | TC_SEQTERM) & TC_SEQTERM) ++ if (next_token(TC_COMMA | TC_RPAREN) & TC_RPAREN) + break; + //Impossible: next_token() above would error out and die + // if (t_tclass != TC_COMMA) +-- +2.27.0 + + +From 100c649a6d5b8085be19fdcbf02218cf2bcb3cae Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Tue, 29 Jun 2021 02:32:32 +0200 +Subject: [PATCH 12/61] awk: simplify parsing of function declaration + +function old new delta +parse_program 328 313 -15 + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 26 ++++++++++---------------- + 1 file changed, 10 insertions(+), 16 deletions(-) + +diff --git a/editors/awk.c b/editors/awk.c +index d31b97d86..08ff02adb 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -769,7 +769,7 @@ static void hash_remove(xhash *hash, const char *name) + + static char *skip_spaces(char *p) + { +- while (1) { ++ for (;;) { + if (*p == '\\' && p[1] == '\n') { + p++; + t_lineno++; +@@ -1685,26 +1685,20 @@ static void parse_program(char *p) + f = newfunc(t_string); + f->body.first = NULL; + f->nargs = 0; +- /* Match func arg list: a comma sep list of >= 0 args, and a close paren */ +- while (next_token(TC_VARIABLE | TC_RPAREN | TC_COMMA)) { +- /* Either an empty arg list, or trailing comma from prev iter +- * must be followed by an arg */ +- if (f->nargs == 0 && t_tclass == TC_RPAREN) +- break; +- +- /* TC_LPAREN/TC_COMMA must be followed by TC_VARIABLE */ +- if (t_tclass != TC_VARIABLE) ++ /* func arg list: comma sep list of args, and a close paren */ ++ for (;;) { ++ if (next_token(TC_VARIABLE | TC_RPAREN) == TC_RPAREN) { ++ if (f->nargs == 0) ++ break; /* func() is ok */ ++ /* func(a,) is not ok */ + syntax_error(EMSG_UNEXP_TOKEN); 
+- ++ } + v = findvar(ahash, t_string); + v->x.aidx = f->nargs++; +- + /* Arg followed either by end of arg list or 1 comma */ +- if (next_token(TC_COMMA | TC_RPAREN) & TC_RPAREN) ++ if (next_token(TC_COMMA | TC_RPAREN) == TC_RPAREN) + break; +-//Impossible: next_token() above would error out and die +-// if (t_tclass != TC_COMMA) +-// syntax_error(EMSG_UNEXP_TOKEN); ++ /* it was a comma, we ate it */ + } + seq = &f->body; + chain_group(); +-- +2.27.0 + + +From 38cbb39458b554d5bcfb5d326dd235f81e3c9b9d Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Tue, 29 Jun 2021 02:43:02 +0200 +Subject: [PATCH 13/61] awk: g_buf[] does not need a separate allocation + +function old new delta +exec_builtin 1400 1414 +14 +evaluate 3132 3141 +9 +getvar_s 121 125 +4 +awk_main 902 886 -16 +------------------------------------------------------------------------------ +(add/remove: 0/0 grow/shrink: 3/1 up/down: 27/-16) Total: 11 bytes + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 8 +++----- + 1 file changed, 3 insertions(+), 5 deletions(-) + +diff --git a/editors/awk.c b/editors/awk.c +index 08ff02adb..7e4f0d142 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -535,7 +535,6 @@ struct globals { + var *Fields; + nvblock *g_cb; + char *g_pos; +- char *g_buf; + smallint icase; + smallint exiting; + smallint nextrec; +@@ -571,6 +570,8 @@ struct globals2 { + + /* biggest and least used members go last */ + tsplitter fsplitter, rsplitter; ++ ++ char g_buf[MAXVARFMT + 1]; + }; + #define G1 (ptr_to_globals[-1]) + #define G (*(struct globals2 *)ptr_to_globals) +@@ -598,7 +599,6 @@ struct globals2 { + #define Fields (G1.Fields ) + #define g_cb (G1.g_cb ) + #define g_pos (G1.g_pos ) +-#define g_buf (G1.g_buf ) + #define icase (G1.icase ) + #define exiting (G1.exiting ) + #define nextrec (G1.nextrec ) +@@ -612,6 +612,7 @@ struct globals2 { + #define intvar (G.intvar ) + #define fsplitter (G.fsplitter ) + #define rsplitter (G.rsplitter ) ++#define g_buf (G.g_buf ) + #define 
INIT_G() do { \ + SET_PTR_TO_GLOBALS((char*)xzalloc(sizeof(G1)+sizeof(G)) + sizeof(G1)); \ + t_tclass = TS_OPTERM; \ +@@ -3353,9 +3354,6 @@ int awk_main(int argc UNUSED_PARAM, char **argv) + if (ENABLE_LOCALE_SUPPORT) + setlocale(LC_NUMERIC, "C"); + +- /* allocate global buffer */ +- g_buf = xmalloc(MAXVARFMT + 1); +- + vhash = hash_init(); + ahash = hash_init(); + fdhash = hash_init(); +-- +2.27.0 + + +From 743b012550834fe032bdc71257e646e202eac2b2 Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Tue, 29 Jun 2021 03:02:21 +0200 +Subject: [PATCH 14/61] awk: when parsing TC_FUNCTION token, eat its opening + '(' + +...like we do for array references. + +function old new delta +parse_expr 938 948 +10 +next_token 788 791 +3 +parse_program 313 310 -3 +------------------------------------------------------------------------------ +(add/remove: 0/0 grow/shrink: 2/1 up/down: 13/-3) Total: 10 bytes + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 30 +++++++++++++++--------------- + 1 file changed, 15 insertions(+), 15 deletions(-) + +diff --git a/editors/awk.c b/editors/awk.c +index 7e4f0d142..1a4468a53 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -241,7 +241,7 @@ typedef struct tsplitter_s { + #define TC_EOF (1 << 25) + #define TC_VARIABLE (1 << 26) /* name */ + #define TC_ARRAY (1 << 27) /* name[ */ +-#define TC_FUNCTION (1 << 28) /* name( - but unlike TC_ARRAY, parser does not consume '(' */ ++#define TC_FUNCTION (1 << 28) /* name( */ + #define TC_STRING (1 << 29) /* "..." */ + #define TC_NUMBER (1 << 30) + +@@ -959,6 +959,7 @@ static double getvar_i(var *v) + v->number = my_strtod(&s); + debug_printf_eval("%f (s:'%s')\n", v->number, s); + if (v->type & VF_USER) { ++//TODO: skip_spaces() also skips backslash+newline, is it intended here? 
+ s = skip_spaces(s); + if (*s != '\0') + v->type &= ~VF_USER; +@@ -1103,7 +1104,7 @@ static uint32_t next_token(uint32_t expected) + #define save_tclass (G.next_token__save_tclass) + #define save_info (G.next_token__save_info) + +- char *p, *s; ++ char *p; + const char *tl; + const uint32_t *ti; + uint32_t tc, last_token_class; +@@ -1131,15 +1132,12 @@ static uint32_t next_token(uint32_t expected) + while (*p != '\n' && *p != '\0') + p++; + +- if (*p == '\n') +- t_lineno++; +- + if (*p == '\0') { + tc = TC_EOF; + debug_printf_parse("%s: token found: TC_EOF\n", __func__); + } else if (*p == '\"') { + /* it's a string */ +- t_string = s = ++p; ++ char *s = t_string = ++p; + while (*p != '\"') { + char *pp; + if (*p == '\0' || *p == '\n') +@@ -1154,7 +1152,7 @@ static uint32_t next_token(uint32_t expected) + debug_printf_parse("%s: token found:'%s' TC_STRING\n", __func__, t_string); + } else if ((expected & TC_REGEXP) && *p == '/') { + /* it's regexp */ +- t_string = s = ++p; ++ char *s = t_string = ++p; + while (*p != '/') { + if (*p == '\0' || *p == '\n') + syntax_error(EMSG_UNEXP_EOS); +@@ -1185,6 +1183,9 @@ static uint32_t next_token(uint32_t expected) + tc = TC_NUMBER; + debug_printf_parse("%s: token found:%f TC_NUMBER\n", __func__, t_double); + } else { ++ if (*p == '\n') ++ t_lineno++; ++ + /* search for something known */ + tl = tokenlist; + tc = 0x00000001; +@@ -1230,15 +1231,15 @@ static uint32_t next_token(uint32_t expected) + if (!(expected & TC_VARIABLE) || (expected & TC_ARRAY)) + p = skip_spaces(p); + if (*p == '(') { ++ p++; + tc = TC_FUNCTION; + debug_printf_parse("%s: token found:'%s' TC_FUNCTION\n", __func__, t_string); ++ } else if (*p == '[') { ++ p++; ++ tc = TC_ARRAY; ++ debug_printf_parse("%s: token found:'%s' TC_ARRAY\n", __func__, t_string); + } else { +- if (*p == '[') { +- p++; +- tc = TC_ARRAY; +- debug_printf_parse("%s: token found:'%s' TC_ARRAY\n", __func__, t_string); +- } else +- debug_printf_parse("%s: token found:'%s' 
TC_VARIABLE\n", __func__, t_string); ++ debug_printf_parse("%s: token found:'%s' TC_VARIABLE\n", __func__, t_string); + } + } + token_found: +@@ -1431,7 +1432,7 @@ static node *parse_expr(uint32_t term_tc) + debug_printf_parse("%s: TC_FUNCTION\n", __func__); + cn->info = OC_FUNC; + cn->r.f = newfunc(t_string); +- cn->l.n = parse_lrparen_list(); ++ cn->l.n = parse_expr(TC_RPAREN); + break; + + case TC_LPAREN: +@@ -1682,7 +1683,6 @@ static void parse_program(char *p) + } else if (tclass & TC_FUNCDECL) { + debug_printf_parse("%s: TC_FUNCDECL\n", __func__); + next_token(TC_FUNCTION); +- g_pos++; + f = newfunc(t_string); + f->body.first = NULL; + f->nargs = 0; +-- +2.27.0 + + +From f80dfb802b4a0984293d50f80cd41519b109b524 Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Tue, 29 Jun 2021 03:27:07 +0200 +Subject: [PATCH 15/61] awk: get rid of "move name one char back" trick in + next_token() + +function old new delta +next_token 791 812 +21 +awk_main 886 831 -55 +------------------------------------------------------------------------------ +(add/remove: 0/0 grow/shrink: 1/1 up/down: 21/-55) Total: -34 bytes + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 54 +++++++++++++++++++++++++-------------------------- + 1 file changed, 27 insertions(+), 27 deletions(-) + +diff --git a/editors/awk.c b/editors/awk.c +index 1a4468a53..fb1e5d59b 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -535,6 +535,7 @@ struct globals { + var *Fields; + nvblock *g_cb; + char *g_pos; ++ char g_saved_ch; + smallint icase; + smallint exiting; + smallint nextrec; +@@ -599,6 +600,7 @@ struct globals2 { + #define Fields (G1.Fields ) + #define g_cb (G1.g_cb ) + #define g_pos (G1.g_pos ) ++#define g_saved_ch (G1.g_saved_ch ) + #define icase (G1.icase ) + #define exiting (G1.exiting ) + #define nextrec (G1.nextrec ) +@@ -1125,6 +1127,10 @@ static uint32_t next_token(uint32_t expected) + t_info = save_info; + } else { + p = g_pos; ++ if (g_saved_ch != '\0') { ++ *p = g_saved_ch; ++ 
g_saved_ch = '\0'; ++ } + readnext: + p = skip_spaces(p); + g_lineno = t_lineno; +@@ -1183,6 +1189,8 @@ static uint32_t next_token(uint32_t expected) + tc = TC_NUMBER; + debug_printf_parse("%s: token found:%f TC_NUMBER\n", __func__, t_double); + } else { ++ char *end_of_name; ++ + if (*p == '\n') + t_lineno++; + +@@ -1219,16 +1227,14 @@ static uint32_t next_token(uint32_t expected) + if (!isalnum_(*p)) + syntax_error(EMSG_UNEXP_TOKEN); /* no */ + /* yes */ +-/* "move name one char back" trick: we need a byte for NUL terminator */ +-/* NB: this results in argv[i][-1] being used (!!!) in e.g. "awk -e 'NAME'" case */ +- t_string = --p; +- while (isalnum_(*++p)) { +- p[-1] = *p; +- } +- p[-1] = '\0'; ++ t_string = p; ++ while (isalnum_(*p)) ++ p++; ++ end_of_name = p; + tc = TC_VARIABLE; + /* also consume whitespace between functionname and bracket */ + if (!(expected & TC_VARIABLE) || (expected & TC_ARRAY)) ++//TODO: why if variable can be here (but not array ref), skipping is not allowed? Example where it matters? + p = skip_spaces(p); + if (*p == '(') { + p++; +@@ -1240,7 +1246,19 @@ static uint32_t next_token(uint32_t expected) + debug_printf_parse("%s: token found:'%s' TC_ARRAY\n", __func__, t_string); + } else { + debug_printf_parse("%s: token found:'%s' TC_VARIABLE\n", __func__, t_string); ++ if (end_of_name == p) { ++ /* there is no space for trailing NUL in t_string! ++ * We need to save the char we are going to NUL. ++ * (we'll use it in future call to next_token()) ++ */ ++ g_saved_ch = *end_of_name; ++// especially pathological example is V="abc"; V.2 - it's V concatenated to .2 ++// (it evaluates to "abc0.2"). Because of this case, we can't simply cache ++// '.' and analyze it later: we also have to *store it back* in next ++// next_token(), in order to give my_strtod() the undamaged ".2" string. 
++ } + } ++ *end_of_name = '\0'; /* terminate t_string */ + } + token_found: + g_pos = p; +@@ -3420,38 +3438,20 @@ int awk_main(int argc UNUSED_PARAM, char **argv) + + g_progname = llist_pop(&list_f); + fd = xopen_stdin(g_progname); +- /* 1st byte is reserved for "move name one char back" trick in next_token */ +- i = 1; +- s = NULL; +- for (;;) { +- int sz; +- s = xrealloc(s, i + 1000); +- sz = safe_read(fd, s + i, 1000); +- if (sz <= 0) +- break; +- i += sz; +- } +- s = xrealloc(s, i + 1); /* trim unused 999 bytes */ +- s[i] = '\0'; ++ s = xmalloc_read(fd, NULL); /* it's NUL-terminated */ + close(fd); +- parse_program(s + 1); ++ parse_program(s); + free(s); + } + g_progname = "cmd. line"; + #if ENABLE_FEATURE_AWK_GNU_EXTENSIONS + while (list_e) { +- /* NB: "move name one char back" trick in next_token +- * can use argv[i][-1] here. +- */ + parse_program(llist_pop(&list_e)); + } + #endif + if (!(opt & (OPT_f | OPT_e))) { + if (!*argv) + bb_show_usage(); +- /* NB: "move name one char back" trick in next_token +- * can use argv[i][-1] here. 
+- */ + parse_program(*argv++); + } + +-- +2.27.0 + + +From 7fbe3864b057dd6c1ba39d7b5071502c09280767 Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Tue, 29 Jun 2021 03:44:56 +0200 +Subject: [PATCH 16/61] awk: code shrink + +function old new delta +parse_expr 948 945 -3 +chain_expr 65 62 -3 +chain_group 655 649 -6 +parse_program 310 303 -7 +rollback_token 10 - -10 +------------------------------------------------------------------------------ +(add/remove: 0/1 grow/shrink: 0/4 up/down: 0/-29) Total: -29 bytes + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 18 +++++++++++------- + 1 file changed, 11 insertions(+), 7 deletions(-) + +diff --git a/editors/awk.c b/editors/awk.c +index fb1e5d59b..3d1c04a32 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -1300,7 +1300,7 @@ static uint32_t next_token(uint32_t expected) + #undef save_info + } + +-static void rollback_token(void) ++static ALWAYS_INLINE void rollback_token(void) + { + t_rollback = TRUE; + } +@@ -1474,14 +1474,14 @@ static node *parse_expr(uint32_t term_tc) + + case TC_LENGTH: + debug_printf_parse("%s: TC_LENGTH\n", __func__); +- next_token(TC_LPAREN /* length(...) */ ++ tc = next_token(TC_LPAREN /* length(...) */ + | TS_OPTERM /* length; (or newline)*/ + | TC_GRPTERM /* length } */ + | TC_BINOPX /* length NUM */ + | TC_COMMA /* print length, 1 */ + ); + rollback_token(); +- if (t_tclass & TC_LPAREN) { ++ if (tc & TC_LPAREN) { + /* It was a "(" token. 
Handle just like TC_BUILTIN */ + cn->l.n = parse_lrparen_list(); + } +@@ -1563,19 +1563,23 @@ static void chain_group(void) + + if (c & TC_GRPSTART) { + debug_printf_parse("%s: TC_GRPSTART\n", __func__); +- while (next_token(TS_GRPSEQ | TC_GRPTERM) != TC_GRPTERM) { ++ while ((c = next_token(TS_GRPSEQ | TC_GRPTERM)) != TC_GRPTERM) { + debug_printf_parse("%s: !TC_GRPTERM\n", __func__); +- if (t_tclass & TC_NEWLINE) ++ if (c & TC_NEWLINE) + continue; + rollback_token(); + chain_group(); + } + debug_printf_parse("%s: TC_GRPTERM\n", __func__); +- } else if (c & (TS_OPSEQ | TS_OPTERM)) { ++ return; ++ } ++ if (c & (TS_OPSEQ | TS_OPTERM)) { + debug_printf_parse("%s: TS_OPSEQ | TS_OPTERM\n", __func__); + rollback_token(); + chain_expr(OC_EXEC | Vx); +- } else { ++ return; ++ } ++ { + /* TS_STATEMNT */ + debug_printf_parse("%s: TS_STATEMNT(?)\n", __func__); + switch (t_info & OPCLSMASK) { +-- +2.27.0 + + +From 9dba9fae14ec415943d1fda31b6b48d56d5cb0d0 Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Tue, 29 Jun 2021 03:47:46 +0200 +Subject: [PATCH 17/61] awk: deindent a block, no code changes + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 167 +++++++++++++++++++++++++------------------------- + 1 file changed, 83 insertions(+), 84 deletions(-) + +diff --git a/editors/awk.c b/editors/awk.c +index 3d1c04a32..34bcc1798 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -1579,98 +1579,97 @@ static void chain_group(void) + chain_expr(OC_EXEC | Vx); + return; + } +- { +- /* TS_STATEMNT */ +- debug_printf_parse("%s: TS_STATEMNT(?)\n", __func__); +- switch (t_info & OPCLSMASK) { +- case ST_IF: +- debug_printf_parse("%s: ST_IF\n", __func__); +- n = chain_node(OC_BR | Vx); +- n->l.n = parse_lrparen_list(); ++ ++ /* TS_STATEMNT */ ++ debug_printf_parse("%s: TS_STATEMNT(?)\n", __func__); ++ switch (t_info & OPCLSMASK) { ++ case ST_IF: ++ debug_printf_parse("%s: ST_IF\n", __func__); ++ n = chain_node(OC_BR | Vx); ++ n->l.n = parse_lrparen_list(); ++ chain_group(); ++ n2 = 
chain_node(OC_EXEC); ++ n->r.n = seq->last; ++ if (next_token(TS_GRPSEQ | TC_GRPTERM | TC_ELSE) == TC_ELSE) { + chain_group(); +- n2 = chain_node(OC_EXEC); +- n->r.n = seq->last; +- if (next_token(TS_GRPSEQ | TC_GRPTERM | TC_ELSE) == TC_ELSE) { +- chain_group(); +- n2->a.n = seq->last; +- } else { +- rollback_token(); +- } +- break; ++ n2->a.n = seq->last; ++ } else { ++ rollback_token(); ++ } ++ break; + +- case ST_WHILE: +- debug_printf_parse("%s: ST_WHILE\n", __func__); +- n2 = parse_lrparen_list(); +- n = chain_loop(NULL); +- n->l.n = n2; +- break; ++ case ST_WHILE: ++ debug_printf_parse("%s: ST_WHILE\n", __func__); ++ n2 = parse_lrparen_list(); ++ n = chain_loop(NULL); ++ n->l.n = n2; ++ break; + +- case ST_DO: +- debug_printf_parse("%s: ST_DO\n", __func__); +- n2 = chain_node(OC_EXEC); +- n = chain_loop(NULL); +- n2->a.n = n->a.n; +- next_token(TC_WHILE); +- n->l.n = parse_lrparen_list(); +- break; ++ case ST_DO: ++ debug_printf_parse("%s: ST_DO\n", __func__); ++ n2 = chain_node(OC_EXEC); ++ n = chain_loop(NULL); ++ n2->a.n = n->a.n; ++ next_token(TC_WHILE); ++ n->l.n = parse_lrparen_list(); ++ break; + +- case ST_FOR: +- debug_printf_parse("%s: ST_FOR\n", __func__); +- next_token(TC_LPAREN); +- n2 = parse_expr(TC_SEMICOL | TC_RPAREN); +- if (t_tclass & TC_RPAREN) { /* for-in */ +- if (!n2 || (n2->info & OPCLSMASK) != OC_IN) +- syntax_error(EMSG_UNEXP_TOKEN); +- n = chain_node(OC_WALKINIT | VV); +- n->l.n = n2->l.n; +- n->r.n = n2->r.n; +- n = chain_loop(NULL); +- n->info = OC_WALKNEXT | Vx; +- n->l.n = n2->l.n; +- } else { /* for (;;) */ +- n = chain_node(OC_EXEC | Vx); +- n->l.n = n2; +- n2 = parse_expr(TC_SEMICOL); +- n3 = parse_expr(TC_RPAREN); +- n = chain_loop(n3); +- n->l.n = n2; +- if (!n2) +- n->info = OC_EXEC; +- } +- break; ++ case ST_FOR: ++ debug_printf_parse("%s: ST_FOR\n", __func__); ++ next_token(TC_LPAREN); ++ n2 = parse_expr(TC_SEMICOL | TC_RPAREN); ++ if (t_tclass & TC_RPAREN) { /* for-in */ ++ if (!n2 || (n2->info & OPCLSMASK) != OC_IN) ++ 
syntax_error(EMSG_UNEXP_TOKEN); ++ n = chain_node(OC_WALKINIT | VV); ++ n->l.n = n2->l.n; ++ n->r.n = n2->r.n; ++ n = chain_loop(NULL); ++ n->info = OC_WALKNEXT | Vx; ++ n->l.n = n2->l.n; ++ } else { /* for (;;) */ ++ n = chain_node(OC_EXEC | Vx); ++ n->l.n = n2; ++ n2 = parse_expr(TC_SEMICOL); ++ n3 = parse_expr(TC_RPAREN); ++ n = chain_loop(n3); ++ n->l.n = n2; ++ if (!n2) ++ n->info = OC_EXEC; ++ } ++ break; + +- case OC_PRINT: +- case OC_PRINTF: +- debug_printf_parse("%s: OC_PRINT[F]\n", __func__); +- n = chain_node(t_info); +- n->l.n = parse_expr(TS_OPTERM | TC_OUTRDR | TC_GRPTERM); +- if (t_tclass & TC_OUTRDR) { +- n->info |= t_info; +- n->r.n = parse_expr(TS_OPTERM | TC_GRPTERM); +- } +- if (t_tclass & TC_GRPTERM) +- rollback_token(); +- break; ++ case OC_PRINT: ++ case OC_PRINTF: ++ debug_printf_parse("%s: OC_PRINT[F]\n", __func__); ++ n = chain_node(t_info); ++ n->l.n = parse_expr(TS_OPTERM | TC_OUTRDR | TC_GRPTERM); ++ if (t_tclass & TC_OUTRDR) { ++ n->info |= t_info; ++ n->r.n = parse_expr(TS_OPTERM | TC_GRPTERM); ++ } ++ if (t_tclass & TC_GRPTERM) ++ rollback_token(); ++ break; + +- case OC_BREAK: +- debug_printf_parse("%s: OC_BREAK\n", __func__); +- n = chain_node(OC_EXEC); +- n->a.n = break_ptr; +- chain_expr(t_info); +- break; ++ case OC_BREAK: ++ debug_printf_parse("%s: OC_BREAK\n", __func__); ++ n = chain_node(OC_EXEC); ++ n->a.n = break_ptr; ++ chain_expr(t_info); ++ break; + +- case OC_CONTINUE: +- debug_printf_parse("%s: OC_CONTINUE\n", __func__); +- n = chain_node(OC_EXEC); +- n->a.n = continue_ptr; +- chain_expr(t_info); +- break; ++ case OC_CONTINUE: ++ debug_printf_parse("%s: OC_CONTINUE\n", __func__); ++ n = chain_node(OC_EXEC); ++ n->a.n = continue_ptr; ++ chain_expr(t_info); ++ break; + +- /* delete, next, nextfile, return, exit */ +- default: +- debug_printf_parse("%s: default\n", __func__); +- chain_expr(t_info); +- } ++ /* delete, next, nextfile, return, exit */ ++ default: ++ debug_printf_parse("%s: default\n", __func__); ++ 
chain_expr(t_info); + } + } + +-- +2.27.0 + + +From bc9e60546c860c130ed9c312517fbbaf0ad51871 Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Tue, 29 Jun 2021 12:16:36 +0200 +Subject: [PATCH 18/61] awk: fix parsing of expressions such as "v (a)" + +function old new delta +next_token 812 825 +13 + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 22 ++++++++++++++++++---- + testsuite/awk.tests | 11 +++++++++++ + 2 files changed, 29 insertions(+), 4 deletions(-) + +diff --git a/editors/awk.c b/editors/awk.c +index 34bcc1798..ce860dc04 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -1231,11 +1231,24 @@ static uint32_t next_token(uint32_t expected) + while (isalnum_(*p)) + p++; + end_of_name = p; +- tc = TC_VARIABLE; +- /* also consume whitespace between functionname and bracket */ +- if (!(expected & TC_VARIABLE) || (expected & TC_ARRAY)) +-//TODO: why if variable can be here (but not array ref), skipping is not allowed? Example where it matters? ++ ++ if (last_token_class == TC_FUNCDECL) ++ /* eat space in "function FUNC (...) {...}" declaration */ + p = skip_spaces(p); ++ else if (expected & TC_ARRAY) { ++ /* eat space between array name and [ */ ++ char *s = skip_spaces(p); ++ if (*s == '[') /* array ref, not just a name? */ ++ p = s; ++ } ++ /* else: do NOT consume whitespace after variable name! ++ * gawk allows definition "function FUNC (p) {...}" - note space, ++ * but disallows the call "FUNC (p)" because it isn't one - ++ * expression "v (a)" should NOT be parsed as TC_FUNCTION: ++ * it is a valid concatenation if "v" is a variable, ++ * not a function name (and type of name is not known at parse time). 
++ */ ++ + if (*p == '(') { + p++; + tc = TC_FUNCTION; +@@ -1245,6 +1258,7 @@ static uint32_t next_token(uint32_t expected) + tc = TC_ARRAY; + debug_printf_parse("%s: token found:'%s' TC_ARRAY\n", __func__, t_string); + } else { ++ tc = TC_VARIABLE; + debug_printf_parse("%s: token found:'%s' TC_VARIABLE\n", __func__, t_string); + if (end_of_name == p) { + /* there is no space for trailing NUL in t_string! +diff --git a/testsuite/awk.tests b/testsuite/awk.tests +index cf9b722dc..6e35d33dd 100755 +--- a/testsuite/awk.tests ++++ b/testsuite/awk.tests +@@ -71,6 +71,17 @@ testing "awk properly handles undefined function" \ + "L1\n\nawk: cmd. line:5: Call to undefined function\n" \ + "" "" + ++prg=' ++BEGIN { ++ v=1 ++ a=2 ++ print v (a) ++}' ++testing "'v (a)' is not a function call, it is a concatenation" \ ++ "awk '$prg' 2>&1" \ ++ "12\n" \ ++ "" "" ++ + + optional DESKTOP + testing "awk hex const 1" "awk '{ print or(0xffffffff,1) }'" "4294967295\n" "" "\n" +-- +2.27.0 + + +From 08444111ee05f6514bcf6a8c8898ab4e4b827982 Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Tue, 29 Jun 2021 14:33:04 +0200 +Subject: [PATCH 19/61] awk: document which hashes are used at what state + (parse/execute) + +We can free them after they are no longer needed. +(Currently, being a NOEXEC applet is much larger waste of memory +for the case of long-running awk script). 
+ +function old new delta +awk_main 831 827 -4 + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 30 ++++++++++++++++++++---------- + 1 file changed, 20 insertions(+), 10 deletions(-) + +diff --git a/editors/awk.c b/editors/awk.c +index ce860dc04..6142144bb 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -527,7 +527,10 @@ struct globals { + chain *seq; + node *break_ptr, *continue_ptr; + rstream *iF; +- xhash *vhash, *ahash, *fdhash, *fnhash; ++ xhash *ahash; /* argument names, used only while parsing function bodies */ ++ xhash *fnhash; /* function names, used only in parsing stage */ ++ xhash *vhash; /* variables and arrays */ ++ xhash *fdhash; /* file objects, used only in execution stage */ + const char *g_progname; + int g_lineno; + int nfields; +@@ -1719,6 +1722,7 @@ static void parse_program(char *p) + debug_printf_parse("%s: TC_FUNCDECL\n", __func__); + next_token(TC_FUNCTION); + f = newfunc(t_string); ++//FIXME: dup check: functions can't be redefined, this is not ok: awk 'func f(){}; func f(){}' + f->body.first = NULL; + f->nargs = 0; + /* func arg list: comma sep list of args, and a close paren */ +@@ -3389,12 +3393,8 @@ int awk_main(int argc UNUSED_PARAM, char **argv) + if (ENABLE_LOCALE_SUPPORT) + setlocale(LC_NUMERIC, "C"); + +- vhash = hash_init(); +- ahash = hash_init(); +- fdhash = hash_init(); +- fnhash = hash_init(); +- + /* initialize variables */ ++ vhash = hash_init(); + { + char *vnames = (char *)vNames; /* cheat */ + char *vvalues = (char *)vValues; +@@ -3416,10 +3416,6 @@ int awk_main(int argc UNUSED_PARAM, char **argv) + handle_special(intvar[FS]); + handle_special(intvar[RS]); + +- newfile("/dev/stdin")->F = stdin; +- newfile("/dev/stdout")->F = stdout; +- newfile("/dev/stderr")->F = stderr; +- + /* Huh, people report that sometimes environ is NULL. Oh well. 
*/ + if (environ) { + char **envp; +@@ -3449,6 +3445,10 @@ int awk_main(int argc UNUSED_PARAM, char **argv) + if (!is_assignment(llist_pop(&list_v))) + bb_show_usage(); + } ++ ++ /* Parse all supplied programs */ ++ fnhash = hash_init(); ++ ahash = hash_init(); + while (list_f) { + int fd; + char *s; +@@ -3471,6 +3471,11 @@ int awk_main(int argc UNUSED_PARAM, char **argv) + bb_show_usage(); + parse_program(*argv++); + } ++ //free_hash(ahash) // ~250 bytes, arg names, used only during parse of function bodies ++ //ahash = NULL; // debug ++ //free_hash(fnhash) // ~250 bytes, used only for function names ++ //fnhash = NULL; // debug ++ /* parsing done, on to executing */ + + /* fill in ARGV array */ + setari_u(intvar[ARGV], 0, "awk"); +@@ -3479,6 +3484,11 @@ int awk_main(int argc UNUSED_PARAM, char **argv) + setari_u(intvar[ARGV], ++i, *argv++); + setvar_i(intvar[ARGC], i + 1); + ++ fdhash = hash_init(); ++ newfile("/dev/stdin")->F = stdin; ++ newfile("/dev/stdout")->F = stdout; ++ newfile("/dev/stderr")->F = stderr; ++ + zero_out_var(&tv); + evaluate(beginseq.first, &tv); + if (!mainseq.first && !endseq.first) +-- +2.27.0 + + +From ce151c62189985344d90fc554f8780c7305112f8 Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Tue, 29 Jun 2021 18:33:25 +0200 +Subject: [PATCH 20/61] awk: free unused parsing structures after parse is done + +function old new delta +hash_clear - 90 +90 +awk_main 827 849 +22 +clear_array 90 - -90 +------------------------------------------------------------------------------ +(add/remove: 1/1 grow/shrink: 1/0 up/down: 112/-90) Total: 22 bytes + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 74 ++++++++++++++++++++++++++++++++------------------- + 1 file changed, 47 insertions(+), 27 deletions(-) + +diff --git a/editors/awk.c b/editors/awk.c +index 6142144bb..4e29b28cf 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -530,7 +530,8 @@ struct globals { + xhash *ahash; /* argument names, used only while parsing function bodies */ + 
xhash *fnhash; /* function names, used only in parsing stage */ + xhash *vhash; /* variables and arrays */ +- xhash *fdhash; /* file objects, used only in execution stage */ ++ //xhash *fdhash; /* file objects, used only in execution stage */ ++ //we are reusing ahash as fdhash, via define (see later) + const char *g_progname; + int g_lineno; + int nfields; +@@ -592,10 +593,13 @@ struct globals2 { + #define break_ptr (G1.break_ptr ) + #define continue_ptr (G1.continue_ptr) + #define iF (G1.iF ) +-#define vhash (G1.vhash ) + #define ahash (G1.ahash ) +-#define fdhash (G1.fdhash ) + #define fnhash (G1.fnhash ) ++#define vhash (G1.vhash ) ++#define fdhash ahash ++//^^^^^^^^^^^^^^^^^^ ahash is cleared after every function parsing, ++// and ends up empty after parsing phase. Thus, we can simply reuse it ++// for fdhash in execution stage. + #define g_progname (G1.g_progname ) + #define g_lineno (G1.g_lineno ) + #define nfields (G1.nfields ) +@@ -682,6 +686,33 @@ static xhash *hash_init(void) + return newhash; + } + ++static void hash_clear(xhash *hash) ++{ ++ unsigned i; ++ hash_item *hi, *thi; ++ ++ for (i = 0; i < hash->csize; i++) { ++ hi = hash->items[i]; ++ while (hi) { ++ thi = hi; ++ hi = hi->next; ++ free(thi->data.v.string); ++ free(thi); ++ } ++ hash->items[i] = NULL; ++ } ++ hash->glen = hash->nel = 0; ++} ++ ++#if 0 //UNUSED ++static void hash_free(xhash *hash) ++{ ++ hash_clear(hash); ++ free(hash->items); ++ free(hash); ++} ++#endif ++ + /* find item in hash, return ptr to data, NULL if not found */ + static void *hash_search(xhash *hash, const char *name) + { +@@ -869,23 +900,7 @@ static xhash *iamarray(var *v) + return a->x.array; + } + +-static void clear_array(xhash *array) +-{ +- unsigned i; +- hash_item *hi, *thi; +- +- for (i = 0; i < array->csize; i++) { +- hi = array->items[i]; +- while (hi) { +- thi = hi; +- hi = hi->next; +- free(thi->data.v.string); +- free(thi); +- } +- array->items[i] = NULL; +- } +- array->glen = array->nel = 0; +-} 
+#define clear_array(array) hash_clear(array) + + /* clear a variable */ + static var *clrvar(var *v) +@@ -1742,7 +1757,7 @@ static void parse_program(char *p) + } + seq = &f->body; + chain_group(); +- clear_array(ahash); ++ hash_clear(ahash); + } else if (tclass & TS_OPSEQ) { + debug_printf_parse("%s: TS_OPSEQ\n", __func__); + rollback_token(); +@@ -3471,11 +3486,16 @@ int awk_main(int argc UNUSED_PARAM, char **argv) + bb_show_usage(); + parse_program(*argv++); + } +- //free_hash(ahash) // ~250 bytes, arg names, used only during parse of function bodies +- //ahash = NULL; // debug +- //free_hash(fnhash) // ~250 bytes, used only for function names +- //fnhash = NULL; // debug +- /* parsing done, on to executing */ ++ /* Free unused parse structures */ ++ //hash_free(fnhash); // ~250 bytes when empty, used only for function names ++ //^^^^^^^^^^^^^^^^^ does not work, hash_clear() inside SEGVs ++ // (IOW: hash_clear() assumes it's a hash of variables. fnhash is not). ++ free(fnhash->items); ++ free(fnhash); ++ fnhash = NULL; // debug ++ //hash_free(ahash); // empty after parsing, will reuse as fdhash instead of freeing ++ ++ /* Parsing done, on to executing */ + + /* fill in ARGV array */ + setari_u(intvar[ARGV], 0, "awk"); +@@ -3484,7 +3504,7 @@ int awk_main(int argc UNUSED_PARAM, char **argv) + setari_u(intvar[ARGV], ++i, *argv++); + setvar_i(intvar[ARGC], i + 1); + +- fdhash = hash_init(); ++ //fdhash = ahash - done via define + newfile("/dev/stdin")->F = stdin; + newfile("/dev/stdout")->F = stdout; + newfile("/dev/stderr")->F = stderr; +-- +2.27.0 + + +From 465eba0f032c96966d2547f116784fb0d8751943 Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Tue, 29 Jun 2021 19:07:36 +0200 +Subject: [PATCH 21/61] awk: assorted optimizations + +hash_find(): do not calculate hash twice. Do not divide - can use +cheap multiply-by-8 shift. + +nextword(): do not repeatedly increment in-memory value, do it in register, +then store final result.
+ +hashwalk_init(): do not strlen() twice. + +function old new delta +hash_search3 - 49 +49 +hash_find 259 281 +22 +nextword 19 16 -3 +evaluate 3141 3137 -4 +hash_search 54 28 -26 +------------------------------------------------------------------------------ +(add/remove: 1/0 grow/shrink: 1/3 up/down: 71/-33) Total: 38 bytes + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 26 +++++++++++++++++--------- + 1 file changed, 17 insertions(+), 9 deletions(-) + +diff --git a/editors/awk.c b/editors/awk.c +index 4e29b28cf..a4cd3cf93 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -696,6 +696,7 @@ static void hash_clear(xhash *hash) + while (hi) { + thi = hi; + hi = hi->next; ++//FIXME: this assumes that it's a hash of *variables*: + free(thi->data.v.string); + free(thi); + } +@@ -714,11 +715,11 @@ static void hash_free(xhash *hash) + #endif + + /* find item in hash, return ptr to data, NULL if not found */ +-static void *hash_search(xhash *hash, const char *name) ++static NOINLINE void *hash_search3(xhash *hash, const char *name, unsigned idx) + { + hash_item *hi; + +- hi = hash->items[hashidx(name) % hash->csize]; ++ hi = hash->items[idx % hash->csize]; + while (hi) { + if (strcmp(hi->name, name) == 0) + return &hi->data; +@@ -727,6 +728,11 @@ static void *hash_search(xhash *hash, const char *name) + return NULL; + } + ++static void *hash_search(xhash *hash, const char *name) ++{ ++ return hash_search3(hash, name, hashidx(name)); ++} ++ + /* grow hash if it becomes too big */ + static void hash_rebuild(xhash *hash) + { +@@ -762,16 +768,17 @@ static void *hash_find(xhash *hash, const char *name) + unsigned idx; + int l; + +- hi = hash_search(hash, name); ++ idx = hashidx(name); ++ hi = hash_search3(hash, name, idx); + if (!hi) { +- if (++hash->nel / hash->csize > 10) ++ if (++hash->nel > hash->csize * 8) + hash_rebuild(hash); + + l = strlen(name) + 1; + hi = xzalloc(sizeof(*hi) + l); + strcpy(hi->name, name); + +- idx = hashidx(name) % hash->csize; ++ idx = 
idx % hash->csize; + hi->next = hash->items[idx]; + hash->items[idx] = hi; + hash->glen += l; +@@ -822,8 +829,10 @@ static char *skip_spaces(char *p) + static char *nextword(char **s) + { + char *p = *s; +- while (*(*s)++ != '\0') ++ char *q = p; ++ while (*q++ != '\0') + continue; ++ *s = q; + return p; + } + +@@ -2116,8 +2125,7 @@ static void hashwalk_init(var *v, xhash *array) + for (i = 0; i < array->csize; i++) { + hi = array->items[i]; + while (hi) { +- strcpy(w->end, hi->name); +- nextword(&w->end); ++ w->end = stpcpy(w->end, hi->name) + 1; + hi = hi->next; + } + } +@@ -3504,7 +3512,7 @@ int awk_main(int argc UNUSED_PARAM, char **argv) + setari_u(intvar[ARGV], ++i, *argv++); + setvar_i(intvar[ARGC], i + 1); + +- //fdhash = ahash - done via define ++ //fdhash = ahash; // done via define + newfile("/dev/stdin")->F = stdin; + newfile("/dev/stdout")->F = stdout; + newfile("/dev/stderr")->F = stderr; +-- +2.27.0 + + +From 467708ee9c852a4535d554214bb70b916743335a Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Wed, 30 Jun 2021 02:12:27 +0200 +Subject: [PATCH 22/61] awk: remove custom pool allocator for temporary awk + variables + +It seems to be designed to reduce overhead of malloc's auxiliary data, +by allocating at least 64 variables as a block. +With "struct var" being about 20-32 bytes long (32/64 bits), +malloc overhead for one temporary indeed is high, ~33% more memory used +than needed. 
+ +function old new delta +evaluate 3137 3145 +8 +modprobe_main 798 803 +5 +exec_builtin 1414 1419 +5 +awk_printf 476 481 +5 +as_regex 132 137 +5 +EMSG_INTERNAL_ERROR 15 - -15 +nvfree 169 116 -53 +nvalloc 145 - -145 +------------------------------------------------------------------------------ +(add/remove: 0/2 grow/shrink: 5/1 up/down: 28/-213) Total: -185 bytes + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 164 +++++++++++++++++++------------------------------- + 1 file changed, 61 insertions(+), 103 deletions(-) + +diff --git a/editors/awk.c b/editors/awk.c +index a4cd3cf93..35c11ec58 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -93,7 +93,6 @@ enum { + }; + + #define MAXVARFMT 240 +-#define MINNVBLOCK 64 + + /* variable flags */ + #define VF_NUMBER 0x0001 /* 1 = primary type is number */ +@@ -120,8 +119,8 @@ typedef struct walker_list { + /* Variable */ + typedef struct var_s { + unsigned type; /* flags */ +- double number; + char *string; ++ double number; + union { + int aidx; /* func arg idx (for compilation stage) */ + struct xhash_s *array; /* array ptr */ +@@ -192,15 +191,6 @@ typedef struct node_s { + } a; + } node; + +-/* Block of temporary variables */ +-typedef struct nvblock_s { +- int size; +- var *pos; +- struct nvblock_s *prev; +- struct nvblock_s *next; +- var nv[]; +-} nvblock; +- + typedef struct tsplitter_s { + node n; + regex_t re[2]; +@@ -537,7 +527,6 @@ struct globals { + int nfields; + int maxfields; /* used in fsrealloc() only */ + var *Fields; +- nvblock *g_cb; + char *g_pos; + char g_saved_ch; + smallint icase; +@@ -605,7 +594,6 @@ struct globals2 { + #define nfields (G1.nfields ) + #define maxfields (G1.maxfields ) + #define Fields (G1.Fields ) +-#define g_cb (G1.g_cb ) + #define g_pos (G1.g_pos ) + #define g_saved_ch (G1.g_saved_ch ) + #define icase (G1.icase ) +@@ -640,7 +628,6 @@ static int awk_exit(int) NORETURN; + + /* ---- error handling ---- */ + +-static const char EMSG_INTERNAL_ERROR[] ALIGN1 = "Internal 
error"; + static const char EMSG_UNEXP_EOS[] ALIGN1 = "Unexpected end of string"; + static const char EMSG_UNEXP_TOKEN[] ALIGN1 = "Unexpected token"; + static const char EMSG_DIV_BY_ZERO[] ALIGN1 = "Division by zero"; +@@ -1050,77 +1037,6 @@ static int istrue(var *v) + return (v->string && v->string[0]); + } + +-/* temporary variables allocator. Last allocated should be first freed */ +-static var *nvalloc(int n) +-{ +- nvblock *pb = NULL; +- var *v, *r; +- int size; +- +- while (g_cb) { +- pb = g_cb; +- if ((g_cb->pos - g_cb->nv) + n <= g_cb->size) +- break; +- g_cb = g_cb->next; +- } +- +- if (!g_cb) { +- size = (n <= MINNVBLOCK) ? MINNVBLOCK : n; +- g_cb = xzalloc(sizeof(nvblock) + size * sizeof(var)); +- g_cb->size = size; +- g_cb->pos = g_cb->nv; +- g_cb->prev = pb; +- /*g_cb->next = NULL; - xzalloc did it */ +- if (pb) +- pb->next = g_cb; +- } +- +- v = r = g_cb->pos; +- g_cb->pos += n; +- +- while (v < g_cb->pos) { +- v->type = 0; +- v->string = NULL; +- v++; +- } +- +- return r; +-} +- +-static void nvfree(var *v) +-{ +- var *p; +- +- if (v < g_cb->nv || v >= g_cb->pos) +- syntax_error(EMSG_INTERNAL_ERROR); +- +- for (p = v; p < g_cb->pos; p++) { +- if ((p->type & (VF_ARRAY | VF_CHILD)) == VF_ARRAY) { +- clear_array(iamarray(p)); +- free(p->x.array->items); +- free(p->x.array); +- } +- if (p->type & VF_WALK) { +- walker_list *n; +- walker_list *w = p->x.walker; +- debug_printf_walker("nvfree: freeing walker @%p\n", &p->x.walker); +- p->x.walker = NULL; +- while (w) { +- n = w->prev; +- debug_printf_walker(" free(%p)\n", w); +- free(w); +- w = n; +- } +- } +- clrvar(p); +- } +- +- g_cb->pos = v; +- while (g_cb->prev && g_cb->pos == g_cb->nv) { +- g_cb = g_cb->prev; +- } +-} +- + /* ------- awk program text parsing ------- */ + + /* Parse next token pointed by global pos, place results into global t_XYZ variables. 
+@@ -1793,6 +1709,41 @@ static void parse_program(char *p) + + /* -------- program execution part -------- */ + ++/* temporary variables allocator */ ++static var *nvalloc(int sz) ++{ ++ return xzalloc(sz * sizeof(var)); ++} ++ ++static void nvfree(var *v, int sz) ++{ ++ var *p = v; ++ ++ while (--sz >= 0) { ++ if ((p->type & (VF_ARRAY | VF_CHILD)) == VF_ARRAY) { ++ clear_array(iamarray(p)); ++ free(p->x.array->items); ++ free(p->x.array); ++ } ++ if (p->type & VF_WALK) { ++ walker_list *n; ++ walker_list *w = p->x.walker; ++ debug_printf_walker("nvfree: freeing walker @%p\n", &p->x.walker); ++ p->x.walker = NULL; ++ while (w) { ++ n = w->prev; ++ debug_printf_walker(" free(%p)\n", w); ++ free(w); ++ w = n; ++ } ++ } ++ clrvar(p); ++ p++; ++ } ++ ++ free(v); ++} ++ + static node *mk_splitter(const char *s, tsplitter *spl) + { + regex_t *re, *ire; +@@ -1814,9 +1765,9 @@ static node *mk_splitter(const char *s, tsplitter *spl) + return n; + } + +-/* use node as a regular expression. Supplied with node ptr and regex_t ++/* Use node as a regular expression. Supplied with node ptr and regex_t + * storage space. Return ptr to regex (if result points to preg, it should +- * be later regfree'd manually ++ * be later regfree'd manually). + */ + static regex_t *as_regex(node *op, regex_t *preg) + { +@@ -1840,7 +1791,7 @@ static regex_t *as_regex(node *op, regex_t *preg) + cflags &= ~REG_EXTENDED; + xregcomp(preg, s, cflags); + } +- nvfree(v); ++ nvfree(v, 1); + return preg; + } + +@@ -2292,6 +2243,8 @@ static char *awk_printf(node *n, int *len) + var *v, *arg; + + v = nvalloc(1); ++//TODO: above, to avoid allocating a single temporary var, take a pointer ++//to a temporary that our caller (evaluate()) already has? 
+ fmt = f = xstrdup(getvar_s(evaluate(nextarg(&n), v))); + + i = 0; +@@ -2333,7 +2286,7 @@ static char *awk_printf(node *n, int *len) + } + + free(fmt); +- nvfree(v); ++ nvfree(v, 1); + b = xrealloc(b, i + 1); + b[i] = '\0'; + #if ENABLE_FEATURE_AWK_GNU_EXTENSIONS +@@ -2661,14 +2614,14 @@ static NOINLINE var *exec_builtin(node *op, var *res) + break; + } + +- nvfree(tv); ++ nvfree(tv, 4); + return res; + #undef tspl + } + + /* + * Evaluate node - the heart of the program. Supplied with subtree +- * and place where to store result. returns ptr to result. ++ * and place where to store result. Returns ptr to result. + */ + #define XC(n) ((n) >> 8) + +@@ -2953,33 +2906,38 @@ static var *evaluate(node *op, var *res) + break; + + case XC( OC_FUNC ): { +- var *vbeg, *v; ++ var *tv, *sv_fnargs; + const char *sv_progname; ++ int nargs1, i; ++ + debug_printf_eval("FUNC\n"); + +- /* The body might be empty, still has to eval the args */ + if (!op->r.n->info && !op->r.f->body.first) + syntax_error(EMSG_UNDEF_FUNC); + +- vbeg = v = nvalloc(op->r.f->nargs + 1); ++ /* The body might be empty, still has to eval the args */ ++ nargs1 = op->r.f->nargs + 1; ++ tv = nvalloc(nargs1); ++ i = 0; + while (op1) { ++//TODO: explain why one iteration is done even for the case p->r.f->nargs == 0 + var *arg = evaluate(nextarg(&op1), v1); +- copyvar(v, arg); +- v->type |= VF_CHILD; +- v->x.parent = arg; +- if (++v - vbeg >= op->r.f->nargs) ++ copyvar(&tv[i], arg); ++ tv[i].type |= VF_CHILD; ++ tv[i].x.parent = arg; ++ if (++i >= op->r.f->nargs) + break; + } + +- v = fnargs; +- fnargs = vbeg; ++ sv_fnargs = fnargs; + sv_progname = g_progname; + ++ fnargs = tv; + res = evaluate(op->r.f->body.first, res); ++ nvfree(fnargs, nargs1); + + g_progname = sv_progname; +- nvfree(fnargs); +- fnargs = v; ++ fnargs = sv_fnargs; + + break; + } +@@ -3301,7 +3259,7 @@ static var *evaluate(node *op, var *res) + break; + } /* while (op) */ + +- nvfree(v1); ++ nvfree(v1, 2); + debug_printf_eval("returning from 
%s(): %p\n", __func__, res); + return res; + #undef fnargs +-- +2.27.0 + + +From c5ddfb36e34c93d63546bc3a7f458b946fa64825 Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Wed, 30 Jun 2021 12:12:20 +0200 +Subject: [PATCH 23/61] awk: replace incorrect use of union in undefined + function check (no code changes) + +...which reveals that it's buggy: it thinks "func f(){}" is an undefined function! + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/editors/awk.c b/editors/awk.c +index 35c11ec58..1115085da 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -2912,7 +2912,7 @@ static var *evaluate(node *op, var *res) + + debug_printf_eval("FUNC\n"); + +- if (!op->r.n->info && !op->r.f->body.first) ++ if (op->r.f->nargs == 0 && !op->r.f->body.first) + syntax_error(EMSG_UNDEF_FUNC); + + /* The body might be empty, still has to eval the args */ +-- +2.27.0 + + +From 1295da1db50adb2b6db53c6d057fdcc952b0bc78 Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Wed, 30 Jun 2021 12:23:51 +0200 +Subject: [PATCH 24/61] awk: allow empty functions with no arguments, disallow + function redefinitions + +function old new delta +.rodata 103681 103700 +19 +parse_program 303 307 +4 +evaluate 3145 3141 -4 +------------------------------------------------------------------------------ +(add/remove: 0/0 grow/shrink: 2/1 up/down: 23/-4) Total: 19 bytes + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 11 +++++++---- + testsuite/awk.tests | 10 ++++++++++ + 2 files changed, 17 insertions(+), 4 deletions(-) + +diff --git a/editors/awk.c b/editors/awk.c +index 1115085da..c05d5d651 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -139,6 +139,7 @@ typedef struct chain_s { + /* Function */ + typedef struct func_s { + unsigned nargs; ++ smallint defined; + struct chain_s body; + } func; + +@@ -1662,9 +1663,11 @@ static void parse_program(char *p) + debug_printf_parse("%s: TC_FUNCDECL\n", __func__); + 
next_token(TC_FUNCTION); + f = newfunc(t_string); +-//FIXME: dup check: functions can't be redefined, this is not ok: awk 'func f(){}; func f(){}' +- f->body.first = NULL; +- f->nargs = 0; ++ if (f->defined) ++ syntax_error("Duplicate function"); ++ f->defined = 1; ++ //f->body.first = NULL; - already is ++ //f->nargs = 0; - already is + /* func arg list: comma sep list of args, and a close paren */ + for (;;) { + if (next_token(TC_VARIABLE | TC_RPAREN) == TC_RPAREN) { +@@ -2912,7 +2915,7 @@ static var *evaluate(node *op, var *res) + + debug_printf_eval("FUNC\n"); + +- if (op->r.f->nargs == 0 && !op->r.f->body.first) ++ if (!op->r.f->defined) + syntax_error(EMSG_UNDEF_FUNC); + + /* The body might be empty, still has to eval the args */ +diff --git a/testsuite/awk.tests b/testsuite/awk.tests +index 6e35d33dd..873cc3680 100755 +--- a/testsuite/awk.tests ++++ b/testsuite/awk.tests +@@ -44,6 +44,16 @@ testing "awk handles empty function f(arg){}" \ + "L1\n\nL2\n\n" \ + "" "" + ++prg=' ++function empty_fun(){} ++END {empty_fun() ++ print "Ok" ++}' ++testing "awk handles empty function f(){}" \ ++ "awk '$prg'" \ ++ "Ok\n" \ ++ "" "" ++ + prg=' + function outer_fun() { + return 1 +-- +2.27.0 + + +From d88539017ebe731ba507fda8def65969bd14e582 Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Wed, 30 Jun 2021 12:42:39 +0200 +Subject: [PATCH 25/61] awk: rewrite "print" logic a bit to make it clearer + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 9 ++++----- + 1 file changed, 4 insertions(+), 5 deletions(-) + +diff --git a/editors/awk.c b/editors/awk.c +index c05d5d651..0fbca0433 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -2792,7 +2792,7 @@ static var *evaluate(node *op, var *res) + if (!op1) { + fputs(getvar_s(intvar[F0]), F); + } else { +- while (op1) { ++ for (;;) { + var *v = evaluate(nextarg(&op1), v1); + if (v->type & VF_NUMBER) { + fmt_num(g_buf, MAXVARFMT, getvar_s(intvar[OFMT]), +@@ -2801,13 +2801,12 @@ static var *evaluate(node *op, var *res) 
+ } else { + fputs(getvar_s(v), F); + } +- +- if (op1) +- fputs(getvar_s(intvar[OFS]), F); ++ if (!op1) ++ break; ++ fputs(getvar_s(intvar[OFS]), F); + } + } + fputs(getvar_s(intvar[ORS]), F); +- + } else { /* OC_PRINTF */ + char *s = awk_printf(op1, &len); + #if ENABLE_FEATURE_AWK_GNU_EXTENSIONS +-- +2.27.0 + + +From 04a90dbf88727415f4bcd3d1125d463255557d55 Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Wed, 30 Jun 2021 12:52:51 +0200 +Subject: [PATCH 26/61] awk: evaluate all, even superfluous function args + +function old new delta +evaluate 3128 3135 +7 + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 19 ++++++++++++------- + testsuite/awk.tests | 8 +++++++- + 2 files changed, 19 insertions(+), 8 deletions(-) + +diff --git a/editors/awk.c b/editors/awk.c +index 0fbca0433..47bbc10a6 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -2910,7 +2910,7 @@ static var *evaluate(node *op, var *res) + case XC( OC_FUNC ): { + var *tv, *sv_fnargs; + const char *sv_progname; +- int nargs1, i; ++ int nargs, i; + + debug_printf_eval("FUNC\n"); + +@@ -2918,17 +2918,22 @@ static var *evaluate(node *op, var *res) + syntax_error(EMSG_UNDEF_FUNC); + + /* The body might be empty, still has to eval the args */ +- nargs1 = op->r.f->nargs + 1; +- tv = nvalloc(nargs1); ++ nargs = op->r.f->nargs; ++ tv = nvalloc(nargs); + i = 0; + while (op1) { +-//TODO: explain why one iteration is done even for the case p->r.f->nargs == 0 + var *arg = evaluate(nextarg(&op1), v1); ++ if (i == nargs) { ++ /* call with more arguments than function takes. ++ * (gawk warns: "warning: function 'f' called with more arguments than declared"). 
++ * They are still evaluated, but discarded: */ ++ clrvar(arg); ++ continue; ++ } + copyvar(&tv[i], arg); + tv[i].type |= VF_CHILD; + tv[i].x.parent = arg; +- if (++i >= op->r.f->nargs) +- break; ++ i++; + } + + sv_fnargs = fnargs; +@@ -2936,7 +2941,7 @@ static var *evaluate(node *op, var *res) + + fnargs = tv; + res = evaluate(op->r.f->body.first, res); +- nvfree(fnargs, nargs1); ++ nvfree(fnargs, nargs); + + g_progname = sv_progname; + fnargs = sv_fnargs; +diff --git a/testsuite/awk.tests b/testsuite/awk.tests +index 873cc3680..3c230393f 100755 +--- a/testsuite/awk.tests ++++ b/testsuite/awk.tests +@@ -87,11 +87,17 @@ BEGIN { + a=2 + print v (a) + }' +-testing "'v (a)' is not a function call, it is a concatenation" \ ++testing "awk 'v (a)' is not a function call, it is a concatenation" \ + "awk '$prg' 2>&1" \ + "12\n" \ + "" "" + ++prg='func f(){print"F"};func g(){print"G"};BEGIN{f(g(),g())}' ++testing "awk unused function args are evaluated" \ ++ "awk '$prg' 2>&1" \ ++ "G\nG\nF\n" \ ++ "" "" ++ + + optional DESKTOP + testing "awk hex const 1" "awk '{ print or(0xffffffff,1) }'" "4294967295\n" "" "\n" +-- +2.27.0 + + +From fd5451c7894cd617a812d095a5d4d3cdc215b218 Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Thu, 1 Jul 2021 16:02:16 +0200 +Subject: [PATCH 27/61] awk: rename temp variables, no code changes + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 76 +++++++++++++++++++++++++++++++-------------------- + 1 file changed, 46 insertions(+), 30 deletions(-) + +diff --git a/editors/awk.c b/editors/awk.c +index 47bbc10a6..2c2cb74d7 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -1775,14 +1775,14 @@ static node *mk_splitter(const char *s, tsplitter *spl) + static regex_t *as_regex(node *op, regex_t *preg) + { + int cflags; +- var *v; ++ var *tmpvar; + const char *s; + + if ((op->info & OPCLSMASK) == OC_REGEXP) { + return icase ? 
op->r.ire : op->l.re; + } +- v = nvalloc(1); +- s = getvar_s(evaluate(op, v)); ++ tmpvar = nvalloc(1); ++ s = getvar_s(evaluate(op, tmpvar)); + + cflags = icase ? REG_EXTENDED | REG_ICASE : REG_EXTENDED; + /* Testcase where REG_EXTENDED fails (unpaired '{'): +@@ -1794,7 +1794,7 @@ static regex_t *as_regex(node *op, regex_t *preg) + cflags &= ~REG_EXTENDED; + xregcomp(preg, s, cflags); + } +- nvfree(v, 1); ++ nvfree(tmpvar, 1); + return preg; + } + +@@ -2243,12 +2243,12 @@ static char *awk_printf(node *n, int *len) + const char *s1; + int i, j, incr, bsize; + char c, c1; +- var *v, *arg; ++ var *tmpvar, *arg; + +- v = nvalloc(1); ++ tmpvar = nvalloc(1); + //TODO: above, to avoid allocating a single temporary var, take a pointer + //to a temporary that our caller (evaluate()) already has? +- fmt = f = xstrdup(getvar_s(evaluate(nextarg(&n), v))); ++ fmt = f = xstrdup(getvar_s(evaluate(nextarg(&n), tmpvar))); + + i = 0; + while (*f) { +@@ -2268,7 +2268,7 @@ static char *awk_printf(node *n, int *len) + f++; + c1 = *f; + *f = '\0'; +- arg = evaluate(nextarg(&n), v); ++ arg = evaluate(nextarg(&n), tmpvar); + + j = i; + if (c == 'c' || !c) { +@@ -2289,7 +2289,7 @@ static char *awk_printf(node *n, int *len) + } + + free(fmt); +- nvfree(v, 1); ++ nvfree(tmpvar, 1); + b = xrealloc(b, i + 1); + b[i] = '\0'; + #if ENABLE_FEATURE_AWK_GNU_EXTENSIONS +@@ -2429,7 +2429,7 @@ static NOINLINE var *exec_builtin(node *op, var *res) + { + #define tspl (G.exec_builtin__tspl) + +- var *tv; ++ var *tmpvars; + node *an[4]; + var *av[4]; + const char *as[4]; +@@ -2441,7 +2441,12 @@ static NOINLINE var *exec_builtin(node *op, var *res) + time_t tt; + int i, l, ll, n; + +- tv = nvalloc(4); ++ tmpvars = nvalloc(4); ++#define TMPVAR0 (tmpvars) ++#define TMPVAR1 (tmpvars + 1) ++#define TMPVAR2 (tmpvars + 2) ++#define TMPVAR3 (tmpvars + 3) ++#define TMPVAR(i) (tmpvars + (i)) + isr = info = op->info; + op = op->l.n; + +@@ -2449,7 +2454,7 @@ static NOINLINE var *exec_builtin(node *op, var *res) + for 
(i = 0; i < 4 && op; i++) { + an[i] = nextarg(&op); + if (isr & 0x09000000) +- av[i] = evaluate(an[i], &tv[i]); ++ av[i] = evaluate(an[i], TMPVAR(i)); + if (isr & 0x08000000) + as[i] = getvar_s(av[i]); + isr >>= 1; +@@ -2474,7 +2479,7 @@ static NOINLINE var *exec_builtin(node *op, var *res) + + if (nargs > 2) { + spl = (an[2]->info & OPCLSMASK) == OC_REGEXP ? +- an[2] : mk_splitter(getvar_s(evaluate(an[2], &tv[2])), &tspl); ++ an[2] : mk_splitter(getvar_s(evaluate(an[2], TMPVAR2)), &tspl); + } else { + spl = &fsplitter.n; + } +@@ -2617,7 +2622,13 @@ static NOINLINE var *exec_builtin(node *op, var *res) + break; + } + +- nvfree(tv, 4); ++ nvfree(tmpvars, 4); ++#undef TMPVAR0 ++#undef TMPVAR1 ++#undef TMPVAR2 ++#undef TMPVAR3 ++#undef TMPVAR ++ + return res; + #undef tspl + } +@@ -2636,14 +2647,16 @@ static var *evaluate(node *op, var *res) + #define seed (G.evaluate__seed) + #define sreg (G.evaluate__sreg) + +- var *v1; ++ var *tmpvars; ++#define TMPVAR0 (tmpvars) ++#define TMPVAR1 (tmpvars + 1) + + if (!op) + return setvar_s(res, NULL); + + debug_printf_eval("entered %s()\n", __func__); + +- v1 = nvalloc(2); ++ tmpvars = nvalloc(2); + + while (op) { + struct { +@@ -2683,7 +2696,7 @@ static var *evaluate(node *op, var *res) + } + if (op1->r.n) { /* array ref? 
*/ + const char *s; +- s = getvar_s(evaluate(op1->r.n, v1)); ++ s = getvar_s(evaluate(op1->r.n, TMPVAR0)); + hash_remove(iamarray(v), s); + } else { + clear_array(iamarray(v)); +@@ -2693,7 +2706,7 @@ static var *evaluate(node *op, var *res) + + /* execute inevitable things */ + if (opinfo & OF_RES1) +- L.v = evaluate(op1, v1); ++ L.v = evaluate(op1, TMPVAR0); + if (opinfo & OF_STR1) { + L.s = getvar_s(L.v); + debug_printf_eval("L.s:'%s'\n", L.s); +@@ -2710,7 +2723,7 @@ static var *evaluate(node *op, var *res) + * (Seen trying to evaluate "$444 $44444") + */ + if (opinfo & OF_RES2) { +- R.v = evaluate(op->r.n, v1+1); ++ R.v = evaluate(op->r.n, TMPVAR1); + //TODO: L.v may be invalid now, set L.v to NULL to catch bugs? + //L.v = NULL; + } +@@ -2793,7 +2806,7 @@ static var *evaluate(node *op, var *res) + fputs(getvar_s(intvar[F0]), F); + } else { + for (;;) { +- var *v = evaluate(nextarg(&op1), v1); ++ var *v = evaluate(nextarg(&op1), TMPVAR0); + if (v->type & VF_NUMBER) { + fmt_num(g_buf, MAXVARFMT, getvar_s(intvar[OFMT]), + getvar_i(v), TRUE); +@@ -2892,7 +2905,7 @@ static var *evaluate(node *op, var *res) + /* if source is a temporary string, jusk relink it to dest */ + //Disabled: if R.v is numeric but happens to have cached R.v->string, + //then L.v ends up being a string, which is wrong +-// if (R.v == v1+1 && R.v->string) { ++// if (R.v == TMPVAR1 && R.v->string) { + // res = setvar_p(L.v, R.v->string); + // R.v->string = NULL; + // } else { +@@ -2908,7 +2921,7 @@ static var *evaluate(node *op, var *res) + break; + + case XC( OC_FUNC ): { +- var *tv, *sv_fnargs; ++ var *argvars, *sv_fnargs; + const char *sv_progname; + int nargs, i; + +@@ -2919,10 +2932,10 @@ static var *evaluate(node *op, var *res) + + /* The body might be empty, still has to eval the args */ + nargs = op->r.f->nargs; +- tv = nvalloc(nargs); ++ argvars = nvalloc(nargs); + i = 0; + while (op1) { +- var *arg = evaluate(nextarg(&op1), v1); ++ var *arg = evaluate(nextarg(&op1), TMPVAR0); + if (i == 
nargs) { + /* call with more arguments than function takes. + * (gawk warns: "warning: function 'f' called with more arguments than declared"). +@@ -2930,18 +2943,18 @@ static var *evaluate(node *op, var *res) + clrvar(arg); + continue; + } +- copyvar(&tv[i], arg); +- tv[i].type |= VF_CHILD; +- tv[i].x.parent = arg; ++ copyvar(&argvars[i], arg); ++ argvars[i].type |= VF_CHILD; ++ argvars[i].x.parent = arg; + i++; + } + + sv_fnargs = fnargs; + sv_progname = g_progname; + +- fnargs = tv; ++ fnargs = argvars; + res = evaluate(op->r.f->body.first, res); +- nvfree(fnargs, nargs); ++ nvfree(argvars, nargs); + + g_progname = sv_progname; + fnargs = sv_fnargs; +@@ -3266,7 +3279,10 @@ static var *evaluate(node *op, var *res) + break; + } /* while (op) */ + +- nvfree(v1, 2); ++ nvfree(tmpvars, 2); ++#undef TMPVAR0 ++#undef TMPVAR1 ++ + debug_printf_eval("returning from %s(): %p\n", __func__, res); + return res; + #undef fnargs +-- +2.27.0 + + +From b1abb8374ff4bd36d9e850a92ab7a3a7668615d2 Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Thu, 1 Jul 2021 17:50:26 +0200 +Subject: [PATCH 28/61] awk: use static tmpvars instead of nvalloc(1)ed ones + +ptest() was using this idea already. + +As far as I can see, this is safe. Testsuite passes. + +One downside is that a temporary from e.g. printf invocation +won't be freed until the next printf call. 
+ +function old new delta +awk_printf 481 468 -13 +as_regex 137 111 -26 +------------------------------------------------------------------------------ +(add/remove: 0/0 grow/shrink: 0/2 up/down: 0/-39) Total: -39 bytes + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 49 ++++++++++++++++++++++++++++++++++--------------- + 1 file changed, 34 insertions(+), 15 deletions(-) + +diff --git a/editors/awk.c b/editors/awk.c +index 2c2cb74d7..0be044eef 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -559,7 +559,9 @@ struct globals2 { + unsigned evaluate__seed; + regex_t evaluate__sreg; + +- var ptest__v; ++ var ptest__tmpvar; ++ var awk_printf__tmpvar; ++ var as_regex__tmpvar; + + tsplitter exec_builtin__tspl; + +@@ -1775,14 +1777,19 @@ static node *mk_splitter(const char *s, tsplitter *spl) + static regex_t *as_regex(node *op, regex_t *preg) + { + int cflags; +- var *tmpvar; + const char *s; + + if ((op->info & OPCLSMASK) == OC_REGEXP) { + return icase ? op->r.ire : op->l.re; + } +- tmpvar = nvalloc(1); +- s = getvar_s(evaluate(op, tmpvar)); ++ ++#define TMPVAR (&G.as_regex__tmpvar) ++ //tmpvar = nvalloc(1); ++ // We use a single "static" tmpvar (instead of on-stack or malloced one) ++ // to decrease memory consumption in deeply-recursive awk programs. ++ // The rule to work safely is to never call evaluate() while our static ++ // TMPVAR's value is still needed. ++ s = getvar_s(evaluate(op, TMPVAR)); + + cflags = icase ? REG_EXTENDED | REG_ICASE : REG_EXTENDED; + /* Testcase where REG_EXTENDED fails (unpaired '{'): +@@ -1794,7 +1801,8 @@ static regex_t *as_regex(node *op, regex_t *preg) + cflags &= ~REG_EXTENDED; + xregcomp(preg, s, cflags); + } +- nvfree(tmpvar, 1); ++ //nvfree(tmpvar, 1); ++#undef TMPVAR + return preg; + } + +@@ -2105,8 +2113,11 @@ static int hashwalk_next(var *v) + /* evaluate node, return 1 when result is true, 0 otherwise */ + static int ptest(node *pattern) + { +- /* ptest__v is "static": to save stack space? 
*/ +- return istrue(evaluate(pattern, &G.ptest__v)); ++ // We use a single "static" tmpvar (instead of on-stack or malloced one) ++ // to decrease memory consumption in deeply-recursive awk programs. ++ // The rule to work safely is to never call evaluate() while our static ++ // TMPVAR's value is still needed. ++ return istrue(evaluate(pattern, &G.ptest__tmpvar)); + } + + /* read next record from stream rsm into a variable v */ +@@ -2243,12 +2254,18 @@ static char *awk_printf(node *n, int *len) + const char *s1; + int i, j, incr, bsize; + char c, c1; +- var *tmpvar, *arg; +- +- tmpvar = nvalloc(1); +-//TODO: above, to avoid allocating a single temporary var, take a pointer +-//to a temporary that our caller (evaluate()) already has? +- fmt = f = xstrdup(getvar_s(evaluate(nextarg(&n), tmpvar))); ++ var *arg; ++ ++ //tmpvar = nvalloc(1); ++#define TMPVAR (&G.awk_printf__tmpvar) ++ // We use a single "static" tmpvar (instead of on-stack or malloced one) ++ // to decrease memory consumption in deeply-recursive awk programs. ++ // The rule to work safely is to never call evaluate() while our static ++ // TMPVAR's value is still needed. ++ fmt = f = xstrdup(getvar_s(evaluate(nextarg(&n), TMPVAR))); ++ // ^^^^^^^^^ here we immediately strdup() the value, so the later call ++ // to evaluate() potentially recursing into another awk_printf() can't ++ // mangle the value. 
+ + i = 0; + while (*f) { +@@ -2268,7 +2285,7 @@ static char *awk_printf(node *n, int *len) + f++; + c1 = *f; + *f = '\0'; +- arg = evaluate(nextarg(&n), tmpvar); ++ arg = evaluate(nextarg(&n), TMPVAR); + + j = i; + if (c == 'c' || !c) { +@@ -2289,7 +2306,9 @@ static char *awk_printf(node *n, int *len) + } + + free(fmt); +- nvfree(tmpvar, 1); ++// nvfree(tmpvar, 1); ++#undef TMPVAR ++ + b = xrealloc(b, i + 1); + b[i] = '\0'; + #if ENABLE_FEATURE_AWK_GNU_EXTENSIONS +-- +2.27.0 + + +From de5007b20bc226273fb50130f2cb0fcaf7abfd3b Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Fri, 2 Jul 2021 14:27:40 +0200 +Subject: [PATCH 29/61] awk: shuffle functions to reduce forward declarations, + no code changes + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 192 ++++++++++++++++++++++++-------------------------- + 1 file changed, 94 insertions(+), 98 deletions(-) + +diff --git a/editors/awk.c b/editors/awk.c +index 0be044eef..6833c2f0d 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -619,18 +619,6 @@ struct globals2 { + G.evaluate__seed = 1; \ + } while (0) + +- +-/* function prototypes */ +-static void handle_special(var *); +-static node *parse_expr(uint32_t); +-static void chain_group(void); +-static var *evaluate(node *, var *); +-static rstream *next_input_file(void); +-static int fmt_num(char *, int, const char *, double, int); +-static int awk_exit(int) NORETURN; +- +-/* ---- error handling ---- */ +- + static const char EMSG_UNEXP_EOS[] ALIGN1 = "Unexpected end of string"; + static const char EMSG_UNEXP_TOKEN[] ALIGN1 = "Unexpected token"; + static const char EMSG_DIV_BY_ZERO[] ALIGN1 = "Division by zero"; +@@ -642,10 +630,7 @@ static const char EMSG_UNDEF_FUNC[] ALIGN1 = "Call to undefined function"; + static const char EMSG_NO_MATH[] ALIGN1 = "Math support is not compiled in"; + static const char EMSG_NEGATIVE_FIELD[] ALIGN1 = "Access to negative field"; + +-static void zero_out_var(var *vp) +-{ +- memset(vp, 0, sizeof(*vp)); +-} ++static int 
awk_exit(int) NORETURN; + + static void syntax_error(const char *message) NORETURN; + static void syntax_error(const char *message) +@@ -653,6 +638,11 @@ static void syntax_error(const char *message) + bb_error_msg_and_die("%s:%i: %s", g_progname, g_lineno, message); + } + ++static void zero_out_var(var *vp) ++{ ++ memset(vp, 0, sizeof(*vp)); ++} ++ + /* ---- hash stuff ---- */ + + static unsigned hashidx(const char *name) +@@ -885,10 +875,29 @@ static double my_strtod(char **pp) + + /* -------- working with variables (set/get/copy/etc) -------- */ + +-static xhash *iamarray(var *v) ++static int fmt_num(char *b, int size, const char *format, double n, int int_as_int) + { +- var *a = v; ++ int r = 0; ++ char c; ++ const char *s = format; ++ ++ if (int_as_int && n == (long long)n) { ++ r = snprintf(b, size, "%lld", (long long)n); ++ } else { ++ do { c = *s; } while (c && *++s); ++ if (strchr("diouxX", c)) { ++ r = snprintf(b, size, format, (int)n); ++ } else if (strchr("eEfgG", c)) { ++ r = snprintf(b, size, format, n); ++ } else { ++ syntax_error(EMSG_INV_FMT); ++ } ++ } ++ return r; ++} + ++static xhash *iamarray(var *a) ++{ + while (a->type & VF_CHILD) + a = a->x.parent; + +@@ -913,6 +922,8 @@ static var *clrvar(var *v) + return v; + } + ++static void handle_special(var *); ++ + /* assign string value to variable */ + static var *setvar_p(var *v, char *value) + { +@@ -1284,6 +1295,8 @@ static void mk_re_node(const char *s, node *n, regex_t *re) + xregcomp(re + 1, s, REG_EXTENDED | REG_ICASE); + } + ++static node *parse_expr(uint32_t); ++ + static node *parse_lrparen_list(void) + { + next_token(TC_LPAREN); +@@ -1488,6 +1501,8 @@ static void chain_expr(uint32_t info) + rollback_token(); + } + ++static void chain_group(void); ++ + static node *chain_loop(node *nn) + { + node *n, *n2, *save_brk, *save_cont; +@@ -1770,6 +1785,8 @@ static node *mk_splitter(const char *s, tsplitter *spl) + return n; + } + ++static var *evaluate(node *, var *); ++ + /* Use node as a 
regular expression. Supplied with node ptr and regex_t + * storage space. Return ptr to regex (if result points to preg, it should + * be later regfree'd manually). +@@ -2222,27 +2239,6 @@ static int awk_getline(rstream *rsm, var *v) + return r; + } + +-static int fmt_num(char *b, int size, const char *format, double n, int int_as_int) +-{ +- int r = 0; +- char c; +- const char *s = format; +- +- if (int_as_int && n == (long long)n) { +- r = snprintf(b, size, "%lld", (long long)n); +- } else { +- do { c = *s; } while (c && *++s); +- if (strchr("diouxX", c)) { +- r = snprintf(b, size, format, (int)n); +- } else if (strchr("eEfgG", c)) { +- r = snprintf(b, size, format, n); +- } else { +- syntax_error(EMSG_INV_FMT); +- } +- } +- return r; +-} +- + /* formatted output into an allocated buffer, return ptr to buffer */ + #if !ENABLE_FEATURE_AWK_GNU_EXTENSIONS + # define awk_printf(a, b) awk_printf(a) +@@ -2306,7 +2302,7 @@ static char *awk_printf(node *n, int *len) + } + + free(fmt); +-// nvfree(tmpvar, 1); ++ //nvfree(tmpvar, 1); + #undef TMPVAR + + b = xrealloc(b, i + 1); +@@ -2652,6 +2648,64 @@ static NOINLINE var *exec_builtin(node *op, var *res) + #undef tspl + } + ++/* if expr looks like "var=value", perform assignment and return 1, ++ * otherwise return 0 */ ++static int is_assignment(const char *expr) ++{ ++ char *exprc, *val; ++ ++ if (!isalnum_(*expr) || (val = strchr(expr, '=')) == NULL) { ++ return FALSE; ++ } ++ ++ exprc = xstrdup(expr); ++ val = exprc + (val - expr); ++ *val++ = '\0'; ++ ++ unescape_string_in_place(val); ++ setvar_u(newvar(exprc), val); ++ free(exprc); ++ return TRUE; ++} ++ ++/* switch to next input file */ ++static rstream *next_input_file(void) ++{ ++#define rsm (G.next_input_file__rsm) ++#define files_happen (G.next_input_file__files_happen) ++ ++ FILE *F; ++ const char *fname, *ind; ++ ++ if (rsm.F) ++ fclose(rsm.F); ++ rsm.F = NULL; ++ rsm.pos = rsm.adv = 0; ++ ++ for (;;) { ++ if (getvar_i(intvar[ARGIND])+1 >= 
getvar_i(intvar[ARGC])) { ++ if (files_happen) ++ return NULL; ++ fname = "-"; ++ F = stdin; ++ break; ++ } ++ ind = getvar_s(incvar(intvar[ARGIND])); ++ fname = getvar_s(findvar(iamarray(intvar[ARGV]), ind)); ++ if (fname && *fname && !is_assignment(fname)) { ++ F = xfopen_stdin(fname); ++ break; ++ } ++ } ++ ++ files_happen = TRUE; ++ setvar_s(intvar[FILENAME], fname); ++ rsm.F = F; ++ return &rsm; ++#undef rsm ++#undef files_happen ++} ++ + /* + * Evaluate node - the heart of the program. Supplied with subtree + * and place where to store result. Returns ptr to result. +@@ -3338,64 +3392,6 @@ static int awk_exit(int r) + exit(r); + } + +-/* if expr looks like "var=value", perform assignment and return 1, +- * otherwise return 0 */ +-static int is_assignment(const char *expr) +-{ +- char *exprc, *val; +- +- if (!isalnum_(*expr) || (val = strchr(expr, '=')) == NULL) { +- return FALSE; +- } +- +- exprc = xstrdup(expr); +- val = exprc + (val - expr); +- *val++ = '\0'; +- +- unescape_string_in_place(val); +- setvar_u(newvar(exprc), val); +- free(exprc); +- return TRUE; +-} +- +-/* switch to next input file */ +-static rstream *next_input_file(void) +-{ +-#define rsm (G.next_input_file__rsm) +-#define files_happen (G.next_input_file__files_happen) +- +- FILE *F; +- const char *fname, *ind; +- +- if (rsm.F) +- fclose(rsm.F); +- rsm.F = NULL; +- rsm.pos = rsm.adv = 0; +- +- for (;;) { +- if (getvar_i(intvar[ARGIND])+1 >= getvar_i(intvar[ARGC])) { +- if (files_happen) +- return NULL; +- fname = "-"; +- F = stdin; +- break; +- } +- ind = getvar_s(incvar(intvar[ARGIND])); +- fname = getvar_s(findvar(iamarray(intvar[ARGV]), ind)); +- if (fname && *fname && !is_assignment(fname)) { +- F = xfopen_stdin(fname); +- break; +- } +- } +- +- files_happen = TRUE; +- setvar_s(intvar[FILENAME], fname); +- rsm.F = F; +- return &rsm; +-#undef rsm +-#undef files_happen +-} +- + int awk_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE; + int awk_main(int argc UNUSED_PARAM, char **argv) 
+ { +-- +2.27.0 + + +From c14ab33f2d8eb07dbf27570be30121cc9734ba04 Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Fri, 2 Jul 2021 14:29:01 +0200 +Subject: [PATCH 30/61] awk: when parsing length(), simplify eating of LPAREN + +function old new delta +parse_expr 945 948 +3 + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 7 ++++--- + 1 file changed, 4 insertions(+), 3 deletions(-) + +diff --git a/editors/awk.c b/editors/awk.c +index 6833c2f0d..f65449a09 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -1453,10 +1453,11 @@ static node *parse_expr(uint32_t term_tc) + | TC_BINOPX /* length NUM */ + | TC_COMMA /* print length, 1 */ + ); +- rollback_token(); +- if (tc & TC_LPAREN) { ++ if (tc != TC_LPAREN) ++ rollback_token(); ++ else { + /* It was a "(" token. Handle just like TC_BUILTIN */ +- cn->l.n = parse_lrparen_list(); ++ cn->l.n = parse_expr(TC_RPAREN); + } + break; + } +-- +2.27.0 + + +From 8be97151d5ba9f98f27f58068416c203565708d0 Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Fri, 2 Jul 2021 14:33:13 +0200 +Subject: [PATCH 31/61] awk: use "static" tmpvars in main and exit + +function old new delta +awk_exit 103 93 -10 +awk_main 850 832 -18 +------------------------------------------------------------------------------ +(add/remove: 0/0 grow/shrink: 0/2 up/down: 0/-28) Total: -28 bytes + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 17 +++++------------ + 1 file changed, 5 insertions(+), 12 deletions(-) + +diff --git a/editors/awk.c b/editors/awk.c +index f65449a09..9f5a94037 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -562,6 +562,8 @@ struct globals2 { + var ptest__tmpvar; + var awk_printf__tmpvar; + var as_regex__tmpvar; ++ var exit__tmpvar; ++ var main__tmpvar; + + tsplitter exec_builtin__tspl; + +@@ -638,11 +640,6 @@ static void syntax_error(const char *message) + bb_error_msg_and_die("%s:%i: %s", g_progname, g_lineno, message); + } + +-static void zero_out_var(var *vp) +-{ +- memset(vp, 0, sizeof(*vp)); +-} +- + /* 
---- hash stuff ---- */ + + static unsigned hashidx(const char *name) +@@ -3372,11 +3369,9 @@ static int awk_exit(int r) + unsigned i; + + if (!exiting) { +- var tv; + exiting = TRUE; + nextrec = FALSE; +- zero_out_var(&tv); +- evaluate(endseq.first, &tv); ++ evaluate(endseq.first, &G.exit__tmpvar); + } + + /* waiting for children */ +@@ -3404,7 +3399,6 @@ int awk_main(int argc UNUSED_PARAM, char **argv) + llist_t *list_e = NULL; + #endif + int i; +- var tv; + + INIT_G(); + +@@ -3514,8 +3508,7 @@ int awk_main(int argc UNUSED_PARAM, char **argv) + newfile("/dev/stdout")->F = stdout; + newfile("/dev/stderr")->F = stderr; + +- zero_out_var(&tv); +- evaluate(beginseq.first, &tv); ++ evaluate(beginseq.first, &G.main__tmpvar); + if (!mainseq.first && !endseq.first) + awk_exit(EXIT_SUCCESS); + +@@ -3532,7 +3525,7 @@ int awk_main(int argc UNUSED_PARAM, char **argv) + nextrec = FALSE; + incvar(intvar[NR]); + incvar(intvar[FNR]); +- evaluate(mainseq.first, &tv); ++ evaluate(mainseq.first, &G.main__tmpvar); + + if (nextfile) + break; +-- +2.27.0 + + +From 7f4cd583daf8dcb431f07fd3402ca7ddc11b21ab Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Fri, 2 Jul 2021 14:53:52 +0200 +Subject: [PATCH 32/61] awk: shuffle globals for smaller offsets + +function old new delta +awk_main 832 829 -3 +evaluate 3229 3223 -6 +------------------------------------------------------------------------------ +(add/remove: 0/0 grow/shrink: 0/2 up/down: 0/-9) Total: -9 bytes + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 25 +++++++++++++------------ + 1 file changed, 13 insertions(+), 12 deletions(-) + +diff --git a/editors/awk.c b/editors/awk.c +index 9f5a94037..068ed687b 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -536,6 +536,11 @@ struct globals { + smallint nextfile; + smallint is_f0_split; + smallint t_rollback; ++ ++ /* former statics from various functions */ ++ smallint next_token__concat_inserted; ++ uint32_t next_token__save_tclass; ++ uint32_t next_token__save_info; + 
}; + struct globals2 { + uint32_t t_info; /* often used */ +@@ -548,15 +553,11 @@ struct globals2 { + /* former statics from various functions */ + char *split_f0__fstrings; + +- uint32_t next_token__save_tclass; +- uint32_t next_token__save_info; +- smallint next_token__concat_inserted; +- +- smallint next_input_file__files_happen; + rstream next_input_file__rsm; ++ smallint next_input_file__files_happen; + +- var *evaluate__fnargs; + unsigned evaluate__seed; ++ var *evaluate__fnargs; + regex_t evaluate__sreg; + + var ptest__tmpvar; +@@ -575,10 +576,10 @@ struct globals2 { + #define G1 (ptr_to_globals[-1]) + #define G (*(struct globals2 *)ptr_to_globals) + /* For debug. nm --size-sort awk.o | grep -vi ' [tr] ' */ +-/*char G1size[sizeof(G1)]; - 0x74 */ +-/*char Gsize[sizeof(G)]; - 0x1c4 */ ++//char G1size[sizeof(G1)]; // 0x70 ++//char Gsize[sizeof(G)]; // 0x2f8 + /* Trying to keep most of members accessible with short offsets: */ +-/*char Gofs_seed[offsetof(struct globals2, evaluate__seed)]; - 0x90 */ ++//char Gofs_seed[offsetof(struct globals2, evaluate__seed)]; // 0x7c + #define t_double (G1.t_double ) + #define beginseq (G1.beginseq ) + #define mainseq (G1.mainseq ) +@@ -1056,9 +1057,9 @@ static int istrue(var *v) + */ + static uint32_t next_token(uint32_t expected) + { +-#define concat_inserted (G.next_token__concat_inserted) +-#define save_tclass (G.next_token__save_tclass) +-#define save_info (G.next_token__save_info) ++#define concat_inserted (G1.next_token__concat_inserted) ++#define save_tclass (G1.next_token__save_tclass) ++#define save_info (G1.next_token__save_info) + + char *p; + const char *tl; +-- +2.27.0 + + +From 51262cc2c47f586d9478cc3c4f4977d98b36222b Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Fri, 2 Jul 2021 15:19:14 +0200 +Subject: [PATCH 33/61] awk: do not special-case "delete" + +Rework of the previous fix: +Can use operation attributes to disable arg evaluation instead of special-casing. 
+ +function old new delta +.rodata 104032 104036 +4 +evaluate 3223 3215 -8 +------------------------------------------------------------------------------ +(add/remove: 0/0 grow/shrink: 1/1 up/down: 4/-8) Total: -4 bytes + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 56 +++++++++++++++++++++++++-------------------------- + 1 file changed, 27 insertions(+), 29 deletions(-) + +diff --git a/editors/awk.c b/editors/awk.c +index 068ed687b..a3dda6959 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -319,7 +319,7 @@ if ((n) & TC_NUMBER ) debug_printf_parse(" NUMBER" ); \ + #define xV OF_RES2 + #define xS (OF_RES2 | OF_STR2) + #define Vx OF_RES1 +-#define Rx (OF_RES1 | OF_NUM1 | OF_REQUIRED) ++#define Rx OF_REQUIRED + #define VV (OF_RES1 | OF_RES2) + #define Nx (OF_RES1 | OF_NUM1) + #define NV (OF_RES1 | OF_NUM1 | OF_RES2) +@@ -2750,32 +2750,6 @@ static var *evaluate(node *op, var *res) + op1 = op->l.n; + debug_printf_eval("opinfo:%08x opn:%08x\n", opinfo, opn); + +- /* "delete" is special: +- * "delete array[var--]" must evaluate index expr only once, +- * must not evaluate it in "execute inevitable things" part. +- */ +- if (XC(opinfo & OPCLSMASK) == XC(OC_DELETE)) { +- uint32_t info = op1->info & OPCLSMASK; +- var *v; +- +- debug_printf_eval("DELETE\n"); +- if (info == OC_VAR) { +- v = op1->l.v; +- } else if (info == OC_FNARG) { +- v = &fnargs[op1->l.aidx]; +- } else { +- syntax_error(EMSG_NOT_ARRAY); +- } +- if (op1->r.n) { /* array ref? 
*/ +- const char *s; +- s = getvar_s(evaluate(op1->r.n, TMPVAR0)); +- hash_remove(iamarray(v), s); +- } else { +- clear_array(iamarray(v)); +- } +- goto next; +- } +- + /* execute inevitable things */ + if (opinfo & OF_RES1) + L.v = evaluate(op1, TMPVAR0); +@@ -2905,7 +2879,31 @@ static var *evaluate(node *op, var *res) + break; + } + +- /* case XC( OC_DELETE ): - moved to happen before arg evaluation */ ++ case XC( OC_DELETE ): ++ debug_printf_eval("DELETE\n"); ++ { ++ /* "delete" is special: ++ * "delete array[var--]" must evaluate index expr only once. ++ */ ++ uint32_t info = op1->info & OPCLSMASK; ++ var *v; ++ ++ if (info == OC_VAR) { ++ v = op1->l.v; ++ } else if (info == OC_FNARG) { ++ v = &fnargs[op1->l.aidx]; ++ } else { ++ syntax_error(EMSG_NOT_ARRAY); ++ } ++ if (op1->r.n) { /* array ref? */ ++ const char *s; ++ s = getvar_s(evaluate(op1->r.n, TMPVAR0)); ++ hash_remove(iamarray(v), s); ++ } else { ++ clear_array(iamarray(v)); ++ } ++ break; ++ } + + case XC( OC_NEWSOURCE ): + debug_printf_eval("NEWSOURCE\n"); +@@ -3342,7 +3340,7 @@ static var *evaluate(node *op, var *res) + default: + syntax_error(EMSG_POSSIBLE_ERROR); + } /* switch */ +- next: ++ + if ((opinfo & OPCLSMASK) <= SHIFT_TIL_THIS) + op = op->a.n; + if ((opinfo & OPCLSMASK) >= RECUR_FROM_THIS) +-- +2.27.0 + + +From 2f36bdf0eb01846b23c1a340ff6f19fd9377ed6a Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Fri, 2 Jul 2021 17:32:08 +0200 +Subject: [PATCH 34/61] awk: make builtin definitions more understandable, no + code changes + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 71 +++++++++++++++++++++++++++++++++++---------------- + 1 file changed, 49 insertions(+), 22 deletions(-) + +diff --git a/editors/awk.c b/editors/awk.c +index a3dda6959..fb841687e 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -331,8 +331,7 @@ if ((n) & TC_NUMBER ) debug_printf_parse(" NUMBER" ); \ + #define OPNMASK 0x007F + + /* operator priority is a highest byte (even: r->l, odd: l->r grouping) +- * 
For builtins it has different meaning: n n s3 s2 s1 v3 v2 v1, +- * n - min. number of args, vN - resolve Nth arg to var, sN - resolve to string ++ * (for builtins it has different meaning) + */ + #undef P + #undef PRIMASK +@@ -430,8 +429,6 @@ static const char tokenlist[] ALIGN1 = + /* compiler adds trailing "\0" */ + ; + +-#define OC_B OC_BUILTIN +- + static const uint32_t tokeninfo[] ALIGN4 = { + 0, + 0, +@@ -464,20 +461,43 @@ static const uint32_t tokeninfo[] ALIGN4 = { + OC_RETURN|Vx, OC_EXIT|Nx, + ST_WHILE, + 0, /* else */ +- OC_B|B_an|P(0x83), OC_B|B_co|P(0x41), OC_B|B_ls|P(0x83), OC_B|B_or|P(0x83), +- OC_B|B_rs|P(0x83), OC_B|B_xo|P(0x83), +- OC_FBLTIN|Sx|F_cl, OC_FBLTIN|Sx|F_sy, OC_FBLTIN|Sx|F_ff, OC_B|B_a2|P(0x83), +- OC_FBLTIN|Nx|F_co, OC_FBLTIN|Nx|F_ex, OC_FBLTIN|Nx|F_in, OC_FBLTIN|Nx|F_lg, +- OC_FBLTIN|F_rn, OC_FBLTIN|Nx|F_si, OC_FBLTIN|Nx|F_sq, OC_FBLTIN|Nx|F_sr, +- OC_B|B_ge|P(0xd6), OC_B|B_gs|P(0xb6), OC_B|B_ix|P(0x9b), /* OC_FBLTIN|Sx|F_le, was here */ +- OC_B|B_ma|P(0x89), OC_B|B_sp|P(0x8b), OC_SPRINTF, OC_B|B_su|P(0xb6), +- OC_B|B_ss|P(0x8f), OC_FBLTIN|F_ti, OC_B|B_ti|P(0x0b), OC_B|B_mt|P(0x0b), +- OC_B|B_lo|P(0x49), OC_B|B_up|P(0x49), +- OC_FBLTIN|Sx|F_le, /* TC_LENGTH */ +- OC_GETLINE|SV|P(0), +- 0, 0, +- 0, +- 0 /* TC_END */ ++// OC_B's are builtins with enforced minimum number of arguments (two upper bits). ++// Highest byte bit pattern: nn s3s2s1 v3v2v1 ++// nn - min. 
number of args, sN - resolve Nth arg to string, vN - resolve to var ++// OC_FBLTIN's are builtins with one optional argument, ++// TODO: enforce exactly one arg for: system, close, cos, sin, exp, int, log, sqrt ++// zero args for: rand systime ++// Do have one optional arg: fflush, srand, length ++#define OC_B OC_BUILTIN ++#define A1 P(0x40) /*one arg*/ ++#define A2 P(0x80) /*two args*/ ++#define A3 P(0xc0) /*three args*/ ++#define __v P(1) ++#define _vv P(3) ++#define __s__v P(9) ++#define __s_vv P(0x0b) ++#define __svvv P(0x0f) ++#define _ss_vv P(0x1b) ++#define _s_vv_ P(0x16) ++#define ss_vv_ P(0x36) ++ OC_B|B_an|_vv|A2, OC_B|B_co|__v|A1, OC_B|B_ls|_vv|A2, OC_B|B_or|_vv|A2, // and compl lshift or ++ OC_B|B_rs|_vv|A2, OC_B|B_xo|_vv|A2, // rshift xor ++ OC_FBLTIN|Sx|F_cl, OC_FBLTIN|Sx|F_sy, OC_FBLTIN|Sx|F_ff, OC_B|B_a2|_vv|A2, // close system fflush atan2 ++ OC_FBLTIN|Nx|F_co, OC_FBLTIN|Nx|F_ex, OC_FBLTIN|Nx|F_in, OC_FBLTIN|Nx|F_lg, // cos exp int log ++ OC_FBLTIN|F_rn, OC_FBLTIN|Nx|F_si, OC_FBLTIN|Nx|F_sq, OC_FBLTIN|Nx|F_sr, // rand sin sqrt srand ++ OC_B|B_ge|_s_vv_|A3, OC_B|B_gs|ss_vv_|A2, OC_B|B_ix|_ss_vv|A2, // gensub gsub index /*length was here*/ ++ OC_B|B_ma|__s__v|A2, OC_B|B_sp|__s_vv|A2, OC_SPRINTF, OC_B|B_su|ss_vv_|A2, // match split sprintf sub ++ OC_B|B_ss|__svvv|A2, OC_FBLTIN|F_ti, OC_B|B_ti|__s_vv, OC_B|B_mt|__s_vv, // substr systime strftime mktime ++ OC_B|B_lo|__s__v|A1, OC_B|B_up|__s__v|A1, // tolower toupper ++ OC_FBLTIN|Sx|F_le, // length ++ OC_GETLINE|SV, // getline ++ 0, 0, // func function ++ 0, // BEGIN ++ 0 // END ++#undef A1 ++#undef A2 ++#undef A3 ++#undef OC_B + }; + + /* internal variable names and their initial values */ +@@ -1630,6 +1650,7 @@ static void chain_group(void) + debug_printf_parse("%s: OC_BREAK\n", __func__); + n = chain_node(OC_EXEC); + n->a.n = break_ptr; ++//TODO: if break_ptr is NULL, syntax error (not in the loop)? 
+ chain_expr(t_info); + break; + +@@ -1637,6 +1658,7 @@ static void chain_group(void) + debug_printf_parse("%s: OC_CONTINUE\n", __func__); + n = chain_node(OC_EXEC); + n->a.n = continue_ptr; ++//TODO: if continue_ptr is NULL, syntax error (not in the loop)? + chain_expr(t_info); + break; + +@@ -1799,8 +1821,8 @@ static regex_t *as_regex(node *op, regex_t *preg) + return icase ? op->r.ire : op->l.re; + } + +-#define TMPVAR (&G.as_regex__tmpvar) + //tmpvar = nvalloc(1); ++#define TMPVAR (&G.as_regex__tmpvar) + // We use a single "static" tmpvar (instead of on-stack or malloced one) + // to decrease memory consumption in deeply-recursive awk programs. + // The rule to work safely is to never call evaluate() while our static +@@ -2720,8 +2742,6 @@ static var *evaluate(node *op, var *res) + #define sreg (G.evaluate__sreg) + + var *tmpvars; +-#define TMPVAR0 (tmpvars) +-#define TMPVAR1 (tmpvars + 1) + + if (!op) + return setvar_s(res, NULL); +@@ -2729,6 +2749,8 @@ static var *evaluate(node *op, var *res) + debug_printf_eval("entered %s()\n", __func__); + + tmpvars = nvalloc(2); ++#define TMPVAR0 (tmpvars) ++#define TMPVAR1 (tmpvars + 1) + + while (op) { + struct { +@@ -3166,7 +3188,7 @@ static var *evaluate(node *op, var *res) + rstream *rsm; + int err = 0; + rsm = (rstream *)hash_search(fdhash, L.s); +- debug_printf_eval("OC_FBLTIN F_cl rsm:%p\n", rsm); ++ debug_printf_eval("OC_FBLTIN close: op1:%p s:'%s' rsm:%p\n", op1, L.s, rsm); + if (rsm) { + debug_printf_eval("OC_FBLTIN F_cl " + "rsm->is_pipe:%d, ->F:%p\n", +@@ -3177,6 +3199,11 @@ static var *evaluate(node *op, var *res) + */ + if (rsm->F) + err = rsm->is_pipe ? 
pclose(rsm->F) : fclose(rsm->F); ++//TODO: fix this case: ++// $ awk 'BEGIN { print close(""); print ERRNO }' ++// -1 ++// close of redirection that was never opened ++// (we print 0, 0) + free(rsm->buffer); + hash_remove(fdhash, L.s); + } +-- +2.27.0 + + +From 8eb26034fb7225862c73f1dfa947a5d4910a0935 Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Fri, 2 Jul 2021 18:28:12 +0200 +Subject: [PATCH 35/61] awk: enforce simple builtins' argument number + +function old new delta +evaluate 3215 3303 +88 +.rodata 104036 104107 +71 +------------------------------------------------------------------------------ +(add/remove: 0/0 grow/shrink: 2/0 up/down: 159/0) Total: 159 bytes + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 45 ++++++++++++++++++++++++++++----------------- + 1 file changed, 28 insertions(+), 17 deletions(-) + +diff --git a/editors/awk.c b/editors/awk.c +index fb841687e..1925e0771 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -464,11 +464,11 @@ static const uint32_t tokeninfo[] ALIGN4 = { + // OC_B's are builtins with enforced minimum number of arguments (two upper bits). + // Highest byte bit pattern: nn s3s2s1 v3v2v1 + // nn - min. number of args, sN - resolve Nth arg to string, vN - resolve to var +-// OC_FBLTIN's are builtins with one optional argument, +-// TODO: enforce exactly one arg for: system, close, cos, sin, exp, int, log, sqrt +-// zero args for: rand systime +-// Do have one optional arg: fflush, srand, length +-#define OC_B OC_BUILTIN ++// OC_FBLTIN's are builtins with zero or one argument. ++// |Rx| enforces that arg is present for: system, close, cos, sin, exp, int, log, sqrt. ++// Check for no args is present in builtins' code (not in this table): rand, systime. 
++// Have one _optional_ arg: fflush, srand, length ++#define OC_B OC_BUILTIN + #define A1 P(0x40) /*one arg*/ + #define A2 P(0x80) /*two args*/ + #define A3 P(0xc0) /*three args*/ +@@ -480,15 +480,15 @@ static const uint32_t tokeninfo[] ALIGN4 = { + #define _ss_vv P(0x1b) + #define _s_vv_ P(0x16) + #define ss_vv_ P(0x36) +- OC_B|B_an|_vv|A2, OC_B|B_co|__v|A1, OC_B|B_ls|_vv|A2, OC_B|B_or|_vv|A2, // and compl lshift or +- OC_B|B_rs|_vv|A2, OC_B|B_xo|_vv|A2, // rshift xor +- OC_FBLTIN|Sx|F_cl, OC_FBLTIN|Sx|F_sy, OC_FBLTIN|Sx|F_ff, OC_B|B_a2|_vv|A2, // close system fflush atan2 +- OC_FBLTIN|Nx|F_co, OC_FBLTIN|Nx|F_ex, OC_FBLTIN|Nx|F_in, OC_FBLTIN|Nx|F_lg, // cos exp int log +- OC_FBLTIN|F_rn, OC_FBLTIN|Nx|F_si, OC_FBLTIN|Nx|F_sq, OC_FBLTIN|Nx|F_sr, // rand sin sqrt srand +- OC_B|B_ge|_s_vv_|A3, OC_B|B_gs|ss_vv_|A2, OC_B|B_ix|_ss_vv|A2, // gensub gsub index /*length was here*/ +- OC_B|B_ma|__s__v|A2, OC_B|B_sp|__s_vv|A2, OC_SPRINTF, OC_B|B_su|ss_vv_|A2, // match split sprintf sub +- OC_B|B_ss|__svvv|A2, OC_FBLTIN|F_ti, OC_B|B_ti|__s_vv, OC_B|B_mt|__s_vv, // substr systime strftime mktime +- OC_B|B_lo|__s__v|A1, OC_B|B_up|__s__v|A1, // tolower toupper ++ OC_B|B_an|_vv|A2, OC_B|B_co|__v|A1, OC_B|B_ls|_vv|A2, OC_B|B_or|_vv|A2, // and compl lshift or ++ OC_B|B_rs|_vv|A2, OC_B|B_xo|_vv|A2, // rshift xor ++ OC_FBLTIN|Sx|Rx|F_cl,OC_FBLTIN|Sx|Rx|F_sy,OC_FBLTIN|Sx|F_ff, OC_B|B_a2|_vv|A2, // close system fflush atan2 ++ OC_FBLTIN|Nx|Rx|F_co,OC_FBLTIN|Nx|Rx|F_ex,OC_FBLTIN|Nx|Rx|F_in,OC_FBLTIN|Nx|Rx|F_lg,// cos exp int log ++ OC_FBLTIN|F_rn, OC_FBLTIN|Nx|Rx|F_si,OC_FBLTIN|Nx|Rx|F_sq,OC_FBLTIN|Nx|F_sr, // rand sin sqrt srand ++ OC_B|B_ge|_s_vv_|A3, OC_B|B_gs|ss_vv_|A2, OC_B|B_ix|_ss_vv|A2, // gensub gsub index /*length was here*/ ++ OC_B|B_ma|__s__v|A2, OC_B|B_sp|__s_vv|A2, OC_SPRINTF, OC_B|B_su|ss_vv_|A2, // match split sprintf sub ++ OC_B|B_ss|__svvv|A2, OC_FBLTIN|F_ti, OC_B|B_ti|__s_vv, OC_B|B_mt|__s_vv, // substr systime strftime mktime ++ OC_B|B_lo|__s__v|A1, 
OC_B|B_up|__s__v|A1, // tolower toupper + OC_FBLTIN|Sx|F_le, // length + OC_GETLINE|SV, // getline + 0, 0, // func function +@@ -2773,8 +2773,11 @@ static var *evaluate(node *op, var *res) + debug_printf_eval("opinfo:%08x opn:%08x\n", opinfo, opn); + + /* execute inevitable things */ +- if (opinfo & OF_RES1) ++ if (opinfo & OF_RES1) { ++ if ((opinfo & OF_REQUIRED) && !op1) ++ syntax_error(EMSG_TOO_FEW_ARGS); + L.v = evaluate(op1, TMPVAR0); ++ } + if (opinfo & OF_STR1) { + L.s = getvar_s(L.v); + debug_printf_eval("L.s:'%s'\n", L.s); +@@ -3101,12 +3104,18 @@ static var *evaluate(node *op, var *res) + double R_d = R_d; /* for compiler */ + debug_printf_eval("FBLTIN\n"); + ++ if (op1 && (op1->info & OPCLSMASK) == OC_COMMA) ++ /* Simple builtins take one arg maximum */ ++ syntax_error("Too many arguments"); ++ + switch (opn) { + case F_in: + R_d = (long long)L_d; + break; + +- case F_rn: ++ case F_rn: /*rand*/ ++ if (op1) ++ syntax_error("Too many arguments"); + R_d = (double)rand() / (double)RAND_MAX; + break; + +@@ -3149,7 +3158,9 @@ static var *evaluate(node *op, var *res) + srand(seed); + break; + +- case F_ti: ++ case F_ti: /*systime*/ ++ if (op1) ++ syntax_error("Too many arguments"); + R_d = time(NULL); + break; + +-- +2.27.0 + + +From bd554e662f7246fd1518db37049aaf9ecf61bce9 Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Fri, 2 Jul 2021 18:55:00 +0200 +Subject: [PATCH 36/61] awk: beautify builtins table, no code changes + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 26 ++++++++++++++------------ + 1 file changed, 14 insertions(+), 12 deletions(-) + +diff --git a/editors/awk.c b/editors/awk.c +index 1925e0771..8d7777ca6 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -464,11 +464,12 @@ static const uint32_t tokeninfo[] ALIGN4 = { + // OC_B's are builtins with enforced minimum number of arguments (two upper bits). + // Highest byte bit pattern: nn s3s2s1 v3v2v1 + // nn - min. 
number of args, sN - resolve Nth arg to string, vN - resolve to var +-// OC_FBLTIN's are builtins with zero or one argument. ++// OC_F's are builtins with zero or one argument. + // |Rx| enforces that arg is present for: system, close, cos, sin, exp, int, log, sqrt. + // Check for no args is present in builtins' code (not in this table): rand, systime. + // Have one _optional_ arg: fflush, srand, length + #define OC_B OC_BUILTIN ++#define OC_F OC_FBLTIN + #define A1 P(0x40) /*one arg*/ + #define A2 P(0x80) /*two args*/ + #define A3 P(0xc0) /*three args*/ +@@ -480,17 +481,17 @@ static const uint32_t tokeninfo[] ALIGN4 = { + #define _ss_vv P(0x1b) + #define _s_vv_ P(0x16) + #define ss_vv_ P(0x36) +- OC_B|B_an|_vv|A2, OC_B|B_co|__v|A1, OC_B|B_ls|_vv|A2, OC_B|B_or|_vv|A2, // and compl lshift or +- OC_B|B_rs|_vv|A2, OC_B|B_xo|_vv|A2, // rshift xor +- OC_FBLTIN|Sx|Rx|F_cl,OC_FBLTIN|Sx|Rx|F_sy,OC_FBLTIN|Sx|F_ff, OC_B|B_a2|_vv|A2, // close system fflush atan2 +- OC_FBLTIN|Nx|Rx|F_co,OC_FBLTIN|Nx|Rx|F_ex,OC_FBLTIN|Nx|Rx|F_in,OC_FBLTIN|Nx|Rx|F_lg,// cos exp int log +- OC_FBLTIN|F_rn, OC_FBLTIN|Nx|Rx|F_si,OC_FBLTIN|Nx|Rx|F_sq,OC_FBLTIN|Nx|F_sr, // rand sin sqrt srand +- OC_B|B_ge|_s_vv_|A3, OC_B|B_gs|ss_vv_|A2, OC_B|B_ix|_ss_vv|A2, // gensub gsub index /*length was here*/ +- OC_B|B_ma|__s__v|A2, OC_B|B_sp|__s_vv|A2, OC_SPRINTF, OC_B|B_su|ss_vv_|A2, // match split sprintf sub +- OC_B|B_ss|__svvv|A2, OC_FBLTIN|F_ti, OC_B|B_ti|__s_vv, OC_B|B_mt|__s_vv, // substr systime strftime mktime +- OC_B|B_lo|__s__v|A1, OC_B|B_up|__s__v|A1, // tolower toupper +- OC_FBLTIN|Sx|F_le, // length +- OC_GETLINE|SV, // getline ++ OC_B|B_an|_vv|A2, OC_B|B_co|__v|A1, OC_B|B_ls|_vv|A2, OC_B|B_or|_vv|A2, // and compl lshift or ++ OC_B|B_rs|_vv|A2, OC_B|B_xo|_vv|A2, // rshift xor ++ OC_F|F_cl|Sx|Rx, OC_F|F_sy|Sx|Rx, OC_F|F_ff|Sx, OC_B|B_a2|_vv|A2, // close system fflush atan2 ++ OC_F|F_co|Nx|Rx, OC_F|F_ex|Nx|Rx, OC_F|F_in|Nx|Rx, OC_F|F_lg|Nx|Rx, // cos exp int log ++ OC_F|F_rn, OC_F|F_si|Nx|Rx, 
OC_F|F_sq|Nx|Rx, OC_F|F_sr|Nx, // rand sin sqrt srand ++ OC_B|B_ge|_s_vv_|A3,OC_B|B_gs|ss_vv_|A2,OC_B|B_ix|_ss_vv|A2, // gensub gsub index /*length was here*/ ++ OC_B|B_ma|__s__v|A2,OC_B|B_sp|__s_vv|A2,OC_SPRINTF, OC_B|B_su|ss_vv_|A2,// match split sprintf sub ++ OC_B|B_ss|__svvv|A2,OC_F|F_ti, OC_B|B_ti|__s_vv, OC_B|B_mt|__s_vv, // substr systime strftime mktime ++ OC_B|B_lo|__s__v|A1,OC_B|B_up|__s__v|A1, // tolower toupper ++ OC_F|F_le|Sx, // length ++ OC_GETLINE|SV, // getline + 0, 0, // func function + 0, // BEGIN + 0 // END +@@ -498,6 +499,7 @@ static const uint32_t tokeninfo[] ALIGN4 = { + #undef A2 + #undef A3 + #undef OC_B ++#undef OC_F + }; + + /* internal variable names and their initial values */ +-- +2.27.0 + + +From 2fcb86ed0176fcfe85d279d637a3d1b15ecf24bb Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Fri, 2 Jul 2021 19:38:03 +0200 +Subject: [PATCH 37/61] awk: rand() could return 1.0, fix this - should be in + [0,1) + +While at it, make it finer-grained (63 bits of randomness) + +function old new delta +evaluate 3303 3336 +33 +.rodata 104107 104111 +4 +------------------------------------------------------------------------------ +(add/remove: 0/0 grow/shrink: 2/0 up/down: 37/0) Total: 37 bytes + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 15 +++++++++++++-- + 1 file changed, 13 insertions(+), 2 deletions(-) + +diff --git a/editors/awk.c b/editors/awk.c +index 8d7777ca6..64fe81be4 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -3118,9 +3118,20 @@ static var *evaluate(node *op, var *res) + case F_rn: /*rand*/ + if (op1) + syntax_error("Too many arguments"); +- R_d = (double)rand() / (double)RAND_MAX; ++ { ++#if RAND_MAX >= 0x7fffffff ++ uint32_t u = ((uint32_t)rand() << 16) ^ rand(); ++ uint64_t v = ((uint64_t)rand() << 32) | u; ++ /* the above shift+or is optimized out on 32-bit arches */ ++# if RAND_MAX > 0x7fffffff ++ v &= 0x7fffffffffffffffUL; ++# endif ++ R_d = (double)v / 0x8000000000000000UL; ++#else ++# error Not 
implemented for this value of RAND_MAX ++#endif + break; +- ++ } + case F_co: + if (ENABLE_FEATURE_AWK_LIBM) { + R_d = cos(L_d); +-- +2.27.0 + + +From c4aa325fa23237d1c9452ed2be468730d6e2c615 Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Fri, 2 Jul 2021 22:28:51 +0200 +Subject: [PATCH 38/61] awk: fix behavior of "exit" without parameter + +function old new delta +evaluate 3336 3339 +3 +awk_exit 93 94 +1 +awk_main 829 827 -2 +------------------------------------------------------------------------------ +(add/remove: 0/0 grow/shrink: 2/1 up/down: 4/-2) Total: 2 bytes + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 40 ++++++++++++++++++++++------------------ + testsuite/awk.tests | 5 +++++ + 2 files changed, 27 insertions(+), 18 deletions(-) + +diff --git a/editors/awk.c b/editors/awk.c +index 64fe81be4..86cb7a95f 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -578,6 +578,8 @@ struct globals2 { + rstream next_input_file__rsm; + smallint next_input_file__files_happen; + ++ smalluint exitcode; ++ + unsigned evaluate__seed; + var *evaluate__fnargs; + regex_t evaluate__sreg; +@@ -655,7 +657,7 @@ static const char EMSG_UNDEF_FUNC[] ALIGN1 = "Call to undefined function"; + static const char EMSG_NO_MATH[] ALIGN1 = "Math support is not compiled in"; + static const char EMSG_NEGATIVE_FIELD[] ALIGN1 = "Access to negative field"; + +-static int awk_exit(int) NORETURN; ++static int awk_exit(void) NORETURN; + + static void syntax_error(const char *message) NORETURN; + static void syntax_error(const char *message) +@@ -2779,14 +2781,14 @@ static var *evaluate(node *op, var *res) + if ((opinfo & OF_REQUIRED) && !op1) + syntax_error(EMSG_TOO_FEW_ARGS); + L.v = evaluate(op1, TMPVAR0); +- } +- if (opinfo & OF_STR1) { +- L.s = getvar_s(L.v); +- debug_printf_eval("L.s:'%s'\n",
L.s); ++ } ++ if (opinfo & OF_NUM1) { ++ L_d = getvar_i(L.v); ++ debug_printf_eval("L_d:%f\n", L_d); ++ } + } + /* NB: Must get string/numeric values of L (done above) + * _before_ evaluate()'ing R.v: if both L and R are $NNNs, +@@ -2799,10 +2801,10 @@ static var *evaluate(node *op, var *res) + R.v = evaluate(op->r.n, TMPVAR1); + //TODO: L.v may be invalid now, set L.v to NULL to catch bugs? + //L.v = NULL; +- } +- if (opinfo & OF_STR2) { +- R.s = getvar_s(R.v); +- debug_printf_eval("R.s:'%s'\n", R.s); ++ if (opinfo & OF_STR2) { ++ R.s = getvar_s(R.v); ++ debug_printf_eval("R.s:'%s'\n", R.s); ++ } + } + + debug_printf_eval("switch(0x%x)\n", XC(opinfo & OPCLSMASK)); +@@ -2955,7 +2957,9 @@ static var *evaluate(node *op, var *res) + + case XC( OC_EXIT ): + debug_printf_eval("EXIT\n"); +- awk_exit(L_d); ++ if (op1) ++ G.exitcode = (int)L_d; ++ awk_exit(); + + /* -- recursive node type -- */ + +@@ -3414,7 +3418,7 @@ static var *evaluate(node *op, var *res) + + /* -------- main & co. -------- */ + +-static int awk_exit(int r) ++static int awk_exit(void) + { + unsigned i; + +@@ -3435,7 +3439,7 @@ static int awk_exit(int r) + } + } + +- exit(r); ++ exit(G.exitcode); + } + + int awk_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE; +@@ -3560,7 +3564,7 @@ int awk_main(int argc UNUSED_PARAM, char **argv) + + evaluate(beginseq.first, &G.main__tmpvar); + if (!mainseq.first && !endseq.first) +- awk_exit(EXIT_SUCCESS); ++ awk_exit(); + + /* input file could already be opened in BEGIN block */ + if (!iF) +@@ -3587,6 +3591,6 @@ int awk_main(int argc UNUSED_PARAM, char **argv) + iF = next_input_file(); + } + +- awk_exit(EXIT_SUCCESS); ++ awk_exit(); + /*return 0;*/ + } +diff --git a/testsuite/awk.tests b/testsuite/awk.tests +index 3c230393f..770d8ffce 100755 +--- a/testsuite/awk.tests ++++ b/testsuite/awk.tests +@@ -445,4 +445,9 @@ testing 'awk $NF is empty' \ + '' \ + 'a=====123=' + ++testing "awk exit N propagates through END's exit" \ ++ "awk 'BEGIN { exit 42 } END { exit }'; 
echo \$?" \ ++ "42\n" \ ++ '' '' ++ + exit $FAILCOUNT +-- +2.27.0 + + +From 1829a5b292a37553e8cc8f544448c591b3a7b3f6 Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Fri, 2 Jul 2021 23:07:21 +0200 +Subject: [PATCH 39/61] awk: fix detection of VAR=VAL arguments + +1NAME=VAL is not it, neither is VA.R=VAL + +function old new delta +next_input_file 216 214 -2 +is_assignment 115 91 -24 +------------------------------------------------------------------------------ +(add/remove: 0/0 grow/shrink: 0/2 up/down: 0/-26) Total: -26 bytes + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 9 ++++----- + 1 file changed, 4 insertions(+), 5 deletions(-) + +diff --git a/editors/awk.c b/editors/awk.c +index 86cb7a95f..9f14f0f9a 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -2679,7 +2679,8 @@ static int is_assignment(const char *expr) + { + char *exprc, *val; + +- if (!isalnum_(*expr) || (val = strchr(expr, '=')) == NULL) { ++ val = (char*)endofname(expr); ++ if (val == (char*)expr || *val != '=') { + return FALSE; + } + +@@ -2699,7 +2700,6 @@ static rstream *next_input_file(void) + #define rsm (G.next_input_file__rsm) + #define files_happen (G.next_input_file__files_happen) + +- FILE *F; + const char *fname, *ind; + + if (rsm.F) +@@ -2712,20 +2712,19 @@ static rstream *next_input_file(void) + if (files_happen) + return NULL; + fname = "-"; +- F = stdin; ++ rsm.F = stdin; + break; + } + ind = getvar_s(incvar(intvar[ARGIND])); + fname = getvar_s(findvar(iamarray(intvar[ARGV]), ind)); + if (fname && *fname && !is_assignment(fname)) { +- F = xfopen_stdin(fname); ++ rsm.F = xfopen_stdin(fname); + break; + } + } + + files_happen = TRUE; + setvar_s(intvar[FILENAME], fname); +- rsm.F = F; + return &rsm; + #undef rsm + #undef files_happen +-- +2.27.0 + + +From 2e495deee760595d6b0df37f1f9b7d1e4ecab1ed Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Fri, 2 Jul 2021 23:24:52 +0200 +Subject: [PATCH 40/61] awk: use smaller regmatch_t arrays, they had 2 elements + for no 
apparent reason + +function old new delta +exec_builtin 1479 1434 -45 + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 8 +++++--- + 1 file changed, 5 insertions(+), 3 deletions(-) + +diff --git a/editors/awk.c b/editors/awk.c +index 9f14f0f9a..c06dd2304 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -1937,7 +1937,7 @@ static int awk_split(const char *s, node *spl, char **slist) + n++; /* at least one field will be there */ + do { + int l; +- regmatch_t pmatch[2]; // TODO: why [2]? [1] is enough... ++ regmatch_t pmatch[1]; + + l = strcspn(s, c+2); /* len till next NUL or \n */ + if (regexec1_nonempty(icase ? spl->r.ire : spl->l.re, s, pmatch) == 0 +@@ -2166,7 +2166,7 @@ static int ptest(node *pattern) + static int awk_getline(rstream *rsm, var *v) + { + char *b; +- regmatch_t pmatch[2]; // TODO: why [2]? [1] is enough... ++ regmatch_t pmatch[1]; + int size, a, p, pp = 0; + int fd, so, eo, r, rp; + char c, *m, *s; +@@ -2473,7 +2473,7 @@ static NOINLINE var *exec_builtin(node *op, var *res) + node *an[4]; + var *av[4]; + const char *as[4]; +- regmatch_t pmatch[2]; ++ regmatch_t pmatch[1]; + regex_t sreg, *re; + node *spl; + uint32_t isr, info; +@@ -3533,6 +3533,8 @@ int awk_main(int argc UNUSED_PARAM, char **argv) + parse_program(llist_pop(&list_e)); + } + #endif ++//FIXME: preserve order of -e and -f ++//TODO: implement -i LIBRARY and -E FILE too, they are easy-ish + if (!(opt & (OPT_f | OPT_e))) { + if (!*argv) + bb_show_usage(); +-- +2.27.0 + + +From bb55cde906cbaf136d6487ed7738003aa41b4bd5 Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Fri, 2 Jul 2021 23:38:50 +0200 +Subject: [PATCH 41/61] awk: move match() code out-of-line + +function old new delta +exec_builtin_match - 202 +202 +exec_builtin 1434 1157 -277 +------------------------------------------------------------------------------ +(add/remove: 1/0 grow/shrink: 0/1 up/down: 202/-277) Total: -75 bytes + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 45 
++++++++++++++++++++++++++++----------------- + 1 file changed, 28 insertions(+), 17 deletions(-) + +diff --git a/editors/awk.c b/editors/awk.c +index c06dd2304..96e06db25 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -2465,6 +2465,30 @@ static NOINLINE int do_mktime(const char *ds) + return mktime(&then); + } + ++/* Reduce stack usage in exec_builtin() by keeping match() code separate */ ++static NOINLINE void exec_builtin_match(node *an1, const char *as0, var *res) ++{ ++ regmatch_t pmatch[1]; ++ regex_t sreg, *re; ++ int n; ++ ++ re = as_regex(an1, &sreg); ++ n = regexec(re, as0, 1, pmatch, 0); ++ if (n == 0) { ++ pmatch[0].rm_so++; ++ pmatch[0].rm_eo++; ++ } else { ++ pmatch[0].rm_so = 0; ++ pmatch[0].rm_eo = -1; ++ } ++ if (re == &sreg) ++ regfree(re); ++ setvar_i(newvar("RSTART"), pmatch[0].rm_so); ++ setvar_i(newvar("RLENGTH"), pmatch[0].rm_eo - pmatch[0].rm_so); ++ setvar_i(res, pmatch[0].rm_so); ++} ++ ++/* Reduce stack usage in evaluate() by keeping builtins' code separate */ + static NOINLINE var *exec_builtin(node *op, var *res) + { + #define tspl (G.exec_builtin__tspl) +@@ -2473,8 +2497,6 @@ static NOINLINE var *exec_builtin(node *op, var *res) + node *an[4]; + var *av[4]; + const char *as[4]; +- regmatch_t pmatch[1]; +- regex_t sreg, *re; + node *spl; + uint32_t isr, info; + int nargs; +@@ -2633,20 +2655,7 @@ static NOINLINE var *exec_builtin(node *op, var *res) + break; + + case B_ma: +- re = as_regex(an[1], &sreg); +- n = regexec(re, as[0], 1, pmatch, 0); +- if (n == 0) { +- pmatch[0].rm_so++; +- pmatch[0].rm_eo++; +- } else { +- pmatch[0].rm_so = 0; +- pmatch[0].rm_eo = -1; +- } +- setvar_i(newvar("RSTART"), pmatch[0].rm_so); +- setvar_i(newvar("RLENGTH"), pmatch[0].rm_eo - pmatch[0].rm_so); +- setvar_i(res, pmatch[0].rm_so); +- if (re == &sreg) +- regfree(re); ++ exec_builtin_match(an[1], as[0], res); + break; + + case B_ge: +@@ -2732,7 +2741,9 @@ static rstream *next_input_file(void) + + /* + * Evaluate node - the heart of the program. 
Supplied with subtree +- * and place where to store result. Returns ptr to result. ++ * and "res" variable to assign the result to if we evaluate an expression. ++ * If node refers to e.g. a variable or a field, no assignment happens. ++ * Return ptr to the result (which may or may not be the "res" variable!) + */ + #define XC(n) ((n) >> 8) + +-- +2.27.0 + + +From a76f1b553545e144f5456c84398a0d98a81ff70d Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Sat, 3 Jul 2021 00:39:55 +0200 +Subject: [PATCH 42/61] awk: rename GRPSTART/END to L/RBRACE, no code changes + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 60 ++++++++++++++++++++++++++++----------------------- + 1 file changed, 33 insertions(+), 27 deletions(-) + +diff --git a/editors/awk.c b/editors/awk.c +index 96e06db25..a1a2afd1d 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -211,8 +211,8 @@ typedef struct tsplitter_s { + #define TC_PIPE (1 << 9) /* input redirection pipe | */ + #define TC_UOPPRE2 (1 << 10) /* unary prefix operator + - ! */ + #define TC_ARRTERM (1 << 11) /* ] */ +-#define TC_GRPSTART (1 << 12) /* { */ +-#define TC_GRPTERM (1 << 13) /* } */ ++#define TC_LBRACE (1 << 12) /* { */ ++#define TC_RBRACE (1 << 13) /* } */ + #define TC_SEMICOL (1 << 14) /* ; */ + #define TC_NEWLINE (1 << 15) + #define TC_STATX (1 << 16) /* ctl statement (for, next...) 
*/ +@@ -250,8 +250,8 @@ if ((n) & TC_COMMA ) debug_printf_parse(" COMMA" ); \ + if ((n) & TC_PIPE ) debug_printf_parse(" PIPE" ); \ + if ((n) & TC_UOPPRE2 ) debug_printf_parse(" UOPPRE2" ); \ + if ((n) & TC_ARRTERM ) debug_printf_parse(" ARRTERM" ); \ +-if ((n) & TC_GRPSTART) debug_printf_parse(" GRPSTART"); \ +-if ((n) & TC_GRPTERM ) debug_printf_parse(" GRPTERM" ); \ ++if ((n) & TC_LBRACE ) debug_printf_parse(" LBRACE" ); \ ++if ((n) & TC_RBRACE ) debug_printf_parse(" RBRACE" ); \ + if ((n) & TC_SEMICOL ) debug_printf_parse(" SEMICOL" ); \ + if ((n) & TC_NEWLINE ) debug_printf_parse(" NEWLINE" ); \ + if ((n) & TC_STATX ) debug_printf_parse(" STATX" ); \ +@@ -291,13 +291,13 @@ if ((n) & TC_NUMBER ) debug_printf_parse(" NUMBER" ); \ + | TC_FUNCDECL | TC_BEGIN | TC_END) + + /* discard newlines after these */ +-#define TS_NOTERM (TC_COMMA | TC_GRPSTART | TC_GRPTERM \ ++#define TS_NOTERM (TC_COMMA | TC_LBRACE | TC_RBRACE \ + | TS_BINOP | TS_OPTERM) + + /* what can expression begin with */ + #define TS_OPSEQ (TS_OPERAND | TS_UOPPRE | TC_REGEXP) + /* what can group begin with */ +-#define TS_GRPSEQ (TS_OPSEQ | TS_OPTERM | TS_STATEMNT | TC_GRPSTART) ++#define TS_GRPSEQ (TS_OPSEQ | TS_OPTERM | TS_STATEMNT | TC_LBRACE) + + /* if previous token class is CONCAT_L and next is CONCAT_R, concatenation */ + /* operator is inserted between them */ +@@ -402,8 +402,8 @@ static const char tokenlist[] ALIGN1 = + "\1|" NTC /* TC_PIPE */ + "\1+" "\1-" "\1!" NTC /* TC_UOPPRE2 */ + "\1]" NTC /* TC_ARRTERM */ +- "\1{" NTC /* TC_GRPSTART */ +- "\1}" NTC /* TC_GRPTERM */ ++ "\1{" NTC /* TC_LBRACE */ ++ "\1}" NTC /* TC_RBRACE */ + "\1;" NTC /* TC_SEMICOL */ + "\1\n" NTC /* TC_NEWLINE */ + "\2if" "\2do" "\3for" "\5break" /* TC_STATX */ +@@ -1471,7 +1471,7 @@ static node *parse_expr(uint32_t term_tc) + debug_printf_parse("%s: TC_LENGTH\n", __func__); + tc = next_token(TC_LPAREN /* length(...) 
*/ + | TS_OPTERM /* length; (or newline)*/ +- | TC_GRPTERM /* length } */ ++ | TC_RBRACE /* length } */ + | TC_BINOPX /* length NUM */ + | TC_COMMA /* print length, 1 */ + ); +@@ -1516,11 +1516,11 @@ static void chain_expr(uint32_t info) + + n = chain_node(info); + +- n->l.n = parse_expr(TS_OPTERM | TC_GRPTERM); ++ n->l.n = parse_expr(TS_OPTERM | TC_RBRACE); + if ((info & OF_REQUIRED) && !n->l.n) + syntax_error(EMSG_TOO_FEW_ARGS); + +- if (t_tclass & TC_GRPTERM) ++ if (t_tclass & TC_RBRACE) + rollback_token(); + } + +@@ -1559,16 +1559,16 @@ static void chain_group(void) + c = next_token(TS_GRPSEQ); + } while (c & TC_NEWLINE); + +- if (c & TC_GRPSTART) { +- debug_printf_parse("%s: TC_GRPSTART\n", __func__); +- while ((c = next_token(TS_GRPSEQ | TC_GRPTERM)) != TC_GRPTERM) { +- debug_printf_parse("%s: !TC_GRPTERM\n", __func__); ++ if (c & TC_LBRACE) { ++ debug_printf_parse("%s: TC_LBRACE\n", __func__); ++ while ((c = next_token(TS_GRPSEQ | TC_RBRACE)) != TC_RBRACE) { ++ debug_printf_parse("%s: !TC_RBRACE\n", __func__); + if (c & TC_NEWLINE) + continue; + rollback_token(); + chain_group(); + } +- debug_printf_parse("%s: TC_GRPTERM\n", __func__); ++ debug_printf_parse("%s: TC_RBRACE\n", __func__); + return; + } + if (c & (TS_OPSEQ | TS_OPTERM)) { +@@ -1588,7 +1588,7 @@ static void chain_group(void) + chain_group(); + n2 = chain_node(OC_EXEC); + n->r.n = seq->last; +- if (next_token(TS_GRPSEQ | TC_GRPTERM | TC_ELSE) == TC_ELSE) { ++ if (next_token(TS_GRPSEQ | TC_RBRACE | TC_ELSE) == TC_ELSE) { + chain_group(); + n2->a.n = seq->last; + } else { +@@ -1641,12 +1641,12 @@ static void chain_group(void) + case OC_PRINTF: + debug_printf_parse("%s: OC_PRINT[F]\n", __func__); + n = chain_node(t_info); +- n->l.n = parse_expr(TS_OPTERM | TC_OUTRDR | TC_GRPTERM); ++ n->l.n = parse_expr(TS_OPTERM | TC_OUTRDR | TC_RBRACE); + if (t_tclass & TC_OUTRDR) { + n->info |= t_info; +- n->r.n = parse_expr(TS_OPTERM | TC_GRPTERM); ++ n->r.n = parse_expr(TS_OPTERM | TC_RBRACE); + } +- if 
(t_tclass & TC_GRPTERM) ++ if (t_tclass & TC_RBRACE) + rollback_token(); + break; + +@@ -1684,7 +1684,7 @@ static void parse_program(char *p) + + g_pos = p; + t_lineno = 1; +- while ((tclass = next_token(TC_EOF | TS_OPSEQ | TC_GRPSTART | ++ while ((tclass = next_token(TC_EOF | TS_OPSEQ | TC_LBRACE | + TS_OPTERM | TC_BEGIN | TC_END | TC_FUNCDECL)) != TC_EOF) { + + if (tclass & TS_OPTERM) { +@@ -1696,10 +1696,14 @@ static void parse_program(char *p) + if (tclass & TC_BEGIN) { + debug_printf_parse("%s: TC_BEGIN\n", __func__); + seq = &beginseq; ++//TODO: ensure there is no newline between BEGIN and { ++//next_token(TC_LBRACE); rollback_token(); + chain_group(); + } else if (tclass & TC_END) { + debug_printf_parse("%s: TC_END\n", __func__); + seq = &endseq; ++//TODO: ensure there is no newline between END and { ++//next_token(TC_LBRACE); rollback_token(); + chain_group(); + } else if (tclass & TC_FUNCDECL) { + debug_printf_parse("%s: TC_FUNCDECL\n", __func__); +@@ -1726,24 +1730,26 @@ static void parse_program(char *p) + /* it was a comma, we ate it */ + } + seq = &f->body; ++//TODO: ensure there is { after "func F(...)" - but newlines are allowed ++//while (next_token(TC_LBRACE | TC_NEWLINE) == TC_NEWLINE) continue; rollback_token(); + chain_group(); + hash_clear(ahash); + } else if (tclass & TS_OPSEQ) { + debug_printf_parse("%s: TS_OPSEQ\n", __func__); + rollback_token(); + cn = chain_node(OC_TEST); +- cn->l.n = parse_expr(TS_OPTERM | TC_EOF | TC_GRPSTART); +- if (t_tclass & TC_GRPSTART) { +- debug_printf_parse("%s: TC_GRPSTART\n", __func__); ++ cn->l.n = parse_expr(TS_OPTERM | TC_EOF | TC_LBRACE); ++ if (t_tclass & TC_LBRACE) { ++ debug_printf_parse("%s: TC_LBRACE\n", __func__); + rollback_token(); + chain_group(); + } else { +- debug_printf_parse("%s: !TC_GRPSTART\n", __func__); ++ debug_printf_parse("%s: !TC_LBRACE\n", __func__); + chain_node(OC_PRINT); + } + cn->r.n = mainseq.last; +- } else /* if (tclass & TC_GRPSTART) */ { +- debug_printf_parse("%s: 
TC_GRPSTART(?)\n", __func__); ++ } else /* if (tclass & TC_LBRACE) */ { ++ debug_printf_parse("%s: TC_LBRACE(?)\n", __func__); + rollback_token(); + chain_group(); + } +-- +2.27.0 + + +From df7698f1df2ed5a82a1558e167ba3262d1c614cb Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Sat, 3 Jul 2021 01:16:48 +0200 +Subject: [PATCH 43/61] awk: tighten rules in action parsing + +Disallow: + BEGIN + { action } - must start on the same line +Disallow: + func f() + print "hello" - must be in {...} + +function old new delta +chain_until_rbrace - 41 +41 +parse_program 307 336 +29 +chain_group 649 616 -33 +------------------------------------------------------------------------------ +(add/remove: 1/0 grow/shrink: 1/1 up/down: 70/-33) Total: 37 bytes + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 108 ++++++++++++++++++++++++++++++-------------------- + 1 file changed, 66 insertions(+), 42 deletions(-) + +diff --git a/editors/awk.c b/editors/awk.c +index a1a2afd1d..c68416873 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -1549,29 +1549,35 @@ static node *chain_loop(node *nn) + return n; + } + ++static void chain_until_rbrace(void) ++{ ++ uint32_t tc; ++ while ((tc = next_token(TS_GRPSEQ | TC_RBRACE)) != TC_RBRACE) { ++ debug_printf_parse("%s: !TC_RBRACE\n", __func__); ++ if (tc == TC_NEWLINE) ++ continue; ++ rollback_token(); ++ chain_group(); ++ } ++ debug_printf_parse("%s: TC_RBRACE\n", __func__); ++} ++ + /* parse group and attach it to chain */ + static void chain_group(void) + { +- uint32_t c; ++ uint32_t tc; + node *n, *n2, *n3; + + do { +- c = next_token(TS_GRPSEQ); +- } while (c & TC_NEWLINE); ++ tc = next_token(TS_GRPSEQ); ++ } while (tc == TC_NEWLINE); + +- if (c & TC_LBRACE) { ++ if (tc == TC_LBRACE) { + debug_printf_parse("%s: TC_LBRACE\n", __func__); +- while ((c = next_token(TS_GRPSEQ | TC_RBRACE)) != TC_RBRACE) { +- debug_printf_parse("%s: !TC_RBRACE\n", __func__); +- if (c & TC_NEWLINE) +- continue; +- rollback_token(); +- chain_group(); +- } 
+- debug_printf_parse("%s: TC_RBRACE\n", __func__); ++ chain_until_rbrace(); + return; + } +- if (c & (TS_OPSEQ | TS_OPTERM)) { ++ if (tc & (TS_OPSEQ | TS_OPTERM)) { + debug_printf_parse("%s: TS_OPSEQ | TS_OPTERM\n", __func__); + rollback_token(); + chain_expr(OC_EXEC | Vx); +@@ -1675,37 +1681,48 @@ static void chain_group(void) + + static void parse_program(char *p) + { +- uint32_t tclass; +- node *cn; +- func *f; +- var *v; +- + debug_printf_parse("%s()\n", __func__); + + g_pos = p; + t_lineno = 1; +- while ((tclass = next_token(TC_EOF | TS_OPSEQ | TC_LBRACE | +- TS_OPTERM | TC_BEGIN | TC_END | TC_FUNCDECL)) != TC_EOF) { ++ for (;;) { ++ uint32_t tclass; + +- if (tclass & TS_OPTERM) { ++ tclass = next_token(TC_EOF | TS_OPSEQ | TC_LBRACE | ++ TS_OPTERM | TC_BEGIN | TC_END | TC_FUNCDECL); ++ ++ if (tclass == TC_EOF) { ++ debug_printf_parse("%s: TC_EOF\n", __func__); ++ break; ++ } ++ if (tclass & TS_OPTERM) { /* ; or */ + debug_printf_parse("%s: TS_OPTERM\n", __func__); ++//NB: gawk allows many newlines, but does not allow more than one semicolon: ++// BEGIN {...};; ++//would complain "each rule must have a pattern or an action part". 
++//Same message for ++// ; BEGIN {...} + continue; + } +- +- seq = &mainseq; +- if (tclass & TC_BEGIN) { ++ if (tclass == TC_BEGIN) { + debug_printf_parse("%s: TC_BEGIN\n", __func__); + seq = &beginseq; +-//TODO: ensure there is no newline between BEGIN and { +-//next_token(TC_LBRACE); rollback_token(); +- chain_group(); +- } else if (tclass & TC_END) { ++ /* ensure there is no newline between BEGIN and { */ ++ next_token(TC_LBRACE); ++ chain_until_rbrace(); ++ continue; ++ } ++ if (tclass == TC_END) { + debug_printf_parse("%s: TC_END\n", __func__); + seq = &endseq; +-//TODO: ensure there is no newline between END and { +-//next_token(TC_LBRACE); rollback_token(); +- chain_group(); +- } else if (tclass & TC_FUNCDECL) { ++ /* ensure there is no newline between END and { */ ++ next_token(TC_LBRACE); ++ chain_until_rbrace(); ++ continue; ++ } ++ if (tclass == TC_FUNCDECL) { ++ func *f; ++ + debug_printf_parse("%s: TC_FUNCDECL\n", __func__); + next_token(TC_FUNCTION); + f = newfunc(t_string); +@@ -1716,6 +1733,7 @@ static void parse_program(char *p) + //f->nargs = 0; - already is + /* func arg list: comma sep list of args, and a close paren */ + for (;;) { ++ var *v; + if (next_token(TC_VARIABLE | TC_RPAREN) == TC_RPAREN) { + if (f->nargs == 0) + break; /* func() is ok */ +@@ -1730,31 +1748,37 @@ static void parse_program(char *p) + /* it was a comma, we ate it */ + } + seq = &f->body; +-//TODO: ensure there is { after "func F(...)" - but newlines are allowed +-//while (next_token(TC_LBRACE | TC_NEWLINE) == TC_NEWLINE) continue; rollback_token(); +- chain_group(); ++ /* ensure there is { after "func F(...)" - but newlines are allowed */ ++ while (next_token(TC_LBRACE | TC_NEWLINE) == TC_NEWLINE) ++ continue; ++ chain_until_rbrace(); + hash_clear(ahash); +- } else if (tclass & TS_OPSEQ) { ++ continue; ++ } ++ seq = &mainseq; ++ if (tclass & TS_OPSEQ) { ++ node *cn; ++ + debug_printf_parse("%s: TS_OPSEQ\n", __func__); + rollback_token(); + cn = chain_node(OC_TEST); + 
cn->l.n = parse_expr(TS_OPTERM | TC_EOF | TC_LBRACE); +- if (t_tclass & TC_LBRACE) { ++ if (t_tclass == TC_LBRACE) { + debug_printf_parse("%s: TC_LBRACE\n", __func__); + rollback_token(); + chain_group(); + } else { ++ /* no action, assume default "{ print }" */ + debug_printf_parse("%s: !TC_LBRACE\n", __func__); + chain_node(OC_PRINT); + } + cn->r.n = mainseq.last; +- } else /* if (tclass & TC_LBRACE) */ { +- debug_printf_parse("%s: TC_LBRACE(?)\n", __func__); +- rollback_token(); +- chain_group(); ++ continue; + } ++ /* tclass == TC_LBRACE */ ++ debug_printf_parse("%s: TC_LBRACE(?)\n", __func__); ++ chain_until_rbrace(); + } +- debug_printf_parse("%s: TC_EOF\n", __func__); + } + + +-- +2.27.0 + + +From bebe1432529281f66d2004e07194718a47207d5d Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Sat, 3 Jul 2021 01:32:03 +0200 +Subject: [PATCH 44/61] awk: open-code TS_OPTERM, no logic changes + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 33 +++++++++++++++++---------------- + 1 file changed, 17 insertions(+), 16 deletions(-) + +diff --git a/editors/awk.c b/editors/awk.c +index c68416873..8c471d693 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -283,7 +283,6 @@ if ((n) & TC_NUMBER ) debug_printf_parse(" NUMBER" ); \ + + #define TS_LVALUE (TC_VARIABLE | TC_ARRAY) + #define TS_STATEMNT (TC_STATX | TC_WHILE) +-#define TS_OPTERM (TC_SEMICOL | TC_NEWLINE) + + /* word tokens, cannot mean something else if not expected */ + #define TS_WORD (TC_IN | TS_STATEMNT | TC_ELSE \ +@@ -291,13 +290,14 @@ if ((n) & TC_NUMBER ) debug_printf_parse(" NUMBER" ); \ + | TC_FUNCDECL | TC_BEGIN | TC_END) + + /* discard newlines after these */ +-#define TS_NOTERM (TC_COMMA | TC_LBRACE | TC_RBRACE \ +- | TS_BINOP | TS_OPTERM) ++#define TS_NOTERM (TS_BINOP | TC_COMMA | TC_LBRACE | TC_RBRACE \ ++ | TC_SEMICOL | TC_NEWLINE) + + /* what can expression begin with */ + #define TS_OPSEQ (TS_OPERAND | TS_UOPPRE | TC_REGEXP) + /* what can group begin with */ +-#define TS_GRPSEQ 
(TS_OPSEQ | TS_OPTERM | TS_STATEMNT | TC_LBRACE) ++#define TS_GRPSEQ (TS_OPSEQ | TS_STATEMNT \ ++ | TC_SEMICOL | TC_NEWLINE | TC_LBRACE) + + /* if previous token class is CONCAT_L and next is CONCAT_R, concatenation */ + /* operator is inserted between them */ +@@ -642,7 +642,7 @@ struct globals2 { + #define g_buf (G.g_buf ) + #define INIT_G() do { \ + SET_PTR_TO_GLOBALS((char*)xzalloc(sizeof(G1)+sizeof(G)) + sizeof(G1)); \ +- t_tclass = TS_OPTERM; \ ++ t_tclass = TC_NEWLINE; \ + G.evaluate__seed = 1; \ + } while (0) + +@@ -1090,7 +1090,7 @@ static uint32_t next_token(uint32_t expected) + const uint32_t *ti; + uint32_t tc, last_token_class; + +- last_token_class = t_tclass; /* t_tclass is initialized to TS_OPTERM */ ++ last_token_class = t_tclass; /* t_tclass is initialized to TC_NEWLINE */ + + debug_printf_parse("%s() expected(%x):", __func__, expected); + debug_parse_print_tc(expected); +@@ -1470,7 +1470,8 @@ static node *parse_expr(uint32_t term_tc) + case TC_LENGTH: + debug_printf_parse("%s: TC_LENGTH\n", __func__); + tc = next_token(TC_LPAREN /* length(...) 
*/ +- | TS_OPTERM /* length; (or newline)*/ ++ | TC_SEMICOL /* length; */ ++ | TC_NEWLINE /* length */ + | TC_RBRACE /* length } */ + | TC_BINOPX /* length NUM */ + | TC_COMMA /* print length, 1 */ +@@ -1516,7 +1517,7 @@ static void chain_expr(uint32_t info) + + n = chain_node(info); + +- n->l.n = parse_expr(TS_OPTERM | TC_RBRACE); ++ n->l.n = parse_expr(TC_SEMICOL | TC_NEWLINE | TC_RBRACE); + if ((info & OF_REQUIRED) && !n->l.n) + syntax_error(EMSG_TOO_FEW_ARGS); + +@@ -1577,8 +1578,8 @@ static void chain_group(void) + chain_until_rbrace(); + return; + } +- if (tc & (TS_OPSEQ | TS_OPTERM)) { +- debug_printf_parse("%s: TS_OPSEQ | TS_OPTERM\n", __func__); ++ if (tc & (TS_OPSEQ | TC_SEMICOL | TC_NEWLINE)) { ++ debug_printf_parse("%s: TS_OPSEQ | TC_SEMICOL | TC_NEWLINE\n", __func__); + rollback_token(); + chain_expr(OC_EXEC | Vx); + return; +@@ -1647,10 +1648,10 @@ static void chain_group(void) + case OC_PRINTF: + debug_printf_parse("%s: OC_PRINT[F]\n", __func__); + n = chain_node(t_info); +- n->l.n = parse_expr(TS_OPTERM | TC_OUTRDR | TC_RBRACE); ++ n->l.n = parse_expr(TC_SEMICOL | TC_NEWLINE | TC_OUTRDR | TC_RBRACE); + if (t_tclass & TC_OUTRDR) { + n->info |= t_info; +- n->r.n = parse_expr(TS_OPTERM | TC_RBRACE); ++ n->r.n = parse_expr(TC_SEMICOL | TC_NEWLINE | TC_RBRACE); + } + if (t_tclass & TC_RBRACE) + rollback_token(); +@@ -1689,14 +1690,14 @@ static void parse_program(char *p) + uint32_t tclass; + + tclass = next_token(TC_EOF | TS_OPSEQ | TC_LBRACE | +- TS_OPTERM | TC_BEGIN | TC_END | TC_FUNCDECL); ++ TC_SEMICOL | TC_NEWLINE | TC_BEGIN | TC_END | TC_FUNCDECL); + + if (tclass == TC_EOF) { + debug_printf_parse("%s: TC_EOF\n", __func__); + break; + } +- if (tclass & TS_OPTERM) { /* ; or */ +- debug_printf_parse("%s: TS_OPTERM\n", __func__); ++ if (tclass & (TC_SEMICOL | TC_NEWLINE)) { ++ debug_printf_parse("%s: TC_SEMICOL | TC_NEWLINE\n", __func__); + //NB: gawk allows many newlines, but does not allow more than one semicolon: + // BEGIN {...};; + //would 
complain "each rule must have a pattern or an action part". +@@ -1762,7 +1763,7 @@ static void parse_program(char *p) + debug_printf_parse("%s: TS_OPSEQ\n", __func__); + rollback_token(); + cn = chain_node(OC_TEST); +- cn->l.n = parse_expr(TS_OPTERM | TC_EOF | TC_LBRACE); ++ cn->l.n = parse_expr(TC_SEMICOL | TC_NEWLINE | TC_EOF | TC_LBRACE); + if (t_tclass == TC_LBRACE) { + debug_printf_parse("%s: TC_LBRACE\n", __func__); + rollback_token(); +-- +2.27.0 + + +From be80050f2cff5967de7a50eb3aed2f95c39357cd Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Sat, 3 Jul 2021 01:59:36 +0200 +Subject: [PATCH 45/61] awk: support %F %a %A in printf + +function old new delta +.rodata 104111 104120 +9 + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/editors/awk.c b/editors/awk.c +index 8c471d693..2c3b49bc8 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -909,7 +909,7 @@ static int fmt_num(char *b, int size, const char *format, double n, int int_as_i + do { c = *s; } while (c && *++s); + if (strchr("diouxX", c)) { + r = snprintf(b, size, format, (int)n); +- } else if (strchr("eEfgG", c)) { ++ } else if (strchr("eEfFgGaA", c)) { + r = snprintf(b, size, format, n); + } else { + syntax_error(EMSG_INV_FMT); +-- +2.27.0 + + +From 8b97bd49bdd5181c211f5d7b64108edf9e8962f4 Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Sat, 3 Jul 2021 11:54:01 +0200 +Subject: [PATCH 46/61] awk: do not use a copy of g_progname for + node->l.new_progname + +We never destroy g_progname's, the strings still exist, no need to copy + +function old new delta +chain_node 104 97 -7 + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/editors/awk.c b/editors/awk.c +index 2c3b49bc8..4119253ec 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -179,7 +179,7 @@ typedef struct node_s { + struct node_s *n; + var *v; + int aidx; +- char *new_progname; 
++ const char *new_progname; + regex_t *re; + } l; + union { +@@ -1501,7 +1501,7 @@ static node *chain_node(uint32_t info) + if (seq->programname != g_progname) { + seq->programname = g_progname; + n = chain_node(OC_NEWSOURCE); +- n->l.new_progname = xstrdup(g_progname); ++ n->l.new_progname = g_progname; + } + + n = seq->last; +-- +2.27.0 + + +From 61dc1b3f2201368a310b0754a74e6152fe6b015d Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Sat, 3 Jul 2021 11:57:59 +0200 +Subject: [PATCH 47/61] awk: rand(): 64-bit constants should be ULL + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/editors/awk.c b/editors/awk.c +index 4119253ec..e4dd6684c 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -3169,9 +3169,9 @@ static var *evaluate(node *op, var *res) + uint64_t v = ((uint64_t)rand() << 32) | u; + /* the above shift+or is optimized out on 32-bit arches */ + # if RAND_MAX > 0x7fffffff +- v &= 0x7fffffffffffffffUL; ++ v &= 0x7fffffffffffffffULL; + # endif +- R_d = (double)v / 0x8000000000000000UL; ++ R_d = (double)v / 0x8000000000000000ULL; + #else + # error Not implemented for this value of RAND_MAX + #endif +-- +2.27.0 + + +From a6468234691fb0718fa0d57b9de4a7748f805af9 Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Sat, 3 Jul 2021 12:20:36 +0200 +Subject: [PATCH 48/61] awk: match(): code shrink + +function old new delta +do_match - 165 +165 +exec_builtin_match 202 - -202 +------------------------------------------------------------------------------ +(add/remove: 1/1 grow/shrink: 0/0 up/down: 165/-202) Total: -37 bytes + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 24 +++++++++++------------- + 1 file changed, 11 insertions(+), 13 deletions(-) + +diff --git a/editors/awk.c b/editors/awk.c +index e4dd6684c..649198d15 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -2497,26 +2497,24 @@ static NOINLINE int do_mktime(const char *ds) + } + + /* Reduce stack usage in 
exec_builtin() by keeping match() code separate */ +-static NOINLINE void exec_builtin_match(node *an1, const char *as0, var *res) ++static NOINLINE var *do_match(node *an1, const char *as0) + { + regmatch_t pmatch[1]; + regex_t sreg, *re; +- int n; ++ int n, start, len; + + re = as_regex(an1, &sreg); + n = regexec(re, as0, 1, pmatch, 0); +- if (n == 0) { +- pmatch[0].rm_so++; +- pmatch[0].rm_eo++; +- } else { +- pmatch[0].rm_so = 0; +- pmatch[0].rm_eo = -1; +- } + if (re == &sreg) + regfree(re); +- setvar_i(newvar("RSTART"), pmatch[0].rm_so); +- setvar_i(newvar("RLENGTH"), pmatch[0].rm_eo - pmatch[0].rm_so); +- setvar_i(res, pmatch[0].rm_so); ++ start = 0; ++ len = -1; ++ if (n == 0) { ++ start = pmatch[0].rm_so + 1; ++ len = pmatch[0].rm_eo - pmatch[0].rm_so; ++ } ++ setvar_i(newvar("RLENGTH"), len); ++ return setvar_i(newvar("RSTART"), start); + } + + /* Reduce stack usage in evaluate() by keeping builtins' code separate */ +@@ -2686,7 +2684,7 @@ static NOINLINE var *exec_builtin(node *op, var *res) + break; + + case B_ma: +- exec_builtin_match(an[1], as[0], res); ++ res = do_match(an[1], as[0]); + break; + + case B_ge: +-- +2.27.0 + + +From 9642f8123d92f8a1db9078178b04d22015d5e03a Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Sat, 3 Jul 2021 13:29:32 +0200 +Subject: [PATCH 49/61] awk: restore strdup elision optimization in assignment + +function old new delta +evaluate 3339 3387 +48 + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 25 +++++++++++++++++-------- + 1 file changed, 17 insertions(+), 8 deletions(-) + +diff --git a/editors/awk.c b/editors/awk.c +index 649198d15..20672db9a 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -102,7 +102,7 @@ enum { + #define VF_USER 0x0200 /* 1 = user input (may be numeric string) */ + #define VF_SPECIAL 0x0400 /* 1 = requires extra handling when changed */ + #define VF_WALK 0x0800 /* 1 = variable has alloc'd x.walker list */ +-#define VF_FSTR 0x1000 /* 1 = var::string points to fstring buffer */ 
++#define VF_FSTR 0x1000 /* 1 = don't free() var::string (not malloced, or is owned by something else) */ + #define VF_CHILD 0x2000 /* 1 = function arg; x.parent points to source */ + #define VF_DIRTY 0x4000 /* 1 = variable was set explicitly */ + +@@ -1371,6 +1371,12 @@ static node *parse_expr(uint32_t term_tc) + cn->a.n = vn->a.n; + if (tc & TS_BINOP) { + cn->l.n = vn; ++//FIXME: this is the place to detect and reject assignments to non-lvalues. ++//Currently we allow "assignments" to consts and temporaries, nonsense like this: ++// awk 'BEGIN { "qwe" = 1 }' ++// awk 'BEGIN { 7 *= 7 }' ++// awk 'BEGIN { length("qwe") = 1 }' ++// awk 'BEGIN { (1+1) += 3 }' + expected_tc = TS_OPERAND | TS_UOPPRE | TC_REGEXP; + if ((t_info & OPCLSMASK) == OC_PGETLINE) { + /* it's a pipe */ +@@ -3043,14 +3049,17 @@ static var *evaluate(node *op, var *res) + case XC( OC_MOVE ): + debug_printf_eval("MOVE\n"); + /* if source is a temporary string, jusk relink it to dest */ +-//Disabled: if R.v is numeric but happens to have cached R.v->string, +-//then L.v ends up being a string, which is wrong +-// if (R.v == TMPVAR1 && R.v->string) { +-// res = setvar_p(L.v, R.v->string); +-// R.v->string = NULL; +-// } else { ++ if (R.v == TMPVAR1 ++ && !(R.v->type & VF_NUMBER) ++ /* Why check !NUMBER? if R.v is a number but has cached R.v->string, ++ * L.v ends up a string, which is wrong */ ++ /*&& R.v->string - always not NULL (right?) */ ++ ) { ++ res = setvar_p(L.v, R.v->string); /* avoids strdup */ ++ R.v->string = NULL; ++ } else { + res = copyvar(L.v, R.v); +-// } ++ } + break; + + case XC( OC_TERNARY ): +-- +2.27.0 + + +From c49ba79e1ce45367a1d994b12d972daae0698beb Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Sat, 3 Jul 2021 13:57:47 +0200 +Subject: [PATCH 50/61] awk: simplify tests for operation class + +Usually, an operation class has only one possible value of "info" word. +In this case, just compare the entire info word, do not bother +to mask OPCLSMASK bits. 
+ +(Example where this is not the case: OC_REPLACE for "=") + +function old new delta +mk_splitter 106 100 -6 +chain_group 616 610 -6 +nextarg 40 32 -8 +exec_builtin 1157 1149 -8 +as_regex 111 103 -8 +awk_split 553 543 -10 +parse_expr 948 936 -12 +awk_getline 656 642 -14 +evaluate 3387 3343 -44 +------------------------------------------------------------------------------ +(add/remove: 0/0 grow/shrink: 0/9 up/down: 0/-116) Total: -116 bytes + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 64 +++++++++++++++++++++++++++++---------------------- + 1 file changed, 36 insertions(+), 28 deletions(-) + +diff --git a/editors/awk.c b/editors/awk.c +index 20672db9a..cd135ef64 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -432,7 +432,8 @@ static const char tokenlist[] ALIGN1 = + static const uint32_t tokeninfo[] ALIGN4 = { + 0, + 0, +- OC_REGEXP, ++#define TI_REGEXP OC_REGEXP ++ TI_REGEXP, + xS|'a', xS|'w', xS|'|', + OC_UNARY|xV|P(9)|'p', OC_UNARY|xV|P(9)|'m', + #define TI_PREINC (OC_UNARY|xV|P(9)|'P') +@@ -443,12 +444,17 @@ static const uint32_t tokeninfo[] ALIGN4 = { + OC_BINARY|NV|P(29)|'+', OC_BINARY|NV|P(29)|'-', OC_REPLACE|NV|P(74)|'&', OC_BINARY|NV|P(15)|'&', + OC_BINARY|NV|P(25)|'/', OC_BINARY|NV|P(25)|'%', OC_BINARY|NV|P(15)|'&', OC_BINARY|NV|P(25)|'*', + OC_COMPARE|VV|P(39)|4, OC_COMPARE|VV|P(39)|3, OC_COMPARE|VV|P(39)|0, OC_COMPARE|VV|P(39)|1, +-#define TI_LESS (OC_COMPARE|VV|P(39)|2) ++#define TI_LESS (OC_COMPARE|VV|P(39)|2) + TI_LESS, OC_MATCH|Sx|P(45)|'!', OC_MATCH|Sx|P(45)|'~', OC_LAND|Vx|P(55), +- OC_LOR|Vx|P(59), OC_TERNARY|Vx|P(64)|'?', OC_COLON|xx|P(67)|':', +- OC_IN|SV|P(49), /* TC_IN */ +- OC_COMMA|SS|P(80), +- OC_PGETLINE|SV|P(37), ++#define TI_TERNARY (OC_TERNARY|Vx|P(64)|'?') ++#define TI_COLON (OC_COLON|xx|P(67)|':') ++ OC_LOR|Vx|P(59), TI_TERNARY, TI_COLON, ++#define TI_IN (OC_IN|SV|P(49)) ++ TI_IN, ++#define TI_COMMA (OC_COMMA|SS|P(80)) ++ TI_COMMA, ++#define TI_PGETLINE (OC_PGETLINE|SV|P(37)) ++ TI_PGETLINE, + OC_UNARY|xV|P(19)|'+', 
OC_UNARY|xV|P(19)|'-', OC_UNARY|xV|P(19)|'!', + 0, /* ] */ + 0, +@@ -456,7 +462,8 @@ static const uint32_t tokeninfo[] ALIGN4 = { + 0, + 0, /* \n */ + ST_IF, ST_DO, ST_FOR, OC_BREAK, +- OC_CONTINUE, OC_DELETE|Rx, OC_PRINT, ++#define TI_PRINT OC_PRINT ++ OC_CONTINUE, OC_DELETE|Rx, TI_PRINT, + OC_PRINTF, OC_NEXT, OC_NEXTFILE, + OC_RETURN|Vx, OC_EXIT|Nx, + ST_WHILE, +@@ -465,8 +472,8 @@ static const uint32_t tokeninfo[] ALIGN4 = { + // Highest byte bit pattern: nn s3s2s1 v3v2v1 + // nn - min. number of args, sN - resolve Nth arg to string, vN - resolve to var + // OC_F's are builtins with zero or one argument. +-// |Rx| enforces that arg is present for: system, close, cos, sin, exp, int, log, sqrt. +-// Check for no args is present in builtins' code (not in this table): rand, systime. ++// |Rx| enforces that arg is present for: system, close, cos, sin, exp, int, log, sqrt ++// Check for no args is present in builtins' code (not in this table): rand, systime + // Have one _optional_ arg: fflush, srand, length + #define OC_B OC_BUILTIN + #define OC_F OC_FBLTIN +@@ -1310,7 +1317,7 @@ static node *new_node(uint32_t info) + + static void mk_re_node(const char *s, node *n, regex_t *re) + { +- n->info = OC_REGEXP; ++ n->info = TI_REGEXP; + n->l.re = re; + n->r.ire = re + 1; + xregcomp(re, s, REG_EXTENDED); +@@ -1360,12 +1367,13 @@ static node *parse_expr(uint32_t term_tc) + * previous operators with higher priority */ + vn = cn; + while (((t_info & PRIMASK) > (vn->a.n->info & PRIMASK2)) +- || ((t_info == vn->info) && ((t_info & OPCLSMASK) == OC_COLON)) ++ || ((t_info == vn->info) && t_info == TI_COLON) + ) { + vn = vn->a.n; + if (!vn->a.n) syntax_error(EMSG_UNEXP_TOKEN); + } +- if ((t_info & OPCLSMASK) == OC_TERNARY) ++ if (t_info == TI_TERNARY) ++//TODO: why? 
+ t_info += P(6); + cn = vn->a.n->r.n = new_node(t_info); + cn->a.n = vn->a.n; +@@ -1378,7 +1386,7 @@ static node *parse_expr(uint32_t term_tc) + // awk 'BEGIN { length("qwe") = 1 }' + // awk 'BEGIN { (1+1) += 3 }' + expected_tc = TS_OPERAND | TS_UOPPRE | TC_REGEXP; +- if ((t_info & OPCLSMASK) == OC_PGETLINE) { ++ if (t_info == TI_PGETLINE) { + /* it's a pipe */ + next_token(TC_GETLINE); + /* give maximum priority to this pipe */ +@@ -1630,7 +1638,7 @@ static void chain_group(void) + next_token(TC_LPAREN); + n2 = parse_expr(TC_SEMICOL | TC_RPAREN); + if (t_tclass & TC_RPAREN) { /* for-in */ +- if (!n2 || (n2->info & OPCLSMASK) != OC_IN) ++ if (!n2 || n2->info != TI_IN) + syntax_error(EMSG_UNEXP_TOKEN); + n = chain_node(OC_WALKINIT | VV); + n->l.n = n2->l.n; +@@ -1834,7 +1842,7 @@ static node *mk_splitter(const char *s, tsplitter *spl) + re = &spl->re[0]; + ire = &spl->re[1]; + n = &spl->n; +- if ((n->info & OPCLSMASK) == OC_REGEXP) { ++ if (n->info == TI_REGEXP) { + regfree(re); + regfree(ire); // TODO: nuke ire, use re+1? + } +@@ -1858,7 +1866,7 @@ static regex_t *as_regex(node *op, regex_t *preg) + int cflags; + const char *s; + +- if ((op->info & OPCLSMASK) == OC_REGEXP) { ++ if (op->info == TI_REGEXP) { + return icase ? 
op->r.ire : op->l.re; + } + +@@ -1968,7 +1976,7 @@ static int awk_split(const char *s, node *spl, char **slist) + c[2] = '\n'; + + n = 0; +- if ((spl->info & OPCLSMASK) == OC_REGEXP) { /* regex split */ ++ if (spl->info == TI_REGEXP) { /* regex split */ + if (!*s) + return n; /* "": zero fields */ + n++; /* at least one field will be there */ +@@ -2135,7 +2143,7 @@ static node *nextarg(node **pn) + node *n; + + n = *pn; +- if (n && (n->info & OPCLSMASK) == OC_COMMA) { ++ if (n && n->info == TI_COMMA) { + *pn = n->r.n; + n = n->l.n; + } else { +@@ -2229,7 +2237,7 @@ static int awk_getline(rstream *rsm, var *v) + so = eo = p; + r = 1; + if (p > 0) { +- if ((rsplitter.n.info & OPCLSMASK) == OC_REGEXP) { ++ if (rsplitter.n.info == TI_REGEXP) { + if (regexec(icase ? rsplitter.n.r.ire : rsplitter.n.l.re, + b, 1, pmatch, 0) == 0) { + so = pmatch[0].rm_so; +@@ -2575,8 +2583,8 @@ static NOINLINE var *exec_builtin(node *op, var *res) + char *s, *s1; + + if (nargs > 2) { +- spl = (an[2]->info & OPCLSMASK) == OC_REGEXP ? +- an[2] : mk_splitter(getvar_s(evaluate(an[2], TMPVAR2)), &tspl); ++ spl = (an[2]->info == TI_REGEXP) ? 
an[2] ++ : mk_splitter(getvar_s(evaluate(an[2], TMPVAR2)), &tspl); + } else { + spl = &fsplitter.n; + } +@@ -2860,7 +2868,7 @@ static var *evaluate(node *op, var *res) + /* test pattern */ + case XC( OC_TEST ): + debug_printf_eval("TEST\n"); +- if ((op1->info & OPCLSMASK) == OC_COMMA) { ++ if (op1->info == TI_COMMA) { + /* it's range pattern */ + if ((opinfo & OF_CHECKED) || ptest(op1->l.n)) { + op->info |= OF_CHECKED; +@@ -2921,7 +2929,7 @@ static var *evaluate(node *op, var *res) + F = rsm->F; + } + +- if ((opinfo & OPCLSMASK) == OC_PRINT) { ++ if (opinfo == TI_PRINT) { + if (!op1) { + fputs(getvar_s(intvar[F0]), F); + } else { +@@ -2940,7 +2948,7 @@ static var *evaluate(node *op, var *res) + } + } + fputs(getvar_s(intvar[ORS]), F); +- } else { /* OC_PRINTF */ ++ } else { /* PRINTF */ + char *s = awk_printf(op1, &len); + #if ENABLE_FEATURE_AWK_GNU_EXTENSIONS + fwrite(s, len, 1, F); +@@ -3064,7 +3072,7 @@ static var *evaluate(node *op, var *res) + + case XC( OC_TERNARY ): + debug_printf_eval("TERNARY\n"); +- if ((op->r.n->info & OPCLSMASK) != OC_COLON) ++ if (op->r.n->info != TI_COLON) + syntax_error(EMSG_POSSIBLE_ERROR); + res = evaluate(istrue(L.v) ? 
op->r.n->l.n : op->r.n->r.n, res); + break; +@@ -3122,7 +3130,7 @@ static var *evaluate(node *op, var *res) + if (op1) { + rsm = newfile(L.s); + if (!rsm->F) { +- if ((opinfo & OPCLSMASK) == OC_PGETLINE) { ++ if (opinfo == TI_PGETLINE) { + rsm->F = popen(L.s, "r"); + rsm->is_pipe = TRUE; + } else { +@@ -3158,7 +3166,7 @@ static var *evaluate(node *op, var *res) + double R_d = R_d; /* for compiler */ + debug_printf_eval("FBLTIN\n"); + +- if (op1 && (op1->info & OPCLSMASK) == OC_COMMA) ++ if (op1 && op1->info == TI_COMMA) + /* Simple builtins take one arg maximum */ + syntax_error("Too many arguments"); + +@@ -3358,7 +3366,7 @@ static var *evaluate(node *op, var *res) + case XC( OC_COMMA ): { + const char *sep = ""; + debug_printf_eval("COMMA\n"); +- if ((opinfo & OPCLSMASK) == OC_COMMA) ++ if (opinfo == TI_COMMA) + sep = getvar_s(intvar[SUBSEP]); + setvar_p(res, xasprintf("%s%s%s", L.s, sep, R.s)); + break; +-- +2.27.0 + + +From 39122ab01367775898f3f46394942138176b4101 Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Sun, 4 Jul 2021 01:25:34 +0200 +Subject: [PATCH 51/61] awk: fix printf buffer overflow + +function old new delta +awk_printf 468 546 +78 +fmt_num 239 247 +8 +getvar_s 125 111 -14 +evaluate 3343 3329 -14 +------------------------------------------------------------------------------ +(add/remove: 0/0 grow/shrink: 2/2 up/down: 86/-28) Total: 58 bytes + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 94 ++++++++++++++++++++++++++++++--------------------- + 1 file changed, 55 insertions(+), 39 deletions(-) + +diff --git a/editors/awk.c b/editors/awk.c +index cd135ef64..a440a6234 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -904,25 +904,23 @@ static double my_strtod(char **pp) + + /* -------- working with variables (set/get/copy/etc) -------- */ + +-static int fmt_num(char *b, int size, const char *format, double n, int int_as_int) ++static void fmt_num(const char *format, double n) + { +- int r = 0; +- char c; +- const char *s = format; 
+- +- if (int_as_int && n == (long long)n) { +- r = snprintf(b, size, "%lld", (long long)n); ++ if (n == (long long)n) { ++ snprintf(g_buf, MAXVARFMT, "%lld", (long long)n); + } else { ++ const char *s = format; ++ char c; ++ + do { c = *s; } while (c && *++s); + if (strchr("diouxX", c)) { +- r = snprintf(b, size, format, (int)n); ++ snprintf(g_buf, MAXVARFMT, format, (int)n); + } else if (strchr("eEfFgGaA", c)) { +- r = snprintf(b, size, format, n); ++ snprintf(g_buf, MAXVARFMT, format, n); + } else { + syntax_error(EMSG_INV_FMT); + } + } +- return r; + } + + static xhash *iamarray(var *a) +@@ -999,7 +997,7 @@ static const char *getvar_s(var *v) + { + /* if v is numeric and has no cached string, convert it to string */ + if ((v->type & (VF_NUMBER | VF_CACHED)) == VF_NUMBER) { +- fmt_num(g_buf, MAXVARFMT, getvar_s(intvar[CONVFMT]), v->number, TRUE); ++ fmt_num(getvar_s(intvar[CONVFMT]), v->number); + v->string = xstrdup(g_buf); + v->type |= VF_CACHED; + } +@@ -2315,12 +2313,9 @@ static int awk_getline(rstream *rsm, var *v) + #endif + static char *awk_printf(node *n, int *len) + { +- char *b = NULL; +- char *fmt, *s, *f; +- const char *s1; +- int i, j, incr, bsize; +- char c, c1; +- var *arg; ++ char *b; ++ char *fmt, *f; ++ int i; + + //tmpvar = nvalloc(1); + #define TMPVAR (&G.awk_printf__tmpvar) +@@ -2333,8 +2328,14 @@ static char *awk_printf(node *n, int *len) + // to evaluate() potentially recursing into another awk_printf() can't + // mangle the value. 
+ ++ b = NULL; + i = 0; +- while (*f) { ++ while (*f) { /* "print one format spec" loop */ ++ char *s; ++ char c; ++ char sv; ++ var *arg; ++ + s = f; + while (*f && (*f != '%' || *++f == '%')) + f++; +@@ -2343,40 +2344,55 @@ static char *awk_printf(node *n, int *len) + syntax_error("%*x formats are not supported"); + f++; + } +- +- incr = (f - s) + MAXVARFMT; +- b = qrealloc(b, incr + i, &bsize); + c = *f; +- if (c != '\0') +- f++; +- c1 = *f; ++ if (!c) { ++ /* Tail of fmt with no percent chars, ++ * or "....%" (percent seen, but no format specifier char found) ++ */ ++ goto tail; ++ } ++ sv = *++f; + *f = '\0'; + arg = evaluate(nextarg(&n), TMPVAR); + +- j = i; +- if (c == 'c' || !c) { +- i += sprintf(b+i, s, is_numeric(arg) ? ++ /* Result can be arbitrarily long. Example: ++ * printf "%99999s", "BOOM" ++ */ ++ if (c == 'c') { ++ s = xasprintf(s, is_numeric(arg) ? + (char)getvar_i(arg) : *getvar_s(arg)); + } else if (c == 's') { +- s1 = getvar_s(arg); +- b = qrealloc(b, incr+i+strlen(s1), &bsize); +- i += sprintf(b+i, s, s1); ++ s = xasprintf(s, getvar_s(arg)); + } else { +- i += fmt_num(b+i, incr, s, getvar_i(arg), FALSE); ++ double d = getvar_i(arg); ++ if (strchr("diouxX", c)) { ++//TODO: make it wider here (%x -> %llx etc)? ++ s = xasprintf(s, (int)d); ++ } else if (strchr("eEfFgGaA", c)) { ++ s = xasprintf(s, d); ++ } else { ++ syntax_error(EMSG_INV_FMT); ++ } + } +- *f = c1; ++ *f = sv; + +- /* if there was an error while sprintf, return value is negative */ +- if (i < j) +- i = j; ++ if (i == 0) { ++ b = s; ++ i = strlen(b); ++ continue; ++ } ++ tail: ++ b = xrealloc(b, i + strlen(s) + 1); ++ i = stpcpy(b + i, s) - b; ++ if (!c) /* tail? 
*/ ++ break; ++ free(s); + } + + free(fmt); + //nvfree(tmpvar, 1); + #undef TMPVAR + +- b = xrealloc(b, i + 1); +- b[i] = '\0'; + #if ENABLE_FEATURE_AWK_GNU_EXTENSIONS + if (len) + *len = i; +@@ -2936,8 +2952,8 @@ static var *evaluate(node *op, var *res) + for (;;) { + var *v = evaluate(nextarg(&op1), TMPVAR0); + if (v->type & VF_NUMBER) { +- fmt_num(g_buf, MAXVARFMT, getvar_s(intvar[OFMT]), +- getvar_i(v), TRUE); ++ fmt_num(getvar_s(intvar[OFMT]), ++ getvar_i(v)); + fputs(g_buf, F); + } else { + fputs(getvar_s(v), F); +-- +2.27.0 + + +From 9c55f6ae3f528a3416368e0aff9942d5b4ed216d Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Sun, 11 Jul 2021 11:46:21 +0200 +Subject: [PATCH 52/61] awk: rollback_token() + chain_group() == + chain_until_rbrace() + +function old new delta +parse_program 336 332 -4 + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +diff --git a/editors/awk.c b/editors/awk.c +index a440a6234..755e68fc7 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -1778,8 +1778,7 @@ static void parse_program(char *p) + cn->l.n = parse_expr(TC_SEMICOL | TC_NEWLINE | TC_EOF | TC_LBRACE); + if (t_tclass == TC_LBRACE) { + debug_printf_parse("%s: TC_LBRACE\n", __func__); +- rollback_token(); +- chain_group(); ++ chain_until_rbrace(); + } else { + /* no action, assume default "{ print }" */ + debug_printf_parse("%s: !TC_LBRACE\n", __func__); +-- +2.27.0 + + +From bd0d2c3b5bf5c9337e67b43222bafcdf80c4e36a Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Sun, 11 Jul 2021 12:00:31 +0200 +Subject: [PATCH 53/61] awk: undo TI_PRINT, it introduced a bug (print with any + redirect acting as printf) + +function old new delta +evaluate 3329 3337 +8 + +Patch by Ron Yorston + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 8 +++++--- + testsuite/awk.tests | 5 +++++ + 2 files changed, 10 insertions(+), 3 deletions(-) + +diff --git a/editors/awk.c b/editors/awk.c +index 755e68fc7..0aa7c0804 100644 +--- 
a/editors/awk.c ++++ b/editors/awk.c +@@ -462,8 +462,7 @@ static const uint32_t tokeninfo[] ALIGN4 = { + 0, + 0, /* \n */ + ST_IF, ST_DO, ST_FOR, OC_BREAK, +-#define TI_PRINT OC_PRINT +- OC_CONTINUE, OC_DELETE|Rx, TI_PRINT, ++ OC_CONTINUE, OC_DELETE|Rx, OC_PRINT, + OC_PRINTF, OC_NEXT, OC_NEXTFILE, + OC_RETURN|Vx, OC_EXIT|Nx, + ST_WHILE, +@@ -2944,7 +2943,10 @@ static var *evaluate(node *op, var *res) + F = rsm->F; + } + +- if (opinfo == TI_PRINT) { ++ /* Can't just check 'opinfo == OC_PRINT' here, parser ORs ++ * additional bits to opinfos of print/printf with redirects ++ */ ++ if ((opinfo & OPCLSMASK) == OC_PRINT) { + if (!op1) { + fputs(getvar_s(intvar[F0]), F); + } else { +diff --git a/testsuite/awk.tests b/testsuite/awk.tests +index 770d8ffce..6b23b91cb 100755 +--- a/testsuite/awk.tests ++++ b/testsuite/awk.tests +@@ -450,4 +450,9 @@ testing "awk exit N propagates through END's exit" \ + "42\n" \ + '' '' + ++testing "awk print + redirect" \ ++ "awk 'BEGIN { print \"STDERR %s\" >\"/dev/stderr\" }' 2>&1" \ ++ "STDERR %s\n" \ ++ '' '' ++ + exit $FAILCOUNT +-- +2.27.0 + + +From 5ed199c07d9ffc947443118dda0e0af6569588d5 Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Sun, 11 Jul 2021 12:25:33 +0200 +Subject: [PATCH 54/61] awk: unbreak "printf('%c') can output NUL" testcase + +function old new delta +awk_printf 546 593 +47 + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 43 ++++++++++++++++++++++++++----------------- + 1 file changed, 26 insertions(+), 17 deletions(-) + +diff --git a/editors/awk.c b/editors/awk.c +index 0aa7c0804..e765d3fcf 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -2309,11 +2309,11 @@ static int awk_getline(rstream *rsm, var *v) + #if !ENABLE_FEATURE_AWK_GNU_EXTENSIONS + # define awk_printf(a, b) awk_printf(a) + #endif +-static char *awk_printf(node *n, int *len) ++static char *awk_printf(node *n, size_t *len) + { + char *b; + char *fmt, *f; +- int i; ++ size_t i; + + //tmpvar = nvalloc(1); + #define TMPVAR 
(&G.awk_printf__tmpvar) +@@ -2333,6 +2333,7 @@ static char *awk_printf(node *n, int *len) + char c; + char sv; + var *arg; ++ size_t slen; + + s = f; + while (*f && (*f != '%' || *++f == '%')) +@@ -2347,6 +2348,7 @@ static char *awk_printf(node *n, int *len) + /* Tail of fmt with no percent chars, + * or "....%" (percent seen, but no format specifier char found) + */ ++ slen = strlen(s); + goto tail; + } + sv = *++f; +@@ -2357,31 +2359,38 @@ static char *awk_printf(node *n, int *len) + * printf "%99999s", "BOOM" + */ + if (c == 'c') { +- s = xasprintf(s, is_numeric(arg) ? +- (char)getvar_i(arg) : *getvar_s(arg)); +- } else if (c == 's') { +- s = xasprintf(s, getvar_s(arg)); ++ c = is_numeric(arg) ? getvar_i(arg) : *getvar_s(arg); ++ s = xasprintf(s, c); ++ /* + 1 if c == NUL: handle printf "%c" 0 case ++ * (and printf "%22c" 0 etc, but still fails for e.g. printf "%-22c" 0) */ ++ slen = strlen(s) + (c == '\0'); + } else { +- double d = getvar_i(arg); +- if (strchr("diouxX", c)) { +-//TODO: make it wider here (%x -> %llx etc)? +- s = xasprintf(s, (int)d); +- } else if (strchr("eEfFgGaA", c)) { +- s = xasprintf(s, d); ++ if (c == 's') { ++ s = xasprintf(s, getvar_s(arg)); + } else { +- syntax_error(EMSG_INV_FMT); ++ double d = getvar_i(arg); ++ if (strchr("diouxX", c)) { ++//TODO: make it wider here (%x -> %llx etc)? ++ s = xasprintf(s, (int)d); ++ } else if (strchr("eEfFgGaA", c)) { ++ s = xasprintf(s, d); ++ } else { ++ syntax_error(EMSG_INV_FMT); ++ } + } ++ slen = strlen(s); + } + *f = sv; + + if (i == 0) { + b = s; +- i = strlen(b); ++ i = slen; + continue; + } + tail: +- b = xrealloc(b, i + strlen(s) + 1); +- i = stpcpy(b + i, s) - b; ++ b = xrealloc(b, i + slen + 1); ++ strcpy(b + i, s); ++ i += slen; + if (!c) /* tail? 
*/ + break; + free(s); +@@ -2926,7 +2935,6 @@ static var *evaluate(node *op, var *res) + debug_printf_eval("PRINTF\n"); + { + FILE *F = stdout; +- IF_FEATURE_AWK_GNU_EXTENSIONS(int len;) + + if (op->r.n) { + rstream *rsm = newfile(R.s); +@@ -2966,6 +2974,7 @@ static var *evaluate(node *op, var *res) + } + fputs(getvar_s(intvar[ORS]), F); + } else { /* PRINTF */ ++ IF_FEATURE_AWK_GNU_EXTENSIONS(size_t len;) + char *s = awk_printf(op1, &len); + #if ENABLE_FEATURE_AWK_GNU_EXTENSIONS + fwrite(s, len, 1, F); +-- +2.27.0 + + +From f38b2d9bcddd00432150567bef8f8a2bf0d1ed43 Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Sun, 11 Jul 2021 12:51:43 +0200 +Subject: [PATCH 55/61] awk: unbreak "cmd" | getline + +function old new delta +evaluate 3337 3343 +6 + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 3 ++- + testsuite/awk.tests | 5 +++++ + 2 files changed, 7 insertions(+), 1 deletion(-) + +diff --git a/editors/awk.c b/editors/awk.c +index e765d3fcf..6c60a0615 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -3156,7 +3156,8 @@ static var *evaluate(node *op, var *res) + if (op1) { + rsm = newfile(L.s); + if (!rsm->F) { +- if (opinfo == TI_PGETLINE) { ++ /* NB: can't use "opinfo == TI_PGETLINE", would break "cmd" | getline */ ++ if ((opinfo & OPCLSMASK) == OC_PGETLINE) { + rsm->F = popen(L.s, "r"); + rsm->is_pipe = TRUE; + } else { +diff --git a/testsuite/awk.tests b/testsuite/awk.tests +index 6b23b91cb..242c897d1 100755 +--- a/testsuite/awk.tests ++++ b/testsuite/awk.tests +@@ -455,4 +455,9 @@ testing "awk print + redirect" \ + "STDERR %s\n" \ + '' '' + ++testing "awk \"cmd\" | getline" \ ++ "awk 'BEGIN { \"echo HELLO\" | getline; print }'" \ ++ "HELLO\n" \ ++ '' '' ++ + exit $FAILCOUNT +-- +2.27.0 + + +From 3a759a81580a1f7d9b4428e30c623324ec2e3699 Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Sun, 11 Jul 2021 18:16:10 +0200 +Subject: [PATCH 56/61] awk: fix corner case in awk_printf + +Example where it wasn't working: + awk 'BEGIN { printf "qwe %s rty 
%c uio\n", "a", 0, "c" }' +- the NUL printing in %c caused premature stop of printing. + +function old new delta +awk_printf 593 596 +3 + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/editors/awk.c b/editors/awk.c +index 6c60a0615..465033f5f 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -2359,11 +2359,11 @@ static char *awk_printf(node *n, size_t *len) + * printf "%99999s", "BOOM" + */ + if (c == 'c') { +- c = is_numeric(arg) ? getvar_i(arg) : *getvar_s(arg); +- s = xasprintf(s, c); +- /* + 1 if c == NUL: handle printf "%c" 0 case ++ char cc = is_numeric(arg) ? getvar_i(arg) : *getvar_s(arg); ++ s = xasprintf(s, cc); ++ /* + 1 if cc == NUL: handle printf "%c" 0 case + * (and printf "%22c" 0 etc, but still fails for e.g. printf "%-22c" 0) */ +- slen = strlen(s) + (c == '\0'); ++ slen = strlen(s) + (cc == '\0'); + } else { + if (c == 's') { + s = xasprintf(s, getvar_s(arg)); +-- +2.27.0 + + +From e62366d32f13e059266e2996a68be023bef309ef Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Mon, 12 Jul 2021 11:27:11 +0200 +Subject: [PATCH 57/61] awk: fix printf "%-10c", 0 + +function old new delta +awk_printf 596 626 +30 + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 9 +++++---- + testsuite/awk.tests | 8 ++++++++ + 2 files changed, 13 insertions(+), 4 deletions(-) + +diff --git a/editors/awk.c b/editors/awk.c +index 465033f5f..437d87ecf 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -2360,10 +2360,11 @@ static char *awk_printf(node *n, size_t *len) + */ + if (c == 'c') { + char cc = is_numeric(arg) ? getvar_i(arg) : *getvar_s(arg); +- s = xasprintf(s, cc); +- /* + 1 if cc == NUL: handle printf "%c" 0 case +- * (and printf "%22c" 0 etc, but still fails for e.g. printf "%-22c" 0) */ +- slen = strlen(s) + (cc == '\0'); ++ char *r = xasprintf(s, cc ? 
cc : '^' /* else strlen will be wrong */); ++ slen = strlen(r); ++ if (cc == '\0') /* if cc is NUL, re-format the string with it */ ++ sprintf(r, s, cc); ++ s = r; + } else { + if (c == 's') { + s = xasprintf(s, getvar_s(arg)); +diff --git a/testsuite/awk.tests b/testsuite/awk.tests +index 242c897d1..3cddb4dd4 100755 +--- a/testsuite/awk.tests ++++ b/testsuite/awk.tests +@@ -415,6 +415,14 @@ testing "awk printf('%c') can output NUL" \ + "awk '{printf(\"hello%c null\n\", 0)}'" "hello\0 null\n" "" "\n" + SKIP= + ++optional FEATURE_AWK_GNU_EXTENSIONS ++testing "awk printf('%-10c') can output NUL" \ ++ "awk 'BEGIN { printf \"[%-10c]\n\", 0 }' | od -tx1" "\ ++0000000 5b 00 20 20 20 20 20 20 20 20 20 5d 0a ++0000015 ++" "" "" ++SKIP= ++ + # testing "description" "command" "result" "infile" "stdin" + testing 'awk negative field access' \ + 'awk 2>&1 -- '\''{ $(-1) }'\' \ +-- +2.27.0 + + +From 258057e67d4403d43f48788fabdf874c1bb59502 Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Mon, 12 Jul 2021 13:30:30 +0200 +Subject: [PATCH 58/61] awk: in parsing, remove superfluous NEWLINE check; + optimize builtin arg evaluation + +function old new delta +exec_builtin 1149 1145 -4 + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 11 ++++++----- + 1 file changed, 6 insertions(+), 5 deletions(-) + +diff --git a/editors/awk.c b/editors/awk.c +index 437d87ecf..7a282356d 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -1589,8 +1589,8 @@ static void chain_group(void) + chain_until_rbrace(); + return; + } +- if (tc & (TS_OPSEQ | TC_SEMICOL | TC_NEWLINE)) { +- debug_printf_parse("%s: TS_OPSEQ | TC_SEMICOL | TC_NEWLINE\n", __func__); ++ if (tc & (TS_OPSEQ | TC_SEMICOL)) { ++ debug_printf_parse("%s: TS_OPSEQ | TC_SEMICOL\n", __func__); + rollback_token(); + chain_expr(OC_EXEC | Vx); + return; +@@ -2582,10 +2582,11 @@ static NOINLINE var *exec_builtin(node *op, var *res) + av[2] = av[3] = NULL; + for (i = 0; i < 4 && op; i++) { + an[i] = nextarg(&op); +- if (isr & 0x09000000) 
++ if (isr & 0x09000000) { + av[i] = evaluate(an[i], TMPVAR(i)); +- if (isr & 0x08000000) +- as[i] = getvar_s(av[i]); ++ if (isr & 0x08000000) ++ as[i] = getvar_s(av[i]); ++ } + isr >>= 1; + } + +-- +2.27.0 + + +From 18fe636700ac5d795027d920922340410f65640e Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Wed, 14 Jul 2021 14:25:07 +0200 +Subject: [PATCH 59/61] awk: tighten parsing - disallow extra semicolons + +'; BEGIN {...}' and 'BEGIN {...} ;; {...}' are not accepted by gawk + +function old new delta +parse_program 332 353 +21 + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 40 ++++++++++++++++++++++++---------------- + 1 file changed, 24 insertions(+), 16 deletions(-) + +diff --git a/editors/awk.c b/editors/awk.c +index 7a282356d..2f8a18c8e 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -1634,7 +1634,7 @@ static void chain_group(void) + debug_printf_parse("%s: ST_FOR\n", __func__); + next_token(TC_LPAREN); + n2 = parse_expr(TC_SEMICOL | TC_RPAREN); +- if (t_tclass & TC_RPAREN) { /* for-in */ ++ if (t_tclass & TC_RPAREN) { /* for (I in ARRAY) */ + if (!n2 || n2->info != TI_IN) + syntax_error(EMSG_UNEXP_TOKEN); + n = chain_node(OC_WALKINIT | VV); +@@ -1700,20 +1700,15 @@ static void parse_program(char *p) + for (;;) { + uint32_t tclass; + +- tclass = next_token(TC_EOF | TS_OPSEQ | TC_LBRACE | +- TC_SEMICOL | TC_NEWLINE | TC_BEGIN | TC_END | TC_FUNCDECL); +- ++ tclass = next_token(TS_OPSEQ | TC_LBRACE | TC_BEGIN | TC_END | TC_FUNCDECL ++ | TC_EOF | TC_NEWLINE /* but not TC_SEMICOL */); ++ got_tok: + if (tclass == TC_EOF) { + debug_printf_parse("%s: TC_EOF\n", __func__); + break; + } +- if (tclass & (TC_SEMICOL | TC_NEWLINE)) { +- debug_printf_parse("%s: TC_SEMICOL | TC_NEWLINE\n", __func__); +-//NB: gawk allows many newlines, but does not allow more than one semicolon: +-// BEGIN {...};; +-//would complain "each rule must have a pattern or an action part". 
+-//Same message for +-// ; BEGIN {...} ++ if (tclass == TC_NEWLINE) { ++ debug_printf_parse("%s: TC_NEWLINE\n", __func__); + continue; + } + if (tclass == TC_BEGIN) { +@@ -1722,7 +1717,7 @@ static void parse_program(char *p) + /* ensure there is no newline between BEGIN and { */ + next_token(TC_LBRACE); + chain_until_rbrace(); +- continue; ++ goto next_tok; + } + if (tclass == TC_END) { + debug_printf_parse("%s: TC_END\n", __func__); +@@ -1730,7 +1725,7 @@ static void parse_program(char *p) + /* ensure there is no newline between END and { */ + next_token(TC_LBRACE); + chain_until_rbrace(); +- continue; ++ goto next_tok; + } + if (tclass == TC_FUNCDECL) { + func *f; +@@ -1765,7 +1760,7 @@ static void parse_program(char *p) + continue; + chain_until_rbrace(); + hash_clear(ahash); +- continue; ++ goto next_tok; + } + seq = &mainseq; + if (tclass & TS_OPSEQ) { +@@ -1784,12 +1779,25 @@ static void parse_program(char *p) + chain_node(OC_PRINT); + } + cn->r.n = mainseq.last; +- continue; ++ goto next_tok; + } + /* tclass == TC_LBRACE */ + debug_printf_parse("%s: TC_LBRACE(?)\n", __func__); + chain_until_rbrace(); +- } ++ next_tok: ++ /* Same as next_token() at the top of the loop, + TC_SEMICOL */ ++ tclass = next_token(TS_OPSEQ | TC_LBRACE | TC_BEGIN | TC_END | TC_FUNCDECL ++ | TC_EOF | TC_NEWLINE | TC_SEMICOL); ++ /* gawk allows many newlines, but does not allow more than one semicolon: ++ * BEGIN {...};; ++ * would complain "each rule must have a pattern or an action part". 
++ * Same message for ++ * ; BEGIN {...} ++ */ ++ if (tclass != TC_SEMICOL) ++ goto got_tok; /* use this token */ ++ /* else: loop back - ate the semicolon, get and use _next_ token */ ++ } /* for (;;) */ + } + + +-- +2.27.0 + + +From 9b502f61277aa48a412dd1a18e7a30b5d4c3d71a Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Wed, 14 Jul 2021 14:33:37 +0200 +Subject: [PATCH 60/61] awk: disallow break/continue outside of loops + +function old new delta +.rodata 104139 104186 +47 +chain_group 610 633 +23 +------------------------------------------------------------------------------ +(add/remove: 0/0 grow/shrink: 2/0 up/down: 70/0) Total: 70 bytes + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 6 ++++-- + testsuite/awk.tests | 9 ++------- + 2 files changed, 6 insertions(+), 9 deletions(-) + +diff --git a/editors/awk.c b/editors/awk.c +index 2f8a18c8e..607d69487 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -1671,16 +1671,18 @@ static void chain_group(void) + case OC_BREAK: + debug_printf_parse("%s: OC_BREAK\n", __func__); + n = chain_node(OC_EXEC); ++ if (!break_ptr) ++ syntax_error("'break' not in a loop"); + n->a.n = break_ptr; +-//TODO: if break_ptr is NULL, syntax error (not in the loop)? + chain_expr(t_info); + break; + + case OC_CONTINUE: + debug_printf_parse("%s: OC_CONTINUE\n", __func__); + n = chain_node(OC_EXEC); ++ if (!continue_ptr) ++ syntax_error("'continue' not in a loop"); + n->a.n = continue_ptr; +-//TODO: if continue_ptr is NULL, syntax error (not in the loop)? + chain_expr(t_info); + break; + +diff --git a/testsuite/awk.tests b/testsuite/awk.tests +index 3cddb4dd4..f53b1efe2 100755 +--- a/testsuite/awk.tests ++++ b/testsuite/awk.tests +@@ -379,19 +379,14 @@ testing "awk -e and ARGC" \ + "" + SKIP= + +-# The examples are in fact not valid awk programs (break/continue +-# can only be used inside loops). +-# But we do accept them outside of loops. +-# We had a bug with misparsing "break ; else" sequence. 
+-# Test that *that* bug is fixed, using simplest possible scripts: + testing "awk break" \ + "awk -f - 2>&1; echo \$?" \ +- "0\n" \ ++ "awk: -:1: 'break' not in a loop\n1\n" \ + "" \ + 'BEGIN { if (1) break; else a = 1 }' + testing "awk continue" \ + "awk -f - 2>&1; echo \$?" \ +- "0\n" \ ++ "awk: -:1: 'continue' not in a loop\n1\n" \ + "" \ + 'BEGIN { if (1) continue; else a = 1 }' + +-- +2.27.0 + + +From 027b43ab6700b85f037fb69c08ad052cff6a7384 Mon Sep 17 00:00:00 2001 +From: Denys Vlasenko +Date: Wed, 14 Jul 2021 16:58:05 +0200 +Subject: [PATCH 61/61] awk: whitespace and debugging tweaks + +Signed-off-by: Denys Vlasenko +--- + editors/awk.c | 133 +++++++++++++++++++++++++------------------------- + 1 file changed, 66 insertions(+), 67 deletions(-) + +diff --git a/editors/awk.c b/editors/awk.c +index 607d69487..3adbca7aa 100644 +--- a/editors/awk.c ++++ b/editors/awk.c +@@ -199,77 +199,78 @@ typedef struct tsplitter_s { + + /* simple token classes */ + /* order and hex values are very important!!! See next_token() */ +-#define TC_LPAREN (1 << 0) /* ( */ +-#define TC_RPAREN (1 << 1) /* ) */ +-#define TC_REGEXP (1 << 2) /* /.../ */ +-#define TC_OUTRDR (1 << 3) /* | > >> */ +-#define TC_UOPPOST (1 << 4) /* unary postfix operator ++ -- */ +-#define TC_UOPPRE1 (1 << 5) /* unary prefix operator ++ -- $ */ +-#define TC_BINOPX (1 << 6) /* two-opnd operator */ +-#define TC_IN (1 << 7) /* 'in' */ +-#define TC_COMMA (1 << 8) /* , */ +-#define TC_PIPE (1 << 9) /* input redirection pipe | */ +-#define TC_UOPPRE2 (1 << 10) /* unary prefix operator + - ! */ +-#define TC_ARRTERM (1 << 11) /* ] */ +-#define TC_LBRACE (1 << 12) /* { */ +-#define TC_RBRACE (1 << 13) /* } */ +-#define TC_SEMICOL (1 << 14) /* ; */ +-#define TC_NEWLINE (1 << 15) +-#define TC_STATX (1 << 16) /* ctl statement (for, next...) 
*/ +-#define TC_WHILE (1 << 17) /* 'while' */ +-#define TC_ELSE (1 << 18) /* 'else' */ +-#define TC_BUILTIN (1 << 19) ++#define TC_LPAREN (1 << 0) /* ( */ ++#define TC_RPAREN (1 << 1) /* ) */ ++#define TC_REGEXP (1 << 2) /* /.../ */ ++#define TC_OUTRDR (1 << 3) /* | > >> */ ++#define TC_UOPPOST (1 << 4) /* unary postfix operator ++ -- */ ++#define TC_UOPPRE1 (1 << 5) /* unary prefix operator ++ -- $ */ ++#define TC_BINOPX (1 << 6) /* two-opnd operator */ ++#define TC_IN (1 << 7) /* 'in' */ ++#define TC_COMMA (1 << 8) /* , */ ++#define TC_PIPE (1 << 9) /* input redirection pipe | */ ++#define TC_UOPPRE2 (1 << 10) /* unary prefix operator + - ! */ ++#define TC_ARRTERM (1 << 11) /* ] */ ++#define TC_LBRACE (1 << 12) /* { */ ++#define TC_RBRACE (1 << 13) /* } */ ++#define TC_SEMICOL (1 << 14) /* ; */ ++#define TC_NEWLINE (1 << 15) ++#define TC_STATX (1 << 16) /* ctl statement (for, next...) */ ++#define TC_WHILE (1 << 17) /* 'while' */ ++#define TC_ELSE (1 << 18) /* 'else' */ ++#define TC_BUILTIN (1 << 19) + /* This costs ~50 bytes of code. + * A separate class to support deprecated "length" form. If we don't need that + * (i.e. if we demand that only "length()" with () is valid), then TC_LENGTH + * can be merged with TC_BUILTIN: + */ +-#define TC_LENGTH (1 << 20) /* 'length' */ +-#define TC_GETLINE (1 << 21) /* 'getline' */ +-#define TC_FUNCDECL (1 << 22) /* 'function' 'func' */ +-#define TC_BEGIN (1 << 23) /* 'BEGIN' */ +-#define TC_END (1 << 24) /* 'END' */ +-#define TC_EOF (1 << 25) +-#define TC_VARIABLE (1 << 26) /* name */ +-#define TC_ARRAY (1 << 27) /* name[ */ +-#define TC_FUNCTION (1 << 28) /* name( */ +-#define TC_STRING (1 << 29) /* "..." 
*/ +-#define TC_NUMBER (1 << 30) ++#define TC_LENGTH (1 << 20) /* 'length' */ ++#define TC_GETLINE (1 << 21) /* 'getline' */ ++#define TC_FUNCDECL (1 << 22) /* 'function' 'func' */ ++#define TC_BEGIN (1 << 23) /* 'BEGIN' */ ++#define TC_END (1 << 24) /* 'END' */ ++#define TC_EOF (1 << 25) ++#define TC_VARIABLE (1 << 26) /* name */ ++#define TC_ARRAY (1 << 27) /* name[ */ ++#define TC_FUNCTION (1 << 28) /* name( */ ++#define TC_STRING (1 << 29) /* "..." */ ++#define TC_NUMBER (1 << 30) + + #ifndef debug_parse_print_tc +-#define debug_parse_print_tc(n) do { \ +-if ((n) & TC_LPAREN ) debug_printf_parse(" LPAREN" ); \ +-if ((n) & TC_RPAREN ) debug_printf_parse(" RPAREN" ); \ +-if ((n) & TC_REGEXP ) debug_printf_parse(" REGEXP" ); \ +-if ((n) & TC_OUTRDR ) debug_printf_parse(" OUTRDR" ); \ +-if ((n) & TC_UOPPOST ) debug_printf_parse(" UOPPOST" ); \ +-if ((n) & TC_UOPPRE1 ) debug_printf_parse(" UOPPRE1" ); \ +-if ((n) & TC_BINOPX ) debug_printf_parse(" BINOPX" ); \ +-if ((n) & TC_IN ) debug_printf_parse(" IN" ); \ +-if ((n) & TC_COMMA ) debug_printf_parse(" COMMA" ); \ +-if ((n) & TC_PIPE ) debug_printf_parse(" PIPE" ); \ +-if ((n) & TC_UOPPRE2 ) debug_printf_parse(" UOPPRE2" ); \ +-if ((n) & TC_ARRTERM ) debug_printf_parse(" ARRTERM" ); \ +-if ((n) & TC_LBRACE ) debug_printf_parse(" LBRACE" ); \ +-if ((n) & TC_RBRACE ) debug_printf_parse(" RBRACE" ); \ +-if ((n) & TC_SEMICOL ) debug_printf_parse(" SEMICOL" ); \ +-if ((n) & TC_NEWLINE ) debug_printf_parse(" NEWLINE" ); \ +-if ((n) & TC_STATX ) debug_printf_parse(" STATX" ); \ +-if ((n) & TC_WHILE ) debug_printf_parse(" WHILE" ); \ +-if ((n) & TC_ELSE ) debug_printf_parse(" ELSE" ); \ +-if ((n) & TC_BUILTIN ) debug_printf_parse(" BUILTIN" ); \ +-if ((n) & TC_LENGTH ) debug_printf_parse(" LENGTH" ); \ +-if ((n) & TC_GETLINE ) debug_printf_parse(" GETLINE" ); \ +-if ((n) & TC_FUNCDECL) debug_printf_parse(" FUNCDECL"); \ +-if ((n) & TC_BEGIN ) debug_printf_parse(" BEGIN" ); \ +-if ((n) & TC_END ) debug_printf_parse(" END" ); 
\ +-if ((n) & TC_EOF ) debug_printf_parse(" EOF" ); \ +-if ((n) & TC_VARIABLE) debug_printf_parse(" VARIABLE"); \ +-if ((n) & TC_ARRAY ) debug_printf_parse(" ARRAY" ); \ +-if ((n) & TC_FUNCTION) debug_printf_parse(" FUNCTION"); \ +-if ((n) & TC_STRING ) debug_printf_parse(" STRING" ); \ +-if ((n) & TC_NUMBER ) debug_printf_parse(" NUMBER" ); \ +-} while (0) ++static void debug_parse_print_tc(uint32_t n) ++{ ++ if (n & TC_LPAREN ) debug_printf_parse(" LPAREN" ); ++ if (n & TC_RPAREN ) debug_printf_parse(" RPAREN" ); ++ if (n & TC_REGEXP ) debug_printf_parse(" REGEXP" ); ++ if (n & TC_OUTRDR ) debug_printf_parse(" OUTRDR" ); ++ if (n & TC_UOPPOST ) debug_printf_parse(" UOPPOST" ); ++ if (n & TC_UOPPRE1 ) debug_printf_parse(" UOPPRE1" ); ++ if (n & TC_BINOPX ) debug_printf_parse(" BINOPX" ); ++ if (n & TC_IN ) debug_printf_parse(" IN" ); ++ if (n & TC_COMMA ) debug_printf_parse(" COMMA" ); ++ if (n & TC_PIPE ) debug_printf_parse(" PIPE" ); ++ if (n & TC_UOPPRE2 ) debug_printf_parse(" UOPPRE2" ); ++ if (n & TC_ARRTERM ) debug_printf_parse(" ARRTERM" ); ++ if (n & TC_LBRACE ) debug_printf_parse(" LBRACE" ); ++ if (n & TC_RBRACE ) debug_printf_parse(" RBRACE" ); ++ if (n & TC_SEMICOL ) debug_printf_parse(" SEMICOL" ); ++ if (n & TC_NEWLINE ) debug_printf_parse(" NEWLINE" ); ++ if (n & TC_STATX ) debug_printf_parse(" STATX" ); ++ if (n & TC_WHILE ) debug_printf_parse(" WHILE" ); ++ if (n & TC_ELSE ) debug_printf_parse(" ELSE" ); ++ if (n & TC_BUILTIN ) debug_printf_parse(" BUILTIN" ); ++ if (n & TC_LENGTH ) debug_printf_parse(" LENGTH" ); ++ if (n & TC_GETLINE ) debug_printf_parse(" GETLINE" ); ++ if (n & TC_FUNCDECL) debug_printf_parse(" FUNCDECL"); ++ if (n & TC_BEGIN ) debug_printf_parse(" BEGIN" ); ++ if (n & TC_END ) debug_printf_parse(" END" ); ++ if (n & TC_EOF ) debug_printf_parse(" EOF" ); ++ if (n & TC_VARIABLE) debug_printf_parse(" VARIABLE"); ++ if (n & TC_ARRAY ) debug_printf_parse(" ARRAY" ); ++ if (n & TC_FUNCTION) debug_printf_parse(" FUNCTION"); ++ if (n 
& TC_STRING ) debug_printf_parse(" STRING" ); ++ if (n & TC_NUMBER ) debug_printf_parse(" NUMBER" ); ++} + #endif + + /* combined token classes ("token [class] sets") */ +@@ -417,7 +418,7 @@ static const char tokenlist[] ALIGN1 = + "\5close" "\6system" "\6fflush" "\5atan2" + "\3cos" "\3exp" "\3int" "\3log" + "\4rand" "\3sin" "\4sqrt" "\5srand" +- "\6gensub" "\4gsub" "\5index" /* "\6length" was here */ ++ "\6gensub" "\4gsub" "\5index" /* "\6length" was here */ + "\5match" "\5split" "\7sprintf" "\3sub" + "\6substr" "\7systime" "\10strftime" "\6mktime" + "\7tolower" "\7toupper" NTC +@@ -1802,7 +1803,6 @@ static void parse_program(char *p) + } /* for (;;) */ + } + +- + /* -------- program execution part -------- */ + + /* temporary variables allocator */ +@@ -3510,7 +3510,6 @@ static var *evaluate(node *op, var *res) + #undef sreg + } + +- + /* -------- main & co. -------- */ + + static int awk_exit(void) +-- +2.27.0 + diff --git a/busybox.spec b/busybox.spec index 87f6ee9..d7b2ced 100644 --- a/busybox.spec +++ b/busybox.spec @@ -4,7 +4,7 @@ %endif %if "%{!?RELEASE:1}" -%define RELEASE 11 +%define RELEASE 12 %endif Name: busybox @@ -25,6 +25,7 @@ Patch6001: backport-CVE-2021-42377.patch Patch6002: backport-CVE-2021-42373.patch Patch6003: backport-CVE-2021-42375.patch Patch6004: backport-CVE-2021-42376.patch +Patch6005: backport-fix-awk-cve.patch BuildRoot: %_topdir/BUILDROOT #Dependency @@ -100,6 +101,12 @@ install -m 644 docs/busybox.dynamic.1 $RPM_BUILD_ROOT/%{_mandir}/man1/busybox.1 %{_mandir}/man1/busybox.petitboot.1.gz %changelog +* Thu Nov 25 2021 xiechengliang - 1:1.33.1-12 +- Type:CVE +- Id:NA +- SUG:NA +- DESC:fix CVE-2021-42378 CVE-2021-42379 CVE-2021-42380 CVE-2021-42381 CVE-2021-42382 CVE-2021-42383 CVE-2021-42384 CVE-2021-42385 and CVE-2021-42386 + * Wed Nov 24 2021 xiechengliang - 1:1.33.1-11 - Type:CVE - Id:NA -- Gitee