From 9af383909a1401cd5cb9bcc2ddbef264dd844ff7 Mon Sep 17 00:00:00 2001 From: Laurent Huberdeau Date: Tue, 13 Aug 2024 19:02:51 -0400 Subject: [PATCH 01/23] fix token pasting + logging around macro expansion --- pnut.c | 48 ++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 40 insertions(+), 8 deletions(-) diff --git a/pnut.c b/pnut.c index 11636326..3d90193f 100644 --- a/pnut.c +++ b/pnut.c @@ -1352,11 +1352,13 @@ int macro_parse_argument() { return arg_tokens; } -void check_macro_arity(int macro_args_count, int expected_argc) { +void check_macro_arity(int macro_args_count, int macro) { + int expected_argc = cdr(heap[macro + 3]); if (macro_args_count != expected_argc) { putstr("expected_argc="); putint(expected_argc); putstr(" != macro_args_count="); putint(macro_args_count); putchar('\n'); + putstr("macro="); putstr(string_pool + heap[macro + 1]); putchar('\n'); syntax_error("macro argument count mismatch"); } } @@ -1364,13 +1366,14 @@ void check_macro_arity(int macro_args_count, int expected_argc) { // Reads the arguments of a macro call, where the arguments are split by commas. // Note that args are accumulated in reverse order, as the macro arguments refer // to the tokens in reverse order. 
-int get_macro_args_toks(int expected_argc) { +int get_macro_args_toks(int macro) { int args = 0; int macro_args_count = 0; + bool prev_is_comma = false; get_tok_macro(); // Skip the macro identifier if (tok != '(') { // Function-like macro with 0 arguments - check_macro_arity(macro_args_count, expected_argc); + check_macro_arity(macro_args_count, macro); return -1; // No arguments } @@ -1380,15 +1383,28 @@ int get_macro_args_toks(int expected_argc) { // Allow sequence of commas, this is more lenient than the standard if (tok == ',') { get_tok_macro(); // Skip comma + if (prev_is_comma) { // Push empty arg + args = cons(0, args); + macro_args_count += 1; + } + prev_is_comma = true; continue; + } else { + prev_is_comma = false; } + args = cons(macro_parse_argument(), args); macro_args_count += 1; } expect_tok(')'); - check_macro_arity(macro_args_count, expected_argc); + if (prev_is_comma) { + args = cons(0, args); // Push empty arg + macro_args_count += 1; + } + + check_macro_arity(macro_args_count, macro); return args; } @@ -1435,7 +1451,7 @@ bool attempt_macro_expansion(int macro) { push_macro(car(heap[macro + 3]), 0); return true; } else { - new_macro_args = get_macro_args_toks(cdr(heap[macro + 3])); + new_macro_args = get_macro_args_toks(macro); // get_macro_args_toks fetched the next token, we save it so it's not lost push_macro(cons(cons(tok, val), 0), new_macro_args); if (new_macro_args == -1) { // There was no argument list, i.e. not a function-like macro call @@ -1484,20 +1500,36 @@ int paste_integers(int left_val, int right_val) { void paste_tokens(int left_tok, int left_val) { int right_tok; int right_val; + expand_macro_arg = false; get_tok_macro(); + expand_macro_arg = true; + // We need to handle the case where the right-hand side is a macro argument that expands to empty + // In that case, the left-hand side is returned as is. 
+ if (tok == MACRO_ARG) { + if (get_macro_arg(val) == 0) { + tok = left_tok; + val = left_val; + return; + } else { + push_macro(get_macro_arg(val), 0); // Play the tokens of the macro argument + get_tok_macro(); + } + } right_tok = tok; right_val = val; - if (left_tok == IDENTIFIER OR left_tok == MACRO) { + if (left_tok == IDENTIFIER || left_tok == MACRO || left_tok <= WHILE_KW) { // Something that starts with an identifier can only be an identifier begin_string(); accum_string_string(heap[left_val + 1]); - if (right_tok == IDENTIFIER OR right_tok == MACRO) { + if (right_tok == IDENTIFIER || right_tok == MACRO || right_tok <= WHILE_KW) { accum_string_string(heap[right_val + 1]); } else if (right_tok == INTEGER) { accum_string_integer(-right_val); } else { putstr("left_tok="); putint(left_tok); putstr(", right_tok="); putint(right_tok); putchar('\n'); + // show identifier/macro string + putstr("left="); putstr(string_pool + heap[left_val + 1]); putchar('\n'); syntax_error("cannot paste an identifier with a non-identifier or non-negative integer"); } @@ -1564,7 +1596,7 @@ void get_tok() { paste_tokens(tok, val); break; } - } else if (macro_tok_lst == 0 AND paste_last_token) { + } else if (macro_tok_lst == 0 AND paste_last_token) { // We finished expanding the left-hand side of ## if (macro_stack_ix == 0) { // If we are not in a macro expansion, we can't paste the last token // This should not happen if the macro is well-formed, which is From 06f6af1fb1888a4e89a4195ff7883ebbbb5d7953 Mon Sep 17 00:00:00 2001 From: Laurent Huberdeau Date: Wed, 14 Aug 2024 22:35:22 -0400 Subject: [PATCH 02/23] Simplify tokenizer --- pnut.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pnut.c b/pnut.c index 3d90193f..502df823 100644 --- a/pnut.c +++ b/pnut.c @@ -1636,10 +1636,6 @@ void get_tok() { } /* skip whitespace, detecting when it is at start of line */ - - if (ch == '\n') tok = ch; - get_ch(); - while (0 <= ch AND ch <= ' ') { if (ch == '\n') tok = ch; get_ch(); From 
a036b47f95bcb09b5c553300075d90317335e704 Mon Sep 17 00:00:00 2001 From: Laurent Huberdeau Date: Thu, 15 Aug 2024 13:25:37 -0400 Subject: [PATCH 03/23] Remove unused keywords --- debug.c | 2 -- pnut.c | 8 ++------ utils/keywords.txt | 8 -------- 3 files changed, 2 insertions(+), 16 deletions(-) diff --git a/debug.c b/debug.c index 58ea486e..3c00aa46 100644 --- a/debug.c +++ b/debug.c @@ -31,7 +31,6 @@ void print_tok(int tok, int val) { else if (tok == DOUBLE_KW) putstr("double"); else if (tok == ELSE_KW) putstr("else"); else if (tok == ENUM_KW) putstr("enum"); - else if (tok == ERROR_KW) putstr("error"); else if (tok == EXTERN_KW) putstr("extern"); else if (tok == FLOAT_KW) putstr("float"); else if (tok == FOR_KW) putstr("for"); @@ -127,7 +126,6 @@ void print_tok_type(int tok) { else if (tok == DOUBLE_KW) putstr("double"); else if (tok == ELSE_KW) putstr("else"); else if (tok == ENUM_KW) putstr("enum"); - else if (tok == ERROR_KW) putstr("error"); else if (tok == EXTERN_KW) putstr("extern"); else if (tok == FLOAT_KW) putstr("float"); else if (tok == FOR_KW) putstr("for"); diff --git a/pnut.c b/pnut.c index 502df823..b1b62216 100644 --- a/pnut.c +++ b/pnut.c @@ -103,14 +103,11 @@ enum { DOUBLE_KW, ELSE_KW, ENUM_KW, - ERROR_KW, EXTERN_KW, FLOAT_KW, FOR_KW, GOTO_KW, IF_KW, - IFNDEF_KW, - INCLUDE_KW, INT_KW, LONG_KW, REGISTER_KW, @@ -256,8 +253,8 @@ int string_start; int hash; /* These parameters give a perfect hashing of the C keywords */ -#define HASH_PARAM 2764 -#define HASH_PRIME 107 +#define HASH_PARAM 1997 +#define HASH_PRIME 53 #define HEAP_SIZE 200000 int heap[HEAP_SIZE]; int heap_alloc = HASH_PRIME; @@ -1183,7 +1180,6 @@ void init_ident_table() { init_ident(DOUBLE_KW, "double"); init_ident(ELSE_KW, "else"); init_ident(ENUM_KW, "enum"); - init_ident(ERROR_KW, "error"); init_ident(EXTERN_KW, "extern"); init_ident(FLOAT_KW, "float"); init_ident(FOR_KW, "for"); diff --git a/utils/keywords.txt b/utils/keywords.txt index a77b5b77..ecc1c9d2 100644 --- 
a/utils/keywords.txt +++ b/utils/keywords.txt @@ -5,22 +5,15 @@ char const continue default -define do double else -endif enum -error extern -FILE float for goto if -ifdef -ifndef -include int long register @@ -32,7 +25,6 @@ static struct switch typedef -undef union unsigned void From b496f4b31c2793c525329358d13dbf793e452b84 Mon Sep 17 00:00:00 2001 From: Laurent Huberdeau Date: Thu, 15 Aug 2024 13:29:00 -0400 Subject: [PATCH 04/23] Handle files not-ending with newline --- pnut.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pnut.c b/pnut.c index b1b62216..245085cd 100644 --- a/pnut.c +++ b/pnut.c @@ -512,7 +512,9 @@ void get_ch() { // Not freeing include_stack2->filepath because it may not be dynamically allocated free(include_stack2->dirname); free(include_stack2); - get_ch(); + // EOF is treated as a newline so that files without a newline at the end are still parsed correctly + // On the next get_ch call, the first character of the next file will be read + ch = '\n'; } } #ifdef INCLUDE_LINE_NUMBER_ON_ERROR From 46bbebc2bde2b21a0cb89b178f953da0a0659f49 Mon Sep 17 00:00:00 2001 From: Laurent Huberdeau Date: Thu, 15 Aug 2024 13:37:03 -0400 Subject: [PATCH 05/23] Read #directives until '\n' token Because the preprocessor treats whitespace as important, it used to parse macros by looking for certain characters. That was a problem when the preprocessor encountered whitespace and comments, and prevented it from recognizing the end of a macro. This commit fixes that issue by adding a flag (skip_newlines) indicating to the tokenizer if it should skip '\n' or not. When treating preprocessor directives, this flag is set to false so that the macro parser can stop at the end of the line. 
--- pnut.c | 320 +++++++++++++++++++++++++++++++++------------------------ sh.c | 2 +- 2 files changed, 185 insertions(+), 137 deletions(-) diff --git a/pnut.c b/pnut.c index 245085cd..c8754f5d 100644 --- a/pnut.c +++ b/pnut.c @@ -409,6 +409,8 @@ int if_macro_nest_level = 0; // Current number of unmatched #if/#ifdef/#if bool expand_macro = true; // Don't expand macro arguments. Used for stringification and token pasting. bool expand_macro_arg = true; +// Don't produce newline tokens. Used when reading the tokens of a macro definition. +bool skip_newlines = true; #define MACRO_RECURSION_MAX 100 int macro_stack[MACRO_RECURSION_MAX]; @@ -670,10 +672,32 @@ int WRITE_ID; int OPEN_ID; int CLOSE_ID; +// When we parse a macro, we generally want the tokens as they are, without expanding them. void get_tok_macro() { + bool prev_expand_macro = expand_macro; + bool prev_macro_mask = if_macro_mask; + bool skip_newlines_prev = skip_newlines; + + expand_macro = false; + if_macro_mask = true; + skip_newlines = false; + get_tok(); + expand_macro = prev_expand_macro; + if_macro_mask = prev_macro_mask; + skip_newlines = skip_newlines_prev; +} + +// Like get_tok_macro, but skips newline +// This is useful when we want to read the arguments of a macro expansion. +void get_tok_macro_expand() { + bool prev_expand_macro = expand_macro; + bool prev_macro_mask = if_macro_mask; + expand_macro = false; + if_macro_mask = true; get_tok(); - expand_macro = true; // TODO: Restore to previous value? 
+ expand_macro = prev_expand_macro; + if_macro_mask = prev_macro_mask; } int lookup_macro_token(int args, int tok, int val) { @@ -699,15 +723,15 @@ int read_macro_tokens(int args) { int tail; // Accumulate tokens so they can be replayed when the macro is used - if (ch != '\n' AND ch != EOF) { - get_tok_macro(); + if (tok != '\n' AND tok != EOF) { // Append the token/value pair to the replay list toks = cons(lookup_macro_token(args, tok, val), 0); tail = toks; - while (ch != '\n' AND ch != EOF) { - get_tok_macro(); + get_tok_macro(); + while (tok != '\n' AND tok != EOF) { heap[tail + 1] = cons(lookup_macro_token(args, tok, val), 0); tail = cdr(tail); // Advance tail + get_tok_macro(); } // Check that there are no leading or trailing ## @@ -741,7 +765,6 @@ void handle_define() { int args = 0; // List of arguments for a function-like macro int args_count = -1; // Number of arguments for a function-like macro. -1 means it's an object-like macro - get_tok_macro(); if (tok == IDENTIFIER OR tok == MACRO) { heap[val + 2] = MACRO; // Mark the identifier as a macro macro = val; @@ -751,14 +774,15 @@ void handle_define() { } if (ch == '(') { // Function-like macro args_count = 0; - get_ch(); - while (ch != '\n' AND ch != EOF) { - if (ch == ',') { + get_tok_macro(); // Skip macro name + get_tok_macro(); // Skip '(' + while (tok != '\n' AND tok != EOF) { + if (tok == ',') { // Allow sequence of commas, this is more lenient than the standard - get_ch(); + get_tok_macro(); continue; - } else if (ch == ')') { - get_ch(); + } else if (tok == ')') { + get_tok_macro(); break; } get_tok_macro(); @@ -767,36 +791,29 @@ void handle_define() { args = cons(val, args); args_count += 1; } + } else { + get_tok_macro(); // Skip macro name } - // Skip whitespace between the parameters and macro body - while (ch != '\n' AND ch != EOF AND ch <= ' ') { - get_ch(); - } - - if (ch == '\n' OR ch == EOF) { - heap[macro + 3] = cons(0, args_count); // No tokens to replay - } else { - // Accumulate 
tokens so they can be replayed when the macro is used - heap[macro + 3] = cons(read_macro_tokens(args), args_count); + // Accumulate tokens so they can be replayed when the macro is used + heap[macro + 3] = cons(read_macro_tokens(args), args_count); #ifdef DEBUG_CPP - putstr("# "); - putstr(string_pool + heap[macro + 1]); - if (args_count != -1) putchar('('); // Function-like macro - - while (args_count > 0) { - putstr(string_pool + heap[car(args) + 1]); - args = cdr(args); - args_count -= 1; - if (args_count > 0) putstr(", "); - } + putstr("# "); + putstr(string_pool + heap[macro + 1]); + if (args_count != -1) putchar('('); // Function-like macro - if (args_count != -1) putstr(") "); - print_macro_raw_tokens(car(heap[macro + 3])); - putchar('\n'); -#endif + while (args_count > 0) { + putstr(string_pool + heap[car(args) + 1]); + args = cdr(args); + args_count -= 1; + if (args_count > 0) putstr(", "); } + + if (args_count != -1) putstr(") "); + print_macro_raw_tokens(car(heap[macro + 3])); + putchar('\n'); +#endif } // For evaluating #if condition, we use the shunting yard algorithm @@ -824,8 +841,8 @@ int precedence(int op) { else if (op == AMP_AMP) return 9; else if (op == BAR_BAR) return 12; else { - printf("op=%d\n", op); - fatal_error("precedence: unknown operator"); + putstr("op="); putint(op); putchar('\n'); + syntax_error("#if: unknown operator"); return -1; } } @@ -890,7 +907,7 @@ void pop_op() { } else if (op == BAR_BAR) { val_stack[val_stack_ix - 2] = val_stack[val_stack_ix - 2] || val_stack[val_stack_ix - 1]; } else { - printf("op=%d\n", op); + putstr("op="); putint(op); putchar('\n'); fatal_error("pop_op: unknown operator"); } val_stack_ix -= 1; @@ -923,50 +940,67 @@ void push_val(int val) { val_stack_ix += 1; } -int evaluate_if_condition() { - int previous_mask = if_macro_mask; - if_macro_mask = true; // Temporarily set to true so that we can read the condition even if it's inside an ifdef false block - while (ch != '\n' AND ch != EOF) { - get_tok(); - 
if (tok == '(') { +void handle_if_op() { + switch (tok) { + case '(': push_op(tok); - } else if (tok == ')') { - while (op_stack_ix != 0 AND op_stack[op_stack_ix - 1] != '(') { - pop_op(); - } - if (op_stack_ix == 0) { - fatal_error("unmatched parenthesis in #if condition"); - } + break; + case ')': + while (op_stack_ix != 0 AND op_stack[op_stack_ix - 1] != '(') pop_op(); + if (op_stack_ix == 0) fatal_error("unmatched parenthesis in #if condition"); op_stack_ix -= 1; // Pop the '(' - } else if (tok == IDENTIFIER AND val == DEFINED_ID) { - get_tok_macro(); // Skip the defined keyword - if (tok == '(') { - get_tok_macro(); // Skip the '(' - push_val(tok == MACRO); - get_tok_macro(); // Skip the macro name - if (tok != ')') { - // Not using expect_tok because it may be the end of the line - printf("tok=%d\n", tok); - fatal_error("expected ')' in #if defined condition"); + break; + case IDENTIFIER: + if (val == DEFINED_ID) { + get_tok_macro(); // Skip the defined keyword + if (tok == '(') { + get_tok_macro(); // Skip the '(' + push_val(tok == MACRO); + get_tok_macro(); // Skip the macro name + if (tok != ')') { + // Not using expect_tok because it may be the end of the line + putstr("tok="); putint(tok); putchar('\n'); + fatal_error("expected ')' in #if defined condition"); + } + } else if (tok == IDENTIFIER OR tok == MACRO) { + // #if defined MACRO is valid syntax + push_val(tok == MACRO); + } else { + putstr("tok="); putint(tok); putchar('\n'); + fatal_error("expected identifier or macro in #if defined condition"); } - } else if (tok == IDENTIFIER OR tok == MACRO) { - // #if defined MACRO is valid syntax - push_val(tok == MACRO); } else { - printf("tok=%d\n", tok); - fatal_error("expected identifier or macro in #if defined condition"); + push_val(0); // Undefined identifiers are 0 } - } else if (tok == INTEGER) { + break; + case INTEGER: push_val(-val); - } else if (tok == CHARACTER) { + break; + case CHARACTER: push_val(val); - } else if (tok == IDENTIFIER) { - 
push_val(0); // Undefined macros are 0 - } else { + break; + default: push_op(tok); // Invalid operators are caught by push_op - } + break; + } +} + +int evaluate_if_condition() { + bool prev_skip_newlines = skip_newlines; + int previous_mask = if_macro_mask; + // Temporarily set to true so that we can read the condition even if it's inside an ifdef false block + // Unlike in other directives using get_tok_macro, we want to expand macros in the condition + if_macro_mask = true; + skip_newlines = false; // We want to stop when we reach the first newline + get_tok(); + while (tok != '\n' AND tok != EOF) { + handle_if_op(); + get_tok(); } - if_macro_mask = previous_mask; // Restore the mask to its previous value + + // Restore the previous value + if_macro_mask = previous_mask; + skip_newlines = prev_skip_newlines; // Pop remaining operators while (op_stack_ix != 0) { @@ -981,17 +1015,17 @@ int evaluate_if_condition() { } void handle_include() { - get_tok(); #ifdef SUPPORT_INCLUDE if (tok == STRING) { include_file(string_pool + val, true); + get_tok_macro(); // Skip the string } else if (tok == '<') { - get_tok(); // Ignore the file name for now. // Note that the token is not a string with the file name, but an identifier // with part of the file. This means we'll need to assemble the filename // string, or change get_tok to consider '<' and '>' as string delimiters. 
- while (tok != '>') get_tok(); + while (tok != '>') get_tok_macro(); + get_tok_macro(); // Skip the '>' } else { putstr("tok="); putint(tok); putchar('\n'); syntax_error("expected string to #include directive"); @@ -1007,112 +1041,112 @@ void handle_shell_include(); #endif void handle_preprocessor_directive() { - bool prev_if_mask = if_macro_mask; - int if_res; + int temp; #ifdef SH_INCLUDE_C_CODE int prev_char_buf_ix = declaration_char_buf_ix; #endif - get_ch(); // Skip the # - if_macro_mask = true; // Temporarily set to true so that we can read the directive even if it's inside an ifdef false block - get_tok(); // Get the directive - if_macro_mask = prev_if_mask; + get_tok_macro(); // Get the # token + get_tok_macro(); // Get the directive - if (tok == IDENTIFIER AND val == IFDEF_ID) { - if_macro_mask = true; get_tok_macro(); if_macro_mask = prev_if_mask; - if (if_macro_mask) { - push_if_macro_mask(tok == MACRO); - } else { - // Keep track of the number of #ifdef so we can skip the corresponding #endif - if_macro_nest_level += 1; - } - } else if (tok == IDENTIFIER AND val == IFNDEF_ID) { - if_macro_mask = true; get_tok_macro(); if_macro_mask = prev_if_mask; + if (tok == IDENTIFIER AND (val == IFDEF_ID || val == IFNDEF_ID)) { + temp = val; + get_tok_macro(); // Get the macro name if (if_macro_mask) { - push_if_macro_mask(tok != MACRO); + push_if_macro_mask(temp == IFDEF_ID ? 
tok == MACRO : tok != MACRO); } else { // Keep track of the number of #ifdef so we can skip the corresponding #endif if_macro_nest_level += 1; } + get_tok_macro(); // Skip the macro name } else if (tok == IF_KW) { - if_res = evaluate_if_condition(); + temp = evaluate_if_condition(); if (if_macro_mask) { - push_if_macro_mask(if_res); + push_if_macro_mask(temp); } else { // Keep track of the number of #ifdef so we can skip the corresponding #endif if_macro_nest_level += 1; } } else if (tok == IDENTIFIER AND val == ELIF_ID) { - if_res = evaluate_if_condition(); + temp = evaluate_if_condition(); if (if_macro_executed) { // The condition is true, but its ignored if one of the conditions before was also true if_macro_mask = false; } else { - if_macro_executed |= if_res; - if_macro_mask = if_res; + if_macro_executed |= temp; + if_macro_mask = temp; } } else if (tok == ELSE_KW) { if (if_macro_mask OR if_macro_nest_level == 0) { if_macro_mask = !if_macro_executed; } + get_tok_macro(); // Skip the else keyword } else if (tok == IDENTIFIER AND val == ENDIF_ID) { if (if_macro_mask OR if_macro_nest_level == 0) { pop_if_macro_mask(); } else { if_macro_nest_level -= 1; } + get_tok_macro(); // Skip the else keyword } else if (if_macro_mask) { if (tok == IDENTIFIER AND val == INCLUDE_ID) { + get_tok_macro(); // Get the STRING token handle_include(); } #ifdef sh + // Not standard C, but serves to mix existing shell code with compiled C code else if (tok == IDENTIFIER AND val == INCLUDE_SHELL_ID) { - // Not standard C, but serves to mix existing shell code with compiled C code + get_tok_macro(); // Get the STRING token handle_shell_include(); } #endif else if (tok == IDENTIFIER AND val == UNDEF_ID) { - get_tok_macro(); - if (tok == MACRO) { - heap[val + 2] = IDENTIFIER; // Unmark the macro identifier + get_tok_macro(); // Get the macro name + if (tok == IDENTIFIER || tok == MACRO) { // TODO: Doesn't play nice with typedefs, because they are not marked as macros + heap[val + 2] = 
IDENTIFIER; // Unmark the macro identifier + get_tok_macro(); // Skip the macro name } else { putstr("tok="); putint(tok); putchar('\n'); syntax_error("#undef directive can only be followed by a identifier"); } } else if (tok == IDENTIFIER AND val == DEFINE_ID) { + get_tok_macro(); // Get the macro name handle_define(); - } else if (tok == IDENTIFIER && val == WARNING_ID) { - get_tok_macro(); - putstr("warning: "); - if (tok == STRING) { - putstr(string_pool + val); - } else { - syntax_error("#warning/#error directives can only be followed by a string"); - } - } else if (tok == IDENTIFIER AND val == ERROR_ID) { - get_tok_macro(); - putstr("error: "); - if (tok == STRING) { - syntax_error(string_pool + val); - } else { - syntax_error("#warning/#error directives can only be followed by a string"); + } else if (tok == IDENTIFIER && (val == WARNING_ID || val == ERROR_ID)) { + temp = val; + putstr(temp == WARNING_ID ? "warning:" : "error:"); + // Print the rest of the line, it does not support \ at the end of the line but that's ok + while (ch != '\n' AND ch != EOF) { + putchar(ch); get_ch(); } + putchar('\n'); + tok = '\n'; + if (temp == ERROR_ID) exit(1); } else { putstr("tok="); putint(tok); putstr(": "); putstr(string_pool + heap[val + 1]); putchar('\n'); syntax_error("unsupported preprocessor directive"); } } else { - // Skip the directive - while (ch != '\n' AND ch != EOF) { - get_ch(); - } + // Skip the rest of the directive + while (tok != '\n' AND tok != EOF) get_tok_macro(); } - // Because handle_preprocessor_directive is called from get_tok, and it loops after - // the call to handle_preprocessor_directive, we don't need to call get_tok here - if (ch != '\n' AND ch != EOF) { - putstr("ch="); putint(ch); putchar('\n'); + + if (tok != '\n' AND tok != EOF) { + putstr("tok="); putint(tok); putchar('\n'); + putstr("directive="); putint(tok); putchar('\n'); + putstr("string="); putstr(string_pool + heap[val + 1]); putchar('\n'); + if (tok == IDENTIFIER OR tok == 
MACRO) { + putstr("string = "); + putstr(string_pool + heap[1 + val]); + putchar('\n'); + } syntax_error("preprocessor expected end of line"); } + + // Because handle_preprocessor_directive is called from get_tok, and it loops + // after the call to handle_preprocessor_directive, we don't need to call + // get_tok before returning. + #ifdef SH_INCLUDE_C_CODE declaration_char_buf_ix = prev_char_buf_ix - 1; // - 1 to undo the # #endif @@ -1344,7 +1378,7 @@ int macro_parse_argument() { heap[tail + 1] = cons(cons(tok, val), 0); tail = cdr(tail); } - get_tok_macro(); + get_tok_macro_expand(); } return arg_tokens; @@ -1368,19 +1402,18 @@ int get_macro_args_toks(int macro) { int args = 0; int macro_args_count = 0; bool prev_is_comma = false; - get_tok_macro(); // Skip the macro identifier + get_tok_macro_expand(); // Skip the macro identifier if (tok != '(') { // Function-like macro with 0 arguments check_macro_arity(macro_args_count, macro); return -1; // No arguments } - get_tok_macro(); // Skip '(' + get_tok_macro_expand(); // Skip '(' while (tok != ')' AND tok != EOF) { - // Allow sequence of commas, this is more lenient than the standard if (tok == ',') { - get_tok_macro(); // Skip comma + get_tok_macro_expand(); // Skip comma if (prev_is_comma) { // Push empty arg args = cons(0, args); macro_args_count += 1; @@ -1633,19 +1666,34 @@ void get_tok() { break; } - /* skip whitespace, detecting when it is at start of line */ + /* + skip whitespace, detecting when it is at start of line. + When skip_newlines is false, produces a '\n' token whenever it + encounters whitespace containing at least a newline. + This condenses multiple newlines into a single '\n' token and serves + to end the current preprocessor directive. 
+ */ + + tok = 0; // Reset the token while (0 <= ch AND ch <= ' ') { if (ch == '\n') tok = ch; get_ch(); } - /* detect '#' at start of line, possibly preceded by whitespace */ - - if ((tok == '\n') AND (ch == '#')) - handle_preprocessor_directive(); + if (tok == '\n' && !skip_newlines) { + // If the newline is followed by a #, the preprocessor directive is + // handled in the next iteration of the loop. + break; + } /* will continue while (1) loop */ + } + /* detect '#' at start of line, possibly preceded by whitespace */ + else if (tok == '\n' && ch == '#') { + tok = 0; // Consume the newline so handle_preprocessor_directive's get_tok doesn't re-enter this case + handle_preprocessor_directive(); + /* will continue while (1) loop */ } else if (('a' <= ch AND ch <= 'z') OR diff --git a/sh.c b/sh.c index 2265568e..da99dc57 100644 --- a/sh.c +++ b/sh.c @@ -5,7 +5,6 @@ void handle_shell_include() { FILE* shell_include_fp; int c; - get_tok(); if (tok == STRING) { // Include the shell code from the file shell_include_fp = fopen(string_pool + val, "r"); @@ -18,6 +17,7 @@ void handle_shell_include() { } putchar('\n'); fclose(shell_include_fp); + get_tok_macro(); // Skip the string } else { putstr("tok="); putint(tok); putchar('\n'); syntax_error("expected string to #include_shell directive"); From 75c7d483964b889a56ae1444b84c2792ce76bfbb Mon Sep 17 00:00:00 2001 From: Laurent Huberdeau Date: Sun, 18 Aug 2024 14:14:29 -0400 Subject: [PATCH 06/23] Add 5 timeout when compiling and running tests --- .github/workflows/main.yml | 6 ++++++ run-tests.sh | 17 +++++++++++++---- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index f3a990f7..0940d287 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -21,6 +21,12 @@ jobs: - name: Checkout code uses: actions/checkout@v2 + - name: Install utils + run: | + if [ ${{ matrix.host }} = "macos-latest" ]; then + brew install coreutils + fi + - 
name: Run ${{ matrix.target }} tests on ${{ matrix.host }} run: | set -e diff --git a/run-tests.sh b/run-tests.sh index e1cb0f9d..9136bdff 100755 --- a/run-tests.sh +++ b/run-tests.sh @@ -7,7 +7,9 @@ # The --match flag is used to run tests that match the given pattern, useful for re-running failed tests # The --bootstrap flag compiles the tests using pnut compiled with pnut, useful for catching bootstrap errors -fail() { echo "$1"; exit "$2"; } +trap "exit 1" INT + +fail() { echo "$1"; exit $2; } if [ $# -lt 1 ]; then fail "Usage: $0 -m pattern --bootstrap" 1 @@ -71,12 +73,19 @@ test_args() { execute_test() { # executable: $1, args: $2 ... if [ "$backend" = "sh" ]; then - bash "./$1" $2 # Default to bash for sh backend + # Default to bash for sh backend + # Use a 5s timeout to prevent infinite loops + timeout 5 bash "./$1" $2 else "./$1" $2 fi } +compile_test() { # c_file: $1 + # 5s timeout to prevent infinite loops in pnut + timeout 5 "$pnut_comp" "$1" +} + run_test() { # file_to_test: $1 file="$1" filename=$(basename "$file" .c) # Get the filename without extension @@ -89,7 +98,7 @@ run_test() { # file_to_test: $1 # Generate golden file if it doesn't exist if [ ! -f "$golden_file" ]; then - "$pnut_comp" "$file" > "$dir/$filename.$ext" 2> "$dir/$filename.err" + compile_test "$file" > "$dir/$filename.$ext" 2> "$dir/$filename.err" if [ $? -eq 0 ]; then chmod +x "$dir/$filename.$ext" execute_test "$dir/$filename.$ext" "$(test_args $file)" > "$golden_file" @@ -101,7 +110,7 @@ run_test() { # file_to_test: $1 fi # Compile the test file with pnut.exe - "$pnut_comp" "$file" > "$dir/$filename.$ext" 2> "$dir/$filename.err" + compile_test "$file" > "$dir/$filename.$ext" 2> "$dir/$filename.err" if [ $? 
-eq 0 ]; then # If compilation was successful chmod +x "$dir/$filename.$ext" From dc51f2311da0e19e167f669e781ab2247c5d7704 Mon Sep 17 00:00:00 2001 From: Laurent Huberdeau Date: Sun, 18 Aug 2024 14:20:46 -0400 Subject: [PATCH 07/23] Add missing ARROW case in print_tok --- debug.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/debug.c b/debug.c index 3c00aa46..35c9cf20 100644 --- a/debug.c +++ b/debug.c @@ -55,6 +55,7 @@ void print_tok(int tok, int val) { else if (tok == AMP_AMP) putstr("&&"); else if (tok == AMP_EQ) putstr("&="); + else if (tok == ARROW) putstr("->"); else if (tok == BAR_BAR) putstr("||"); else if (tok == BAR_EQ) putstr("|="); else if (tok == CARET_EQ) putstr("^="); @@ -150,6 +151,7 @@ void print_tok_type(int tok) { else if (tok == AMP_AMP) putstr("&&"); else if (tok == AMP_EQ) putstr("&="); + else if (tok == ARROW) putstr("->"); else if (tok == BAR_BAR) putstr("||"); else if (tok == BAR_EQ) putstr("|="); else if (tok == CARET_EQ) putstr("^="); From 4ee81e93c8a7b5e65f60a2f8adf03400cbb989aa Mon Sep 17 00:00:00 2001 From: Laurent Huberdeau Date: Sun, 18 Aug 2024 14:21:39 -0400 Subject: [PATCH 08/23] Support token pasting between int and identifier This creates an invalid identifier, but the result may be pasted with another identifier (to the left) resulting in a valid identifier. 
--- pnut.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pnut.c b/pnut.c index c8754f5d..7923791d 100644 --- a/pnut.c +++ b/pnut.c @@ -1569,6 +1569,13 @@ void paste_tokens(int left_tok, int left_val) { } else if (left_tok == INTEGER) { if (right_tok == INTEGER) { val = -paste_integers(-left_val, -right_val); + } else if (right_tok == IDENTIFIER || right_tok == MACRO || right_tok <= WHILE_KW) { + begin_string(); + accum_string_integer(-left_val); + accum_string_string(heap[right_val + 1]); + + val = end_ident(); + tok = heap[val+2]; // The kind of the identifier } else { putstr("left_tok="); putint(left_tok); putstr(", right_tok="); putint(right_tok); putchar('\n'); syntax_error("cannot paste an integer with a non-integer"); From 7ec88a8f2b4b07e3eb08ea884a771258002cad45 Mon Sep 17 00:00:00 2001 From: Laurent Huberdeau Date: Sun, 18 Aug 2024 14:25:12 -0400 Subject: [PATCH 09/23] Save macro def before processing expansion args Otherwise, while reading the arguments, the macro may be redefined and the expansion would use the new definition (only valid after the #define) instead of the previous one. An example showing the bug: #define FOO 1 int foo_val = FOO #define FOO 3 // Overwrites FOO ; Before, foo_val was assigned the value 3 and now 1 as expected. --- pnut.c | 6 ++++-- tests/_all/preprocessor/macro/object-like.c | 7 +++++++ tests/_all/preprocessor/macro/object-like.golden | 1 + 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/pnut.c b/pnut.c index 7923791d..53a3fe7d 100644 --- a/pnut.c +++ b/pnut.c @@ -1477,9 +1477,11 @@ void push_tokens(int tokens) { // Returns 1 if the macro was expanded, 0 otherwise. 
bool attempt_macro_expansion(int macro) { int new_macro_args; + // We must save the tokens because the macro may be redefined while reading the arguments + int tokens = car(heap[macro + 3]); macro = val; if (cdr(heap[macro + 3]) == -1) { // Object-like macro - push_macro(car(heap[macro + 3]), 0); + push_macro(tokens, 0); return true; } else { new_macro_args = get_macro_args_toks(macro); @@ -1491,7 +1493,7 @@ bool attempt_macro_expansion(int macro) { val = macro; return false; } else { - push_macro(car(heap[macro + 3]), new_macro_args); + push_macro(tokens, new_macro_args); return true; } } diff --git a/tests/_all/preprocessor/macro/object-like.c b/tests/_all/preprocessor/macro/object-like.c index 7570d902..20f1cf77 100644 --- a/tests/_all/preprocessor/macro/object-like.c +++ b/tests/_all/preprocessor/macro/object-like.c @@ -36,6 +36,8 @@ void putdigit(int n) { } void main() { + int foo_val; + putdigit(EMPTY + 8); // Will expand to + 8 putdigit(FOO); putdigit(GARPLY); @@ -52,4 +54,9 @@ void main() { putdigit(PARENS_EXPR); putdigit(PARENS_PARENS_EXPR); putdigit(PARENS_PARENS_EXPR2); + foo_val = FOO + #define FOO 3 // This will not change the value of foo_val + ; + + putdigit(foo_val); } diff --git a/tests/_all/preprocessor/macro/object-like.golden b/tests/_all/preprocessor/macro/object-like.golden index 09063d83..b641909f 100644 --- a/tests/_all/preprocessor/macro/object-like.golden +++ b/tests/_all/preprocessor/macro/object-like.golden @@ -10,3 +10,4 @@ 2 2 2 +2 From 801be2260a857d25a1a3c06417545b9e56d2b4ab Mon Sep 17 00:00:00 2001 From: Laurent Huberdeau Date: Sun, 18 Aug 2024 14:29:14 -0400 Subject: [PATCH 10/23] Allow C keywords to be defined This is useful to allow unused types to be redefined to something supported by pnut. 
--- pnut.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pnut.c b/pnut.c index 53a3fe7d..0086a600 100644 --- a/pnut.c +++ b/pnut.c @@ -765,7 +765,7 @@ void handle_define() { int args = 0; // List of arguments for a function-like macro int args_count = -1; // Number of arguments for a function-like macro. -1 means it's an object-like macro - if (tok == IDENTIFIER OR tok == MACRO) { + if (tok == IDENTIFIER OR tok == MACRO OR (0 <= AUTO_KW AND tok <= WHILE_KW)) { heap[val + 2] = MACRO; // Mark the identifier as a macro macro = val; } else { From 9daa299b0978e8aaf48c016042e1d2f69a57a356 Mon Sep 17 00:00:00 2001 From: Laurent Huberdeau Date: Sun, 18 Aug 2024 16:33:36 -0400 Subject: [PATCH 11/23] A few more tests --- tests/_all/preprocessor/if/if.c | 32 +++++++++++-------- tests/_all/preprocessor/if/if.golden | 29 +++++++++-------- .../preprocessor/include/include-no-newline.c | 27 ++++++++++++++++ .../include/include-no-newline.golden | 1 + .../preprocessor/include/include-no-newline.h | 5 +++ tests/_all/preprocessor/macro/fun-like.c | 18 +++++++++++ tests/_all/preprocessor/macro/fun-like.golden | 5 +++ tests/_all/preprocessor/macro/object-like.c | 4 ++- 8 files changed, 93 insertions(+), 28 deletions(-) create mode 100644 tests/_all/preprocessor/include/include-no-newline.c create mode 100644 tests/_all/preprocessor/include/include-no-newline.golden create mode 100644 tests/_all/preprocessor/include/include-no-newline.h diff --git a/tests/_all/preprocessor/if/if.c b/tests/_all/preprocessor/if/if.c index 33ad82a4..e67fab7a 100644 --- a/tests/_all/preprocessor/if/if.c +++ b/tests/_all/preprocessor/if/if.c @@ -16,14 +16,14 @@ void main() { #if 0 putdigit(0); #else - putdigit(1); + putdigit(2); #endif // else doesn't execute if any block before did #if 0 putdigit(0); #elif 1 - putdigit(1); + putdigit(3); #else putdigit(0); #endif @@ -32,7 +32,7 @@ void main() { #if 0 putdigit(0); #elif 1 - putdigit(1); + putdigit(4); #elif 1 putdigit(0); #endif @@ -40,7 
+40,7 @@ void main() { // defined operator works #define FOO #if defined(FOO) - putdigit(1); + putdigit(5); #else putdigit(0); #endif @@ -49,7 +49,7 @@ void main() { #if defined(FOO) putdigit(0); #else - putdigit(1); + putdigit(6); #endif // if and ifdef can be used together @@ -58,12 +58,12 @@ void main() { #elif defined(FOO) putdigit(0); #else - putdigit(1); + putdigit(7); #endif // Test operator precedence #if 1 + 2 * 3 == 7 - putdigit(1); + putdigit(8); #else putdigit(0); #endif @@ -71,7 +71,7 @@ void main() { #if 1 + 2 * 3 != 7 putdigit(0); #else - putdigit(1); + putdigit(9); #endif #if 1 + 2 * 3 < 7 @@ -82,34 +82,40 @@ void main() { #define BUFSIZE 10000 #if defined BUFSIZE && BUFSIZE >= 1024 - putdigit(1); + putdigit(2); #else putdigit(0); #endif #if 'A' == 65 - putdigit(1); + putdigit(3); #else putdigit(0); #endif #if NOT_DEF == 0 - putdigit(1); + putdigit(4); #else putdigit(0); #endif +#define ARCH_i386 24 +#if ARCH_i386 + putdigit(5); +#else + putdigit(0); +#endif #define TCC_ARM_EABI 1 #if defined(TCC_ARM_EABI) && !defined(CONFIG_TCC_ELFINTERP) - putdigit(1); + putdigit(6); #else putdigit(0); #endif #define __FreeBSD__ #if !defined(TCC_TARGET_PE) && (defined(__FreeBSD__) || defined(__FreeBSD_kernel__)) - putdigit(1); + putdigit(7); #else putdigit(0); #endif diff --git a/tests/_all/preprocessor/if/if.golden b/tests/_all/preprocessor/if/if.golden index 1d9e4767..1a82b2e9 100644 --- a/tests/_all/preprocessor/if/if.golden +++ b/tests/_all/preprocessor/if/if.golden @@ -1,15 +1,16 @@ 1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 +2 +3 +4 +5 +6 +7 +8 +9 +1 +2 +3 +4 +5 +6 +7 diff --git a/tests/_all/preprocessor/include/include-no-newline.c b/tests/_all/preprocessor/include/include-no-newline.c new file mode 100644 index 00000000..800d8b5b --- /dev/null +++ b/tests/_all/preprocessor/include/include-no-newline.c @@ -0,0 +1,27 @@ +// tests for #include "" directives + +#include // putchar + +#include "include-no-newline.h" +#define INCLUDE +#include 
"include-no-newline.h" + +void putint_aux(int n) { + if (n <= -10) putint_aux(n / 10); + putchar('0' - (n % 10)); +} + +void putint(int n) { + if (n < 0) { + putchar('-'); + putint_aux(n); + } else { + putint_aux(-n); + } + + putchar('\n'); +} + +void main() { + putint(CONSTANT); +} diff --git a/tests/_all/preprocessor/include/include-no-newline.golden b/tests/_all/preprocessor/include/include-no-newline.golden new file mode 100644 index 00000000..573541ac --- /dev/null +++ b/tests/_all/preprocessor/include/include-no-newline.golden @@ -0,0 +1 @@ +0 diff --git a/tests/_all/preprocessor/include/include-no-newline.h b/tests/_all/preprocessor/include/include-no-newline.h new file mode 100644 index 00000000..66d858c1 --- /dev/null +++ b/tests/_all/preprocessor/include/include-no-newline.h @@ -0,0 +1,5 @@ +// tests for #include "" directives without trailing newline + +#ifdef INCLUDE +int CONSTANT = 0; +#endif \ No newline at end of file diff --git a/tests/_all/preprocessor/macro/fun-like.c b/tests/_all/preprocessor/macro/fun-like.c index 3a40ebd2..d349a234 100644 --- a/tests/_all/preprocessor/macro/fun-like.c +++ b/tests/_all/preprocessor/macro/fun-like.c @@ -27,6 +27,19 @@ int THUNK = 8; #define CONTROL 7 #define ARGS ADDR,3,CONTROL +#define DEF(id, str, val) ,id + +enum { + ABC = 42 + DEF(def, "def",) + DEF(ghi, "ghi",) + DEF(jkl, "jlk",) +// Tests that the expansion of the preceding DEF uses the old DEF macro +#define DEF(id, str, val) ,id = val + DEF(mno, "mno", 1) + DEF(pqr, "pqr", 2) +}; + #define MULTI_LINE_MACRO(X, Y) \ FST(X, Y) + \ SND(X, Y) @@ -53,4 +66,9 @@ void main() { putdigit(THUNK); // THUNK is also a variable containing 42 putdigit(SETBIT(SETBIT2, ARGS)); putdigit(MULTI_LINE_MACRO(1, 2)); + putdigit(def % 10); + putdigit(ghi % 10); + putdigit(jkl % 10); + putdigit(mno % 10); + putdigit(pqr % 10); } diff --git a/tests/_all/preprocessor/macro/fun-like.golden b/tests/_all/preprocessor/macro/fun-like.golden index 3c4d2969..1eacf06e 100644 --- 
a/tests/_all/preprocessor/macro/fun-like.golden +++ b/tests/_all/preprocessor/macro/fun-like.golden @@ -9,3 +9,8 @@ 8 5 3 +3 +4 +5 +1 +2 diff --git a/tests/_all/preprocessor/macro/object-like.c b/tests/_all/preprocessor/macro/object-like.c index 20f1cf77..9cb17e8b 100644 --- a/tests/_all/preprocessor/macro/object-like.c +++ b/tests/_all/preprocessor/macro/object-like.c @@ -30,13 +30,15 @@ #define PARENS_PARENS_EXPR (PARENS_EXPR) #define PARENS_EXPR (1 + 1) +#define float int // We can redefine keywords + void putdigit(int n) { putchar('0' + n); putchar('\n'); } void main() { - int foo_val; + float foo_val; // not a float, but an int putdigit(EMPTY + 8); // Will expand to + 8 putdigit(FOO); From e246f6b3278a174cd6fd3cc617663aed4945d908 Mon Sep 17 00:00:00 2001 From: Laurent Huberdeau Date: Sun, 18 Aug 2024 17:25:37 -0400 Subject: [PATCH 12/23] Make prepare.sh script more verbose --- examples/prepare.sh | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/examples/prepare.sh b/examples/prepare.sh index e918f878..7f0605ac 100644 --- a/examples/prepare.sh +++ b/examples/prepare.sh @@ -12,24 +12,43 @@ echo "Compiling examples" PNUT_SH_OPTIONS="-DRELEASE_PNUT_SH -DRT_COMPACT" -gcc -o build/pnut-sh-base.exe $PNUT_SH_OPTIONS pnut.c # Compile pnut.exe +# Compile pnut.exe +gcc -o build/pnut-sh-base.exe $PNUT_SH_OPTIONS pnut.c 2> /dev/null || fail "Error: Failed to compile pnut" compile_options() { echo `sed -n -e "/\/\/ pnut-options:/p" "$1" | sed -e "s/^\/\/ pnut-options://" | tr '\n' ',' | sed -e 's/,$//'` } +fail() { echo "$1"; exit $2; } + +failed=0 + +generate_executable_with() { + if ./build/$1 $file > $COMP_DIR/$filename.sh; then + chmod +x $COMP_DIR/$filename.sh + printf "✅\n" + else + printf "Failed to compile ❌\n" + failed=1 + fi +} + for file in $(find examples -type f -name "*.c" | sort); do filename=$(basename $file .c); file_opts=$(compile_options $file) # To speed up the compilation process, we only compile 
pnut.exe if there are specific options if [ -z "$file_opts" ]; then - echo "Compiling $filename" - ./build/pnut-sh-base.exe $file > $COMP_DIR/$filename.sh + printf "Compiling $filename: " + generate_executable_with "pnut-sh-base.exe" else - echo "Compiling $filename with $file_opts" - gcc -o build/pnut-sh-opt.exe $PNUT_SH_OPTIONS $file_opts pnut.c # Compile pnut.exe with specific options - ./build/pnut-sh-opt.exe $file $file_opts > $COMP_DIR/$filename.sh + printf "Compiling $filename with $file_opts: " + # Compile pnut.exe with specific options + gcc -o build/pnut-sh-opt.exe $PNUT_SH_OPTIONS $file_opts pnut.c 2> /dev/null || fail "Error: Failed to compile pnut with $file_opts" + generate_executable_with "pnut-sh-opt.exe" fi - chmod +x $COMP_DIR/$filename.sh - done + +if [ $failed -eq 1 ]; then + echo "##### Some examples failed to compile #####" + exit 1 +fi From e2db147e3674a6a79b701924d45274c237ba61de Mon Sep 17 00:00:00 2001 From: Laurent Huberdeau Date: Sun, 18 Aug 2024 17:26:07 -0400 Subject: [PATCH 13/23] Fix RT_USE_LOOKUP_TABLE when using unicode chars --- sh-runtime.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sh-runtime.c b/sh-runtime.c index d3a1a278..ad4697ab 100644 --- a/sh-runtime.c +++ b/sh-runtime.c @@ -214,6 +214,7 @@ END_RUNTIME_FUN(int_to_char) DEFINE_RUNTIME_FUN(char_to_int) #ifndef RT_COMPACT #ifdef RT_USE_LOOKUP_TABLE + putstr(" LC_ALL=C\n"); putstr("__c2i_0=48\n"); putstr("__c2i_1=49\n"); putstr("__c2i_2=50\n"); From 469e2270e44a7ed053ddb3d3060ad56b0d6028ba Mon Sep 17 00:00:00 2001 From: Laurent Huberdeau Date: Sun, 18 Aug 2024 18:25:15 -0400 Subject: [PATCH 14/23] Nicer sequence of assignments --- pnut.c | 6 ++---- sh.c | 11 ++++++++--- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/pnut.c b/pnut.c index 0086a600..8b059ff1 100644 --- a/pnut.c +++ b/pnut.c @@ -429,10 +429,8 @@ void push_if_macro_mask(bool new_mask) { if_macro_stack[if_macro_stack_ix] = if_macro_mask; if_macro_stack[if_macro_stack_ix + 1] = 
if_macro_executed; if_macro_stack_ix += 2; - // Then set the new mask value - if_macro_mask = new_mask; - // If the condition is true, we don't want to execute the next #elif that's true - if_macro_executed = if_macro_mask; + // Then set the new mask value and reset the executed flag + if_macro_mask = if_macro_executed = new_mask; } void pop_if_macro_mask() { diff --git a/sh.c b/sh.c index da99dc57..442de237 100644 --- a/sh.c +++ b/sh.c @@ -1664,10 +1664,15 @@ void comp_assignment(ast lhs, ast rhs) { comp_fun_call(rhs, lhs); } else { /* - TODO: This may need to be disabled because of arithmetic precision issues with some shells. + If lhs is an identifier, we use x=$((...)) instead of : $((x = ...)). + This is unless the right hand side is an assignment, in which case we + generate everything in 1 arithmetic expansion for symmetry. + + Note: On certain shells there seems to be a conversion when entering and + exiting arithmetic expansions, meaning that the `x=$((...))` may not + always be equivalent to `: $((x = ...))`. */ - /* If lhs is an identifier, we generate x=$(( ... )) instead of : $(( x = ... 
)) */ - if (lhs_op == IDENTIFIER) { + if (lhs_op == IDENTIFIER && get_op(rhs) != '=') { append_glo_decl(string_concat3(comp_lvalue(lhs), wrap_char('='), comp_rvalue(rhs, RVALUE_CTX_BASE))); } else { append_glo_decl(string_concat5(wrap_str(": $(("), comp_lvalue(lhs), wrap_str(" = "), comp_rvalue(rhs, RVALUE_CTX_ARITH_EXPANSION), wrap_str("))"))); From bae4cbe359b019103cb5c910c77dbd5610281ce5 Mon Sep 17 00:00:00 2001 From: Laurent Huberdeau Date: Sun, 18 Aug 2024 18:27:14 -0400 Subject: [PATCH 15/23] Indicate file location when calling fatal_error --- pnut.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/pnut.c b/pnut.c index 8b059ff1..0af7eac7 100644 --- a/pnut.c +++ b/pnut.c @@ -198,7 +198,15 @@ void putintneg(int n) { } void fatal_error(char *msg) { +#ifdef INCLUDE_LINE_NUMBER_ON_ERROR +#ifdef SUPPORT_INCLUDE + putstr(include_stack->filepath); putchar(':'); +#endif + putint(last_tok_line_number); putchar(':'); putint(last_tok_column_number); putstr(msg); putchar('\n'); +#else + putstr(msg); putchar('\n'); +#endif exit(1); } @@ -215,11 +223,6 @@ void syntax_error(char *msg) { exit(1); } -void missing_feature_error(char *msg) { - putstr("not yet implemented: "); putstr(msg); - exit(1); -} - void print_dec(int n) { if (n < 0) { putchar('-'); From 83746910f5ba822f06354d3a6705ff6a123327fc Mon Sep 17 00:00:00 2001 From: Laurent Huberdeau Date: Sun, 18 Aug 2024 18:28:22 -0400 Subject: [PATCH 16/23] Crash when parse_definition fails to parse --- pnut.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pnut.c b/pnut.c index 0af7eac7..82481277 100644 --- a/pnut.c +++ b/pnut.c @@ -2557,6 +2557,8 @@ ast parse_definition(int local) { expect_tok(';'); return result; } else { + putstr("tok="); putint(tok); putchar('\n'); + syntax_error("unknown decl: type expected"); return result; } } From 64642cc58e333f604eba482d9e8834c7bfdbfda5 Mon Sep 17 00:00:00 2001 From: Laurent Huberdeau Date: Sun, 18 Aug 2024 19:43:03 -0400 Subject: [PATCH 17/23] 
Support #include <...> when -I option is used The -I option specifies the search path of files that are included with #include <...>. --- pnut.c | 270 ++++++++---------- portable_libc/include/stdio.h | 4 +- portable_libc/include/stdlib.h | 2 +- portable_libc/include/string.h | 2 +- portable_libc/include/unistd.h | 2 +- run-tests.sh | 17 +- .../preprocessor/include/include-bracket.c | 26 ++ .../include/include-bracket.golden | 3 + 8 files changed, 173 insertions(+), 153 deletions(-) create mode 100644 tests/_all/preprocessor/include/include-bracket.c create mode 100644 tests/_all/preprocessor/include/include-bracket.golden diff --git a/pnut.c b/pnut.c index 82481277..02b5838b 100644 --- a/pnut.c +++ b/pnut.c @@ -18,8 +18,6 @@ #define OPTIMIZE_LONG_LINES #endif -#define SUPPORT_INCLUDE - // Use positional parameter directly for function parameters that are constants #define OPTIMIZE_CONSTANT_PARAM_not #define SUPPORT_ADDRESS_OF_OP_not @@ -75,7 +73,6 @@ int last_tok_line_number = 1; int last_tok_column_number = 0; #endif -#ifdef SUPPORT_INCLUDE struct IncludeStack { FILE* fp; struct IncludeStack *next; @@ -88,7 +85,7 @@ struct IncludeStack { }; struct IncludeStack *include_stack, *include_stack2; FILE *fp = 0; // Current file pointer that's being read -#endif +char* include_search_path = 0; // Search path for include files // Tokens and AST nodes enum { @@ -199,9 +196,7 @@ void putintneg(int n) { void fatal_error(char *msg) { #ifdef INCLUDE_LINE_NUMBER_ON_ERROR -#ifdef SUPPORT_INCLUDE putstr(include_stack->filepath); putchar(':'); -#endif putint(last_tok_line_number); putchar(':'); putint(last_tok_column_number); putstr(msg); putchar('\n'); #else @@ -212,9 +207,7 @@ void fatal_error(char *msg) { void syntax_error(char *msg) { #ifdef INCLUDE_LINE_NUMBER_ON_ERROR -#ifdef SUPPORT_INCLUDE putstr(include_stack->filepath); putchar(':'); -#endif putint(last_tok_line_number); putchar(':'); putint(last_tok_column_number); putstr(" syntax error: "); putstr(msg); putchar('\n'); 
#else @@ -499,7 +492,6 @@ void output_declaration_c_code(bool no_header) { #endif void get_ch() { -#ifdef SUPPORT_INCLUDE ch = fgetc(fp); if (ch == EOF) { // If it's not the last file on the stack, EOF means that we need to switch to the next file @@ -528,17 +520,6 @@ void get_ch() { column_number += 1; } #endif -#else - ch = getchar(); -#ifdef INCLUDE_LINE_NUMBER_ON_ERROR - if (ch == '\n') { - line_number += 1; - column_number = 0; - } else { - column_number += 1; - } -#endif -#endif #ifdef SH_INCLUDE_C_CODE // Save C code chars so they can be displayed with the shell code declaration_char_buf[declaration_char_buf_ix] = ch; @@ -546,8 +527,6 @@ void get_ch() { #endif } -#ifdef SUPPORT_INCLUDE - #ifdef PNUT_CC // TODO: It would be nice to not have to duplicate this code int strlen(char *str) { @@ -604,9 +583,9 @@ char *file_parent_directory(char *path) { return path; } -void include_file(char *file_name, bool relative) { - if (relative) { - file_name = str_concat(include_stack->dirname, file_name); +void include_file(char *file_name, char *relative_to) { + if (relative_to) { + file_name = str_concat(relative_to, file_name); } fp = fopen(file_name, "r"); if (fp == 0) { @@ -631,7 +610,101 @@ void include_file(char *file_name, bool relative) { #endif include_stack = include_stack2; } -#endif + +int accum_digit(int base) { + int digit = 99; + if ('0' <= ch AND ch <= '9') { + digit = ch - '0'; + } else if ('A' <= ch AND ch <= 'Z') { + digit = ch - 'A' + 10; + } else if ('a' <= ch AND ch <= 'z') { + digit = ch - 'a' + 10; + } + if (digit >= base) { + return 0; /* character is not a digit in that base */ + } else { + /* + TODO: Put overflow check back + if ((val < limit) OR ((val == limit) AND (digit > limit * base - MININT))) { + fatal_error("literal integer overflow"); + } + */ + val = val * base - digit; + get_ch(); + return 1; + } +} + +void get_string_char() { + + val = ch; + get_ch(); + + if (val == '\\') { + if ('0' <= ch AND ch <= '7') { + /* + Parse octal 
character, up to 3 digits. + Note that \1111 is parsed as '\111' followed by '1' + See https://en.wikipedia.org/wiki/Escape_sequences_in_C#Notes + */ + val = 0; + accum_digit(8); + accum_digit(8); + accum_digit(8); + val = -(val % 256); /* keep low 8 bits, without overflowing */ + } else if ((ch == 'x') OR (ch == 'X')) { + get_ch(); + val = 0; + /* Allow 1 or 2 hex digits. */ + if (accum_digit(16)) { + accum_digit(16); + } else { + syntax_error("invalid hex escape -- it must have at least one digit"); + } + val = -(val % 256); /* keep low 8 bits, without overflowing */ + } else { + if (ch == 'a') { + val = 7; + } else if (ch == 'b') { + val = 8; + } else if (ch == 'f') { + val = 12; + } else if (ch == 'n') { + val = 10; + } else if (ch == 'r') { + val = 13; + } else if (ch == 't') { + val = 9; + } else if (ch == 'v') { + val = 11; + } else if ((ch == '\\') OR (ch == '\'') OR (ch == '\"')) { + val = ch; + } else { + syntax_error("unimplemented string character escape"); + } + get_ch(); + } + } +} + +void accum_string_until(char end) { + while ((ch != end) AND (ch != EOF)) { + get_string_char(); + tok = ch; + ch = val; + accum_string(); + ch = tok; + } + + if (ch != end) { + syntax_error("unterminated string literal"); + } + + ch = 0; + accum_string(); + + get_ch(); +} // We add the preprocessor keywords to the ident table so they can be easily // recognized by the preprocessor. Because these are not C keywords, their kind @@ -1016,25 +1089,21 @@ int evaluate_if_condition() { } void handle_include() { -#ifdef SUPPORT_INCLUDE if (tok == STRING) { - include_file(string_pool + val, true); + include_file(string_pool + val, include_stack->dirname); get_tok_macro(); // Skip the string } else if (tok == '<') { - // Ignore the file name for now. - // Note that the token is not a string with the file name, but an identifier - // with part of the file. This means we'll need to assemble the filename - // string, or change get_tok to consider '<' and '>' as string delimiters. 
- while (tok != '>') get_tok_macro(); - get_tok_macro(); // Skip the '>' + accum_string_until('>'); + // #include directives only take effect if the search path is provided + // TODO: Issue a warning to stderr when skipping the directive + if (include_search_path != 0) { + include_file(string_pool + string_start, include_search_path); + } + get_tok_macro(); // Skip the string } else { putstr("tok="); putint(tok); putchar('\n'); syntax_error("expected string to #include directive"); } - -#else - syntax_error("The #include directive is not supported in this version of the compiler."); -#endif } #ifdef sh @@ -1283,82 +1352,6 @@ void init_pnut_macros() { init_ident(MACRO, "PNUT_CC"); } -int accum_digit(int base) { - int digit = 99; - if ('0' <= ch AND ch <= '9') { - digit = ch - '0'; - } else if ('A' <= ch AND ch <= 'Z') { - digit = ch - 'A' + 10; - } else if ('a' <= ch AND ch <= 'z') { - digit = ch - 'a' + 10; - } - if (digit >= base) { - return 0; /* character is not a digit in that base */ - } else { - /* - TODO: Put overflow check back - if ((val < limit) OR ((val == limit) AND (digit > limit * base - MININT))) { - fatal_error("literal integer overflow"); - } - */ - val = val * base - digit; - get_ch(); - return 1; - } -} - -void get_string_char() { - - val = ch; - get_ch(); - - if (val == '\\') { - if ('0' <= ch AND ch <= '7') { - /* - Parse octal character, up to 3 digits. - Note that \1111 is parsed as '\111' followed by '1' - See https://en.wikipedia.org/wiki/Escape_sequences_in_C#Notes - */ - val = 0; - accum_digit(8); - accum_digit(8); - accum_digit(8); - val = -(val % 256); /* keep low 8 bits, without overflowing */ - } else if ((ch == 'x') OR (ch == 'X')) { - get_ch(); - val = 0; - /* Allow 1 or 2 hex digits. 
*/ - if (accum_digit(16)) { - accum_digit(16); - } else { - syntax_error("invalid hex escape -- it must have at least one digit"); - } - val = -(val % 256); /* keep low 8 bits, without overflowing */ - } else { - if (ch == 'a') { - val = 7; - } else if (ch == 'b') { - val = 8; - } else if (ch == 'f') { - val = 12; - } else if (ch == 'n') { - val = 10; - } else if (ch == 'r') { - val = 13; - } else if (ch == 't') { - val = 9; - } else if (ch == 'v') { - val = 11; - } else if ((ch == '\\') OR (ch == '\'') OR (ch == '\"')) { - val = ch; - } else { - syntax_error("unimplemented string character escape"); - } - get_ch(); - } - } -} - // A macro argument is represented using a list of tokens. // Macro arguments are split by commas, but commas can also appear in function // calls and as operators. To distinguish between the two, we need to keep track @@ -1770,23 +1763,7 @@ void get_tok() { get_ch(); begin_string(); - - while ((ch != '\"') AND (ch != EOF)) { - get_string_char(); - tok = ch; - ch = val; - accum_string(); - ch = tok; - } - - if (ch != '\"') { - syntax_error("unterminated string literal"); - } - - ch = 0; - accum_string(); - - get_ch(); + accum_string_until('\"'); val = string_start; tok = STRING; @@ -3248,30 +3225,33 @@ int main(int argc, char **argv) { for (i = 1; i < argc; i += 1) { if (argv[i][0] == '-') { - if (argv[i][1] == 'D') { - init_ident(MACRO, argv[i] + 2); - } else { - putstr("Option "); - putstr(argv[i]); - putchar('\n'); - fatal_error("unknown option"); + switch (argv[i][1]) { + case 'D': + init_ident(MACRO, argv[i] + 2); + break; + + case 'I': + if (include_search_path != 0) { + fatal_error("only one include path allowed"); + } + include_search_path = argv[i] + 2; + break; + + default: + putstr("Option "); putstr(argv[i]); putchar('\n'); + fatal_error("unknown option"); + break; } } else { // Options that don't start with '-' are file names -#ifdef SUPPORT_INCLUDE - include_file(argv[i], false); -#else - fatal_error("input file not supported. 
Pnut expects the input from stdin."); -#endif + include_file(argv[i], 0); } } -#ifdef SUPPORT_INCLUDE if (fp == 0) { putstr("Usage: "); putstr(argv[0]); putstr(" \n"); fatal_error("no input file"); } -#endif #ifndef DEBUG_CPP #ifndef DEBUG_GETCHAR diff --git a/portable_libc/include/stdio.h b/portable_libc/include/stdio.h index d2ac9af2..508f00ab 100644 --- a/portable_libc/include/stdio.h +++ b/portable_libc/include/stdio.h @@ -1,8 +1,8 @@ #ifndef _STDIO_H #define _STDIO_H -#include "include/sys/types.h" -#include "include/stdarg.h" +#include "sys/types.h" +#include "stdarg.h" #ifdef USE_STRUCT diff --git a/portable_libc/include/stdlib.h b/portable_libc/include/stdlib.h index 060dbb86..b0f2f92d 100644 --- a/portable_libc/include/stdlib.h +++ b/portable_libc/include/stdlib.h @@ -1,7 +1,7 @@ #ifndef _STDLIB_H #define _STDLIB_H -#include "include/sys/types.h" +#include "sys/types.h" void *malloc(size_t size); void free(void *ptr); diff --git a/portable_libc/include/string.h b/portable_libc/include/string.h index 93564b2f..1d83bed3 100644 --- a/portable_libc/include/string.h +++ b/portable_libc/include/string.h @@ -1,4 +1,4 @@ -#include "include/sys/types.h" +#include "sys/types.h" void *memset(void *dest, int c, size_t n); void *memcpy(void *dest, const void *src, size_t n); diff --git a/portable_libc/include/unistd.h b/portable_libc/include/unistd.h index 51fb8358..98360665 100644 --- a/portable_libc/include/unistd.h +++ b/portable_libc/include/unistd.h @@ -1,7 +1,7 @@ #ifndef _UNISTD_H #define _UNISTD_H -#include "include/sys/types.h" +#include "sys/types.h" typedef int mode_t; diff --git a/run-tests.sh b/run-tests.sh index 9136bdff..3d16675d 100755 --- a/run-tests.sh +++ b/run-tests.sh @@ -71,6 +71,17 @@ test_args() { echo `sed -n -e "/\/\/ args:/p" "$1" | sed -e "s/^\/\/ args://" | tr '\n' ',' | sed -e 's/,$//'` } +# Some tests specify command line arguments in the source file meant to be passed to the compiler. 
+# This function extracts the arguments from the source file. +# To specify arguments, add a comment in the source file like this: +# // pnut_opt: arg1 arg2 arg3 +test_args_comp() { + # echo "test_args_comp file $1" >&2 + echo `sed -n -e "/\/\/ pnut_opt:/p" "$1" | sed -e "s/^\/\/ pnut_opt://" | tr '\n' ',' | sed -e 's/,$//'` +} + +# echo "test_args_comp: $(test_args_comp tests/_all/preprocessor/include/include-bracket.c)" + execute_test() { # executable: $1, args: $2 ... if [ "$backend" = "sh" ]; then # Default to bash for sh backend @@ -83,7 +94,7 @@ execute_test() { # executable: $1, args: $2 ... compile_test() { # c_file: $1 # 5s timeout to prevent infinite loops in pnut - timeout 5 "$pnut_comp" "$1" + timeout 5 "$pnut_comp" "$1" $2 } run_test() { # file_to_test: $1 @@ -98,7 +109,7 @@ run_test() { # file_to_test: $1 # Generate golden file if it doesn't exist if [ ! -f "$golden_file" ]; then - compile_test "$file" > "$dir/$filename.$ext" 2> "$dir/$filename.err" + compile_test "$file" "$(test_args_comp $file)" > "$dir/$filename.$ext" 2> "$dir/$filename.err" if [ $? -eq 0 ]; then chmod +x "$dir/$filename.$ext" execute_test "$dir/$filename.$ext" "$(test_args $file)" > "$golden_file" @@ -110,7 +121,7 @@ run_test() { # file_to_test: $1 fi # Compile the test file with pnut.exe - compile_test "$file" > "$dir/$filename.$ext" 2> "$dir/$filename.err" + compile_test "$file" "$(test_args_comp $file)" > "$dir/$filename.$ext" 2> "$dir/$filename.err" if [ $? 
-eq 0 ]; then # If compilation was successful chmod +x "$dir/$filename.$ext" diff --git a/tests/_all/preprocessor/include/include-bracket.c b/tests/_all/preprocessor/include/include-bracket.c new file mode 100644 index 00000000..c3d14f73 --- /dev/null +++ b/tests/_all/preprocessor/include/include-bracket.c @@ -0,0 +1,26 @@ +// tests for #include "" directives +// pnut_opt: -Iportable_libc/include/ + +#include + +void putint_aux(int n) { + if (n <= -10) putint_aux(n / 10); + putchar('0' - (n % 10)); +} + +void putint(int n) { + if (n < 0) { + putchar('-'); + putint_aux(n); + } else { + putint_aux(-n); + } + + putchar('\n'); +} + +void main() { + putint(stdin); + putint(stdout); + putint(stderr); +} diff --git a/tests/_all/preprocessor/include/include-bracket.golden b/tests/_all/preprocessor/include/include-bracket.golden new file mode 100644 index 00000000..bb0b1cf6 --- /dev/null +++ b/tests/_all/preprocessor/include/include-bracket.golden @@ -0,0 +1,3 @@ +0 +0 +0 From d4e81094173b422a8fb3d693472603d9e5a1b1a3 Mon Sep 17 00:00:00 2001 From: Laurent Huberdeau Date: Sat, 31 Aug 2024 11:32:52 -0400 Subject: [PATCH 18/23] Move up AST nodes functions --- pnut.c | 198 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 99 insertions(+), 99 deletions(-) diff --git a/pnut.c b/pnut.c index fd6dad0b..82b9dcf4 100644 --- a/pnut.c +++ b/pnut.c @@ -300,6 +300,105 @@ int set_cdr(int pair, int value) { return value; } +int get_op(ast node) { + return heap[node] & 1023; +} + +ast get_nb_children(ast node) { + return heap[node] >> 10; +} + +int get_val(ast node) { + return heap[node+1]; +} + +void set_val(ast node, int val) { + heap[node+1] = val; +} + +ast get_child(ast node, int i) { + return heap[node+i+1]; +} + +void set_child(ast node, int i, ast child) { + heap[node+i+1] = child; +} + +ast ast_result; + +ast new_ast0(int op, int val) { + + ast_result = alloc_obj(2); + + heap[ast_result] = op; + set_val(ast_result, val); + + return ast_result; +} + +ast 
new_ast1(int op, ast child0) { + + ast_result = alloc_obj(2); + + heap[ast_result] = op + 1024; + set_child(ast_result, 0, child0); + + return ast_result; +} + +ast new_ast2(int op, ast child0, ast child1) { + + ast_result = alloc_obj(3); + + heap[ast_result] = op + 2048; + set_child(ast_result, 0, child0); + set_child(ast_result, 1, child1); + + return ast_result; +} + +ast new_ast3(int op, ast child0, ast child1, ast child2) { + + ast_result = alloc_obj(4); + + heap[ast_result] = op + 3072; + set_child(ast_result, 0, child0); + set_child(ast_result, 1, child1); + set_child(ast_result, 2, child2); + + return ast_result; +} + +ast new_ast4(int op, ast child0, ast child1, ast child2, ast child3) { + + ast_result = alloc_obj(5); + + heap[ast_result] = op + 4096; + set_child(ast_result, 0, child0); + set_child(ast_result, 1, child1); + set_child(ast_result, 2, child2); + set_child(ast_result, 3, child3); + + return ast_result; +} + +ast clone_ast(ast orig) { + int nb_children = get_nb_children(orig); + int i; + + // Account for the value of ast nodes with no child + if (nb_children == 0) nb_children = 1; + + ast_result = alloc_obj(nb_children + 1); + + heap[ast_result] = heap[orig]; // copy operator and nb of children + for (i = 0; i < nb_children; i += 1) { + set_child(ast_result, i, get_child(orig, i)); + } + + return ast_result; +} + void begin_string() { string_start = string_pool_alloc; hash = 0; @@ -1991,105 +2090,6 @@ void get_tok() { /* parser */ -int get_op(ast node) { - return heap[node] & 1023; -} - -ast get_nb_children(ast node) { - return heap[node] >> 10; -} - -int get_val(ast node) { - return heap[node+1]; -} - -void set_val(ast node, int val) { - heap[node+1] = val; -} - -ast get_child(ast node, int i) { - return heap[node+i+1]; -} - -void set_child(ast node, int i, ast child) { - heap[node+i+1] = child; -} - -ast ast_result; - -ast new_ast0(int op, int val) { - - ast_result = alloc_obj(2); - - heap[ast_result] = op; - set_val(ast_result, val); - - 
return ast_result; -} - -ast new_ast1(int op, ast child0) { - - ast_result = alloc_obj(2); - - heap[ast_result] = op + 1024; - set_child(ast_result, 0, child0); - - return ast_result; -} - -ast new_ast2(int op, ast child0, ast child1) { - - ast_result = alloc_obj(3); - - heap[ast_result] = op + 2048; - set_child(ast_result, 0, child0); - set_child(ast_result, 1, child1); - - return ast_result; -} - -ast new_ast3(int op, ast child0, ast child1, ast child2) { - - ast_result = alloc_obj(4); - - heap[ast_result] = op + 3072; - set_child(ast_result, 0, child0); - set_child(ast_result, 1, child1); - set_child(ast_result, 2, child2); - - return ast_result; -} - -ast new_ast4(int op, ast child0, ast child1, ast child2, ast child3) { - - ast_result = alloc_obj(5); - - heap[ast_result] = op + 4096; - set_child(ast_result, 0, child0); - set_child(ast_result, 1, child1); - set_child(ast_result, 2, child2); - set_child(ast_result, 3, child3); - - return ast_result; -} - -ast clone_ast(ast orig) { - int nb_children = get_nb_children(orig); - int i; - - // Account for the value of ast nodes with no child - if (nb_children == 0) nb_children = 1; - - ast_result = alloc_obj(nb_children + 1); - - heap[ast_result] = heap[orig]; // copy operator and nb of children - for (i = 0; i < nb_children; i += 1) { - set_child(ast_result, i, get_child(orig, i)); - } - - return ast_result; -} - #ifdef NICE_ERR_MSG #include "debug.c" #endif From 1d9dd564efa72321b76761990ae24fbd4ddb5dbb Mon Sep 17 00:00:00 2001 From: Laurent Huberdeau Date: Sat, 31 Aug 2024 13:00:19 -0400 Subject: [PATCH 19/23] Reuse C parser to parse #if expressions Now that the tokenizer can produce NEWLINE tokens, we can reuse the C parser to parse #if expressions. Without newlines, the C parser would keep reading until the end of the expression, skipping over the newlines. Now, if it encounters a newline, a newline token is produced and the C parser fails to parse the expression. 
This replaces the code that implemented the shunting yard algorithm with a function that can evaluate constant expressions. This function evaluates AST nodes that represent constant expressions, and will be used to support non-integer literal expressions for array lengths. --- pnut.c | 279 ++++++++++++++++++++++----------------------------------- 1 file changed, 106 insertions(+), 173 deletions(-) diff --git a/pnut.c b/pnut.c index 82b9dcf4..48a126aa 100644 --- a/pnut.c +++ b/pnut.c @@ -991,202 +991,120 @@ void handle_define() { #endif } -// For evaluating #if condition, we use the shunting yard algorithm -// https://en.wikipedia.org/wiki/Shunting_yard_algorithm -#define IF_CONDITION_STACK_SIZE 100 -int op_stack[IF_CONDITION_STACK_SIZE]; -int op_stack_ix = 0; -int val_stack[IF_CONDITION_STACK_SIZE]; -int val_stack_ix = 0; - -// Operator precedence of C operators allowed in #if condition -// Convert to lookup table? -int precedence(int op) { - if (op == '(') return -1; // Left paren are lowest precedence - else if ((op == '!') OR (op == '~')) return 2; - else if ((op == '*') OR (op == '/') OR (op == '%')) return 3; - else if ((op == '+') OR (op == '-')) return 4; - else if ((op == LSHIFT) OR (op == RSHIFT)) return 5; - else if ((op == '<') OR (op == '>') - OR (op == LT_EQ) OR (op == GT_EQ)) return 6; - else if ((op == EQ_EQ) OR (op == EXCL_EQ)) return 7; - else if (op == '&') return 8; - else if (op == '^') return 9; - else if (op == '|') return 10; - else if (op == AMP_AMP) return 9; - else if (op == BAR_BAR) return 12; - else { - putstr("op="); putint(op); putchar('\n'); - syntax_error("#if: unknown operator"); - return -1; - } -} - -void pop_op() { - int op = op_stack[op_stack_ix - 1]; - op_stack_ix -= 1; - - if (op == '!' 
OR op == '~') { - // Unary operators - if (val_stack_ix < 1) { - fatal_error("invalid unary expression, not enough values in #if condition"); - } - if (op == '!') { - val_stack[val_stack_ix - 1] = !val_stack[val_stack_ix - 1]; - } else if (op == '~') { - val_stack[val_stack_ix - 1] = ~val_stack[val_stack_ix - 1]; - } - } else { - // Binary operators - if (val_stack_ix < 2) { - fatal_error("invalid binary expression, not enough values in #if condition"); - } - - if (op == '*') { - val_stack[val_stack_ix - 2] *= val_stack[val_stack_ix - 1]; - } else if (op == '/') { - val_stack[val_stack_ix - 2] /= val_stack[val_stack_ix - 1]; - } else if (op == '%') { - val_stack[val_stack_ix - 2] %= val_stack[val_stack_ix - 1]; - } else if (op == '+') { - val_stack[val_stack_ix - 2] += val_stack[val_stack_ix - 1]; - } else if (op == '-') { - val_stack[val_stack_ix - 2] -= val_stack[val_stack_ix - 1]; - } else if (op == LSHIFT) { - val_stack[val_stack_ix - 2] <<= val_stack[val_stack_ix - 1]; - } else if (op == RSHIFT) { - val_stack[val_stack_ix - 2] >>= val_stack[val_stack_ix - 1]; - } else if (op == '<') { - val_stack[val_stack_ix - 2] = val_stack[val_stack_ix - 2] < val_stack[val_stack_ix - 1]; - } else if (op == '>') { - val_stack[val_stack_ix - 2] = val_stack[val_stack_ix - 2] > val_stack[val_stack_ix - 1]; - } else if (op == LT_EQ) { - val_stack[val_stack_ix - 2] = val_stack[val_stack_ix - 2] <= val_stack[val_stack_ix - 1]; - } else if (op == GT_EQ) { - val_stack[val_stack_ix - 2] = val_stack[val_stack_ix - 2] >= val_stack[val_stack_ix - 1]; - } else if (op == EQ_EQ) { - val_stack[val_stack_ix - 2] = val_stack[val_stack_ix - 2] == val_stack[val_stack_ix - 1]; - } else if (op == EXCL_EQ) { - val_stack[val_stack_ix - 2] = val_stack[val_stack_ix - 2] != val_stack[val_stack_ix - 1]; - } else if (op == '&') { - val_stack[val_stack_ix - 2] &= val_stack[val_stack_ix - 1]; - } else if (op == '^') { - val_stack[val_stack_ix - 2] ^= val_stack[val_stack_ix - 1]; - } else if (op == '|') { 
- val_stack[val_stack_ix - 2] |= val_stack[val_stack_ix - 1]; - } else if (op == AMP_AMP) { - // C documentation specifies that && and || are short-circuit operators, not - // sure how they make sense in a #if condition since the operators don't - // have side effects. - val_stack[val_stack_ix - 2] = val_stack[val_stack_ix - 2] && val_stack[val_stack_ix - 1]; - } else if (op == BAR_BAR) { - val_stack[val_stack_ix - 2] = val_stack[val_stack_ix - 2] || val_stack[val_stack_ix - 1]; - } else { - putstr("op="); putint(op); putchar('\n'); - fatal_error("pop_op: unknown operator"); - } - val_stack_ix -= 1; - } -} +int eval_constant(ast expr, bool if_macro) { + int val; + int op = get_op(expr); + int op1; + int op2; + int result; + int i; -void push_op(int op) { - int op_precedence = precedence(op); + switch (op) { + case INTEGER: return -get_val(expr); + case CHARACTER: return get_val(expr); + case '~': return !eval_constant(get_child(expr, 0), if_macro); + case '!': return !eval_constant(get_child(expr, 0), if_macro); + case '-': + case '+': + op1 = eval_constant(get_child(expr, 0), if_macro); + if (get_nb_children(expr) == 1) { + return op == '-' ? -op1 : op1; + } + op2 = eval_constant(get_child(expr, 1), if_macro); + return op == '-' ? 
op1 - op2 : op1 + op2; - if (op_stack_ix >= IF_CONDITION_STACK_SIZE) { - fatal_error("too many operators in #if condition"); - } + case '?': + op1 = eval_constant(get_child(expr, 0), if_macro); + if (op1) { + return eval_constant(get_child(expr, 1), if_macro); + } + return eval_constant(get_child(expr, 2), if_macro); + + case '*': + case '/': + case '%': + case '&': + case '|': + case '^': + case LSHIFT: + case RSHIFT: + case EQ_EQ: + case EXCL_EQ: + case LT_EQ: + case GT_EQ: + case '<': + case '>': + op1 = eval_constant(get_child(expr, 0), if_macro); + op2 = eval_constant(get_child(expr, 1), if_macro); + switch (op) { + case '*': return op1 * op2; + case '/': return op1 / op2; + case '%': return op1 % op2; + case '&': return op1 & op2; + case '|': return op1 | op2; + case '^': return op1 ^ op2; + case LSHIFT: return op1 << op2; + case RSHIFT: return op1 >> op2; + case EQ_EQ: return op1 == op2; + case EXCL_EQ: return op1 != op2; + case LT_EQ: return op1 <= op2; + case GT_EQ: return op1 >= op2; + case '<': return op1 < op2; + case '>': return op1 > op2; + } + return 0; // Should never reach here - while (op_stack_ix != 0 - AND op_stack[op_stack_ix - 1] != '(' - AND precedence(op_stack[op_stack_ix - 1]) <= op_precedence) { - pop_op(); - } + case AMP_AMP: + op1 = eval_constant(get_child(expr, 0), if_macro); + if (!op1) return 0; + return eval_constant(get_child(expr, 1), if_macro); - op_stack[op_stack_ix] = op; - op_stack_ix += 1; -} + case BAR_BAR: + op1 = eval_constant(get_child(expr, 0), if_macro); + if (op1) return 1; + return eval_constant(get_child(expr, 1), if_macro); -void push_val(int val) { - if (val_stack_ix >= IF_CONDITION_STACK_SIZE) { - fatal_error("too many values in #if condition"); - } + case '(': // defined operators are represented as fun calls + if (if_macro && get_val(get_child(expr, 0)) == DEFINED_ID) { + return get_child(expr, 1) == MACRO; + } - val_stack[val_stack_ix] = val; - val_stack_ix += 1; -} + fatal_error("unknown function call in 
constant expressions"); + return 0; -void handle_if_op() { - switch (tok) { - case '(': - push_op(tok); - break; - case ')': - while (op_stack_ix != 0 AND op_stack[op_stack_ix - 1] != '(') pop_op(); - if (op_stack_ix == 0) fatal_error("unmatched parenthesis in #if condition"); - op_stack_ix -= 1; // Pop the '(' - break; case IDENTIFIER: - if (val == DEFINED_ID) { - get_tok_macro(); // Skip the defined keyword - if (tok == '(') { - get_tok_macro(); // Skip the '(' - push_val(tok == MACRO); - get_tok_macro(); // Skip the macro name - if (tok != ')') { - // Not using expect_tok because it may be the end of the line - putstr("tok="); putint(tok); putchar('\n'); - fatal_error("expected ')' in #if defined condition"); - } - } else if (tok == IDENTIFIER OR tok == MACRO) { - // #if defined MACRO is valid syntax - push_val(tok == MACRO); - } else { - putstr("tok="); putint(tok); putchar('\n'); - fatal_error("expected identifier or macro in #if defined condition"); - } - } else { - push_val(0); // Undefined identifiers are 0 + if (if_macro) { + // Undefined identifiers are 0 + // At this point, macros have already been expanded so we can't have a macro identifier + return 0; } - break; - case INTEGER: - push_val(-val); - break; - case CHARACTER: - push_val(val); - break; + // TODO: Enums when not not if_macro + fatal_error("identifiers are not allowed in constant expression"); + return 0; + default: - push_op(tok); // Invalid operators are caught by push_op - break; + putstr("op="); putint(op); putchar('\n'); + fatal_error("unsupported operator in constant expression"); } } +ast parse_assignment_expression(); + int evaluate_if_condition() { bool prev_skip_newlines = skip_newlines; int previous_mask = if_macro_mask; + ast expr; // Temporarily set to true so that we can read the condition even if it's inside an ifdef false block // Unlike in other directives using get_tok_macro, we want to expand macros in the condition if_macro_mask = true; skip_newlines = false; // We want 
to stop when we reach the first newline - get_tok(); - while (tok != '\n' AND tok != EOF) { - handle_if_op(); - get_tok(); - } + get_tok(); // Skip the #if keyword + expr = parse_assignment_expression(); // Restore the previous value if_macro_mask = previous_mask; skip_newlines = prev_skip_newlines; - - // Pop remaining operators - while (op_stack_ix != 0) { - pop_op(); - } - - if (val_stack_ix != 1) { - fatal_error("invalid #if condition"); - } - val_stack_ix = 0; // Reset the value stack - return val_stack[0]; + return eval_constant(expr, true); } void handle_include() { @@ -2698,7 +2616,7 @@ ast parse_unary_expression() { result = parse_cast_expression(); result = new_ast1(op, result); - } else if (tok == SIZEOF_KW) { + } else if (skip_newlines && tok == SIZEOF_KW) { // only parse sizeof if we're not in a #if expression get_tok(); if (tok == '(') { @@ -2710,6 +2628,21 @@ ast parse_unary_expression() { } result = new_ast1(SIZEOF_KW, result); + } else if (!skip_newlines && tok == IDENTIFIER && val == DEFINED_ID) { // Parsing a macro + + get_tok_macro(); + if (tok == '(') { + get_tok_macro(); + result = new_ast2('(', new_ast0(IDENTIFIER, DEFINED_ID), tok); + get_tok_macro(); + expect_tok(')'); + } else if (tok == IDENTIFIER || tok == MACRO) { + result = new_ast2('(', new_ast0(IDENTIFIER, DEFINED_ID), tok); + get_tok_macro(); + } else { + syntax_error("identifier or '(' expected"); + } + } else { result = parse_postfix_expression(); } From 7bb795974d9ff375169f0f5c55a55f7580955213 Mon Sep 17 00:00:00 2001 From: Laurent Huberdeau Date: Sat, 31 Aug 2024 13:05:59 -0400 Subject: [PATCH 20/23] Increase compile_test timeout when using pnut.sh --- run-tests.sh | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/run-tests.sh b/run-tests.sh index 519576e3..46b8d949 100755 --- a/run-tests.sh +++ b/run-tests.sh @@ -140,8 +140,12 @@ execute_test() { # executable: $1, timeout: $2, args: $3 } compile_test() { # c_file: $1 - # 5s timeout to prevent infinite 
loops in pnut - timeout 5 "$pnut_comp" "$1" $2 + # 15s timeout to prevent infinite loops in pnut + if [ $bootstrap -eq 1 ]; then + timeout 15 $shell "$pnut_comp" $PNUT_EXE_OPTIONS "$1" $2 + else + timeout 5 "$pnut_comp" "$1" $2 + fi } run_test() { # file_to_test: $1 From e9e5dcc59ad60da609c7701fc0612994ff2685f3 Mon Sep 17 00:00:00 2001 From: Laurent Huberdeau Date: Sat, 31 Aug 2024 13:06:22 -0400 Subject: [PATCH 21/23] Adjust whitespace in fatal_error --- pnut.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pnut.c b/pnut.c index 48a126aa..5d8c3cc5 100644 --- a/pnut.c +++ b/pnut.c @@ -200,7 +200,7 @@ void fatal_error(char *msg) { #ifdef INCLUDE_LINE_NUMBER_ON_ERROR putstr(include_stack->filepath); putchar(':'); putint(last_tok_line_number); putchar(':'); putint(last_tok_column_number); - putstr(msg); putchar('\n'); + putstr(" "); putstr(msg); putchar('\n'); #else putstr(msg); putchar('\n'); #endif From 5b57fcbdf38168dd755663173f48b9e69a351678 Mon Sep 17 00:00:00 2001 From: Laurent Huberdeau Date: Sat, 31 Aug 2024 13:40:20 -0400 Subject: [PATCH 22/23] Revert "Fix RT_USE_LOOKUP_TABLE when using unicode chars" This reverts commit e2db147e3674a6a79b701924d45274c237ba61de. --- sh-runtime.c | 1 - 1 file changed, 1 deletion(-) diff --git a/sh-runtime.c b/sh-runtime.c index b65c31d1..18ea05fb 100644 --- a/sh-runtime.c +++ b/sh-runtime.c @@ -214,7 +214,6 @@ END_RUNTIME_FUN(int_to_char) DEFINE_RUNTIME_FUN(char_to_int) #ifndef RT_COMPACT #ifdef RT_USE_LOOKUP_TABLE - putstr(" LC_ALL=C\n"); putstr("__c2i_0=48\n"); putstr("__c2i_1=49\n"); putstr("__c2i_2=50\n"); From f845ac9e6b88c8b1f7c528a5de01a929f7f11e81 Mon Sep 17 00:00:00 2001 From: Laurent Huberdeau Date: Mon, 2 Sep 2024 13:54:46 -0400 Subject: [PATCH 23/23] Fix macro expansion stack usage When attempting to expand a macro, the list of argument is parsed by get_macro_args_toks which produced the next token after ')'. 
This token was then pushed on the tokens stack so it would be processed after the expanded macro's tokens. When multiple macros were expanded sequentially, this caused the last stack entry to never be empty, which broke the stack reuse mechanism (similar to TCO). This bug was not visible when bootstrapping pnuts because not enough macros were expanded in a row to trigger the issue. This is however a common pattern in TCC. --- pnut.c | 26 ++++++++++-------------- tests/_all/preprocessor/macro/fun-like.c | 2 +- 2 files changed, 12 insertions(+), 16 deletions(-) diff --git a/pnut.c b/pnut.c index 5d8c3cc5..6b02c00f 100644 --- a/pnut.c +++ b/pnut.c @@ -1441,7 +1441,7 @@ int get_macro_args_toks(int macro) { macro_args_count += 1; } - expect_tok(')'); + if (tok != ')') syntax_error("unterminated macro argument list"); if (prev_is_comma) { args = cons(0, args); // Push empty arg @@ -1463,7 +1463,7 @@ int get_macro_arg(int ix) { return car(arg); } -void push_macro(int tokens, int args) { +void play_macro(int tokens, int args) { if (tokens != 0) { if (macro_tok_lst != 0) { if (macro_stack_ix + 2 >= MACRO_RECURSION_MAX) { @@ -1479,10 +1479,6 @@ void push_macro(int tokens, int args) { } } -void push_tokens(int tokens) { - push_macro(tokens, 0); -} - // Try to expand a macro. // If a function-like macro is not called with (), it is not expanded and the identifier is returned as is. // If the wrong number of arguments is passed to a function-like macro, a fatal error is raised. @@ -1494,19 +1490,19 @@ bool attempt_macro_expansion(int macro) { int tokens = car(heap[macro + 3]); macro = val; if (cdr(heap[macro + 3]) == -1) { // Object-like macro - push_macro(tokens, 0); + play_macro(tokens, 0); return true; } else { new_macro_args = get_macro_args_toks(macro); - // get_macro_args_toks fetched the next token, we save it so it's not lost - push_macro(cons(cons(tok, val), 0), new_macro_args); - if (new_macro_args == -1) { // There was no argument list, i.e. 
not a function-like macro call - // Function-like macro without (), so we don't expand it. + // There was no argument list, i.e. not a function-like macro call even though it is a function-like macro + if (new_macro_args == -1) { + // get_macro_args_toks looked at the next token so we need to save it + play_macro(cons(cons(tok, val), 0), 0); tok = IDENTIFIER; val = macro; return false; } else { - push_macro(tokens, new_macro_args); + play_macro(tokens, new_macro_args); return true; } } @@ -1557,7 +1553,7 @@ void paste_tokens(int left_tok, int left_val) { val = left_val; return; } else { - push_macro(get_macro_arg(val), 0); // Play the tokens of the macro argument + play_macro(get_macro_arg(val), 0); // Play the tokens of the macro argument get_tok_macro(); } } @@ -1669,7 +1665,7 @@ void get_tok() { } break; } else if (tok == MACRO_ARG AND expand_macro_arg) { - push_macro(get_macro_arg(val), 0); // Play the tokens of the macro argument + play_macro(get_macro_arg(val), 0); // Play the tokens of the macro argument continue; } else if (tok == '#') { // Stringizing! stringify(); @@ -2683,7 +2679,7 @@ ast parse_cast_expression() { } else { // We need to put the current token and '(' back on the token stream. tokens = cons(cons(tok, val), 0); - push_tokens(tokens); + play_macro(tokens, 0); tok = '('; val = 0; } diff --git a/tests/_all/preprocessor/macro/fun-like.c b/tests/_all/preprocessor/macro/fun-like.c index d349a234..ccb5cf95 100644 --- a/tests/_all/preprocessor/macro/fun-like.c +++ b/tests/_all/preprocessor/macro/fun-like.c @@ -3,7 +3,7 @@ // putchar #include -// // Macro with a comma in the argument +// Macro with a comma in the argument #define FST(X, Y) X #define SND(X, Y) Y #define ADD_PAIR(X, Y) FST(X, Y) + SND(X, Y)