diff --git a/include/ast.h b/include/ast.h index 5838b49..7537820 100644 --- a/include/ast.h +++ b/include/ast.h @@ -77,4 +77,10 @@ int ast_node_init(ASTNode* node, ASTNodeType type); */ void ast_node_free(ASTNode* node); +/** + * @param node The node to convert to string + * + * @return A string description of the given AST Node + */ +char* str_ast_node(ASTNode* node); #endif // REGEX_AST diff --git a/src/ast.c b/src/ast.c index a407121..309b525 100644 --- a/src/ast.c +++ b/src/ast.c @@ -2,6 +2,16 @@ #include "ast.h" + +char* ast_str[] = { + "CharNode", + "StarNode", + "PlusNode", + "QuestionNode", + "OrNode", + "ConcatNode", +}; + // Allocate a new AST Node, and initialize it with the given type ASTNode* ast_node_create(ASTNodeType type) { ASTNode* node = malloc(sizeof(ASTNode)); @@ -51,3 +61,7 @@ void ast_node_free(ASTNode* node) { free(node); } + +char* str_ast_node(ASTNode* node) { + return ast_str[node->type]; +} diff --git a/src/parser.c b/src/parser.c index 11bb2fe..90d8dff 100644 --- a/src/parser.c +++ b/src/parser.c @@ -3,6 +3,12 @@ #include "parser.h" #include "token.h" +// Forward declaration of parsing functions +ASTNode* parse_base(Parser* parser); +ASTNode* parse_factor(Parser* parser); +ASTNode* parse_term(Parser* parser); +ASTNode* parse_expr(Parser* parser); + /** * Returns the next token with advancing the parser's position * @@ -59,12 +65,16 @@ int expect(Parser* parser, TokenType type){ return -1; } - return parser->tokens[parser->position].type == type ? 0 : -1; + Token* token = next(parser); + if (token == NULL) { + return type == EOF_ ? 0 : -1; + } + + return token->type == type ? 0 : -1; } /** - * TODO: WIP * Parse the `base` non-terminal. * * See the regex CFG on @@ -77,32 +87,96 @@ int expect(Parser* parser, TokenType type){ * * @return The root of the AST created by parsing the `base` non-terminal */ -ASTNode* parse_base(Parser* parser){ - return parser != NULL ? NULL : NULL; +ASTNode* parse_base(Parser* parser) { + if (parser == NULL) { + return NULL; + } + + Token* token = next(parser); + ASTNode* node = NULL; + + switch (token->type) { + case CHAR: + node = ast_node_create(CHAR_NODE); + node->extra.character = token->value; + break; + + case LPAREN:; + Token* next = peek(parser); + if (next != NULL && next->type == RPAREN) { + return NULL; + } + + node = parse_expr(parser); + + if (expect(parser, RPAREN) < 0) { + ast_node_free(node); + return NULL; + } + + break; + + default: + // Error + return NULL; + } + + return node; } /** - * TODO: WIP * Parse the `factor` non-terminal. * * See the regex CFG on * https://github.com/mkpro118/Regex-Engine/issues/6#issue-2337160940 * * The relevant production is reproduced below - * non terminal :: factor -> base op | factor op + * non terminal :: factor -> base op | epsilon * * @param parser The parser to operate on * * @return The root of the AST created by parsing the `factor` non-terminal */ -ASTNode* parse_factor(Parser* parser){ - return parser != NULL ? NULL : NULL; +ASTNode* parse_factor(Parser* parser) { + if (parser == NULL) { + return NULL; + } + + ASTNode* node = parse_base(parser); + + Token* token; + + while ((token = peek(parser))) { + ASTNode* parent = NULL; + + switch (token->type) { + case STAR: + parent = ast_node_create(STAR_NODE); + break; + + case PLUS: + parent = ast_node_create(PLUS_NODE); + break; + + case QUESTION: + parent = ast_node_create(QUESTION_NODE); + break; + + default: + return node; + } + + parent->child1 = node; + node = parent; + next(parser); + } + + return node; } /** - * TODO: WIP * Parse the `term` non-terminal. * * See the regex CFG on @@ -115,13 +189,45 @@ ASTNode* parse_factor(Parser* parser){ * * @return The root of the AST created by parsing the `term` non-terminal */ -ASTNode* parse_term(Parser* parser){ - return parser != NULL ? NULL : NULL; +ASTNode* parse_term(Parser* parser) { + if (parser == NULL) { + return NULL; + } + + ASTNode* left = parse_factor(parser); + + Token* token; + + while ((token = peek(parser))) { + if (token->type == OR || token->type == RPAREN) { + return left; + } + + ASTNode* right = parse_factor(parser); + if (right == NULL) { + ast_node_free(left); + return NULL; + } + + ASTNode* concat = ast_node_create(CONCAT_NODE); + if (concat == NULL) { + ast_node_free(left); + ast_node_free(right); + return NULL; + } + + + concat->child1 = left; + concat->extra.child2 = right; + + left = concat; + } + + return left; } /** - * TODO: WIP * Parse the `expr` non-terminal. * * See the regex CFG on @@ -135,7 +241,40 @@ ASTNode* parse_term(Parser* parser){ * @return The root of the AST created by parsing the `expr` non-terminal */ ASTNode* parse_expr(Parser* parser){ - return parser != NULL ? NULL : NULL; + if (parser == NULL) { + return NULL; + } + + + ASTNode* left = parse_term(parser); + + Token* token; + + while ((token = peek(parser)) && token->type == OR) { + // This is to consume the OR token + next(parser); + + ASTNode* right = parse_term(parser); + if (right == NULL) { + ast_node_free(left); + return NULL; + } + + ASTNode* or = ast_node_create(OR_NODE); + if (or == NULL) { + ast_node_free(left); + ast_node_free(right); + return NULL; + } + + + or->child1 = left; + or->extra.child2 = right; + + left = or; + } + + return left; } // Create a heap allocated parser from the given Lexer @@ -176,5 +315,5 @@ void parser_free(Parser* parser) { // Create a AST by parsing the tokens in the given parser ASTNode* parse(Parser* parser) { - return parser != NULL ? NULL : NULL; + return parse_expr(parser); } diff --git a/tests/test_parser.c b/tests/test_parser.c index 8c88960..c41a9e0 100644 --- a/tests/test_parser.c +++ b/tests/test_parser.c @@ -151,12 +151,12 @@ int test_expect(void) { int ret = expect(&parser, expected); assert_equals_int(ret, 0); - assert_equals_int(parser.position, 0); + assert_equals_int(parser.position, 1); // Something that was not expected - ret = expect(&parser, CHAR); + ret = expect(&parser, OR); assert_equals_int(ret, -1); - assert_equals_int(parser.position, 0); + assert_equals_int(parser.position, 2); // Bad input ret = expect(NULL, CHAR); @@ -331,7 +331,7 @@ int test_parse_term(void) { assert_equals_int(node->child1->extra.character, 'a'); // Check right child - assert_equals_int(node->extra.child2, CHAR_NODE); + assert_equals_int(node->extra.child2->type, CHAR_NODE); assert_equals_int(node->extra.child2->extra.character, 'b'); ast_node_free(node); @@ -362,15 +362,6 @@ int test_parse_term(void) { // Case: RPAREN token TEST_CASE(")") - // Case: STAR token - TEST_CASE("*") - - // Case: PLUS token - TEST_CASE("+") - - // Case: QUESTION token - TEST_CASE("?") - // Case: OR token TEST_CASE("|") @@ -421,7 +412,7 @@ int test_parse_expr(void) { ASTNode* node = parse_expr(&parser); assert_is_not_null(node); - assert_equals_int(parser.position, 3); + assert_equals_int(parser.position, 4); assert_equals_int(node->type, OR_NODE); @@ -463,17 +454,21 @@ int test_parse(void) { /* This will result in an AST that looks like * - * Level 0: OR - * | - * +---------+---------+ - * | | - * Level 1: CONCAT OR - * | | - * +----+----+ +-----+----+ - * | | | | - * Level 2: CHAR(a) QUESTION STAR PLUS - * | | | - * Level 3: CHAR(b) CHAR(c) CHAR(d) + * Level 0: OR + * | + * +--------+--------+ + * | | + * Level 1: OR PLUS + * | | + * +------+-------+ | + * | | | + * Level 2: CONCAT STAR CHAR(d) + * | | + * +----+----+ | + * | | | + * Level 3: CHAR(a) QUESTION CHAR(c) + * | + * Level 4: CHAR(b) */ // Manually create the AST @@ -499,13 +494,13 @@ int test_parse(void) { ops[3].child1 = &chars[0]; // Character('a') node ops[3].extra.child2 = &ops[0]; // Question("b?") node - // Setup the second or rightmost OR Node (for "c*|d+") - ops[4].child1 = &ops[1]; // Star("c*") node - ops[4].extra.child2 = &ops[2]; // Plus("d+") node + // Setup the first or leftmost OR Node (for "(ab?)|c*") + ops[4].child1 = &ops[3]; // Concat("ab?") node + ops[4].extra.child2 = &ops[1]; // Star("c*") node - // Setup the first or leftmost OR Node - ops[5].child1 = &ops[3]; // Concat("ab?") node - ops[5].extra.child2 = &ops[4]; // Or("c*|d+") node + // Setup the second or rightmost OR Node + ops[5].child1 = &ops[4]; // Or("(ab?)|c*") node + ops[5].extra.child2 = &ops[2]; // Plus("d+") node // Start test CREATE_PARSER; @@ -522,43 +517,43 @@ int test_parse(void) { assert_is_not_null(root->child1); assert_is_not_null(root->extra.child2); - // Level 1: Check root node's left child, i.e Concat node - ASTNode* concat = root->child1; + // Level 1: Check root node's left child, i.e OR node + ASTNode* or = root->child1; + assert_equals_int(or->type, ops[4].type); + + // OR's children should not be null + assert_is_not_null(or->child1); + assert_is_not_null(or->extra.child2); + + // Level 2: Check OR's left child, i.e. Concat node + ASTNode* concat = or->child1; assert_equals_int(concat->type, ops[3].type); // Concat's children should not be null assert_is_not_null(concat->child1); assert_is_not_null(concat->extra.child2); - // Level 2: Check concat's left child, i.e. Char(a) node + // Level 3: Check concat's left child, i.e. Char(a) node ASTNode* char_a = concat->child1; assert_equals_int(char_a->type, chars[0].type); assert_equals_int(char_a->extra.character, chars[0].extra.character); - // Level 2: Check concat's right child, i.e. Question node + // Level 3: Check concat's right child, i.e. Question node ASTNode* question = concat->extra.child2; assert_equals_int(question->type, ops[0].type); // Question's child should not be null assert_is_not_null(question->child1); - // Level 3: Check question's child, i.e Char(b) node + // Level 4: Check question's child, i.e Char(b) node ASTNode* char_b = question->child1; assert_equals_int(char_b->type, chars[1].type); assert_equals_int(char_b->extra.character, chars[1].extra.character); - // (Go back up the tree to Level 0 ...) - - // Level 1: Check the root node's right child, i.e. Or("c*|d+") node - ASTNode* or = root->extra.child2; - assert_equals_int(or->type, ops[4].type); - - // Or's children should not be null - assert_is_not_null(or->child1); - assert_is_not_null(or->extra.child2); + // (Go back up the tree to Level 1 ...) - // Level 2: Check Or's left child, i.e. Star("c*") node - ASTNode* star = or->child1; + // Level 2: Check OR's right child, i.e. Star("c*") node + ASTNode* star = or->extra.child2; assert_equals_int(star->type, ops[1].type); // Star's child should not be null @@ -569,16 +564,16 @@ int test_parse(void) { assert_equals_int(char_c->type, chars[2].type); assert_equals_int(char_c->extra.character, chars[2].extra.character); - // (Go back up the tree to Level 1 ...) + // (Go back up the tree to Level 0 ...) - // Level 2: Check Or's right child, i.e. Plus("d+") node - ASTNode* plus = or->extra.child2; + // Level 1: Check the root node's right child, i.e. Plus("d+") node + ASTNode* plus = root->extra.child2; assert_equals_int(plus->type, ops[2].type); // Plus's child should not be null assert_is_not_null(plus->child1); - // Level 3: Check Plus's child, i.e. Char(d) node + // Level 2: Check Plus's child, i.e. Char(d) node ASTNode* char_d = plus->child1; assert_equals_int(char_d->type, chars[3].type); assert_equals_int(char_d->extra.character, chars[3].extra.character); @@ -605,16 +600,6 @@ Test tests[] = { }; -int main() { - // Run selective tests - char* argv[] = { - "--run", - "test_parser_create", - "test_parser_init", - "test_peek", - "test_next", - "test_expect" - }; - int argc = sizeof(argv) / sizeof(char*); - return default_main(argv, argc); +int main(int argc, char* argv[]) { + return default_main(&argv[1], argc - 1); } diff --git a/tests/test_parser_regex.c b/tests/test_parser_regex.c new file mode 100644 index 0000000..0185f3e --- /dev/null +++ b/tests/test_parser_regex.c @@ -0,0 +1,274 @@ +#include "testlib/asserts.h" +#include "testlib/tests.h" +#include "lexer.h" +#include "parser.h" +#include "ast.h" + +#define ALLOCATE_PARSER {\ +lexer_init(&lexer, regex);\ +parser_init(&parser, &lexer);\ +}\ +ASTNode* result = parse(&parser) + +#define DEALLOCATE_PARSER do {\ +ast_node_free(result);\ +parser_free(&parser);\ +lexer_free(&lexer);\ +} while(0) + +int compare_ast(ASTNode* node1, ASTNode* node2) { + if (node1 == NULL && node2 == NULL) return 1; + if (node1 == NULL || node2 == NULL) return 0; + if (node1->type != node2->type) return 0; + + switch (node1->type) { + case CHAR_NODE: + return node1->extra.character == node2->extra.character; + case OR_NODE: + case CONCAT_NODE: + return compare_ast(node1->child1, node2->child1) && + compare_ast(node1->extra.child2, node2->extra.child2); + default: + return compare_ast(node1->child1, node2->child1); + } +} + +Lexer lexer; +Parser parser; + +// Test cases + +int test_simple_char() { + TEST_BEGIN; + + char* regex = "a"; + ALLOCATE_PARSER; + + ASTNode expected = {.type = CHAR_NODE, .extra = {.character = 'a'}}; + + int success = compare_ast(result, &expected); + assert_equals_int(success, 1); + + DEALLOCATE_PARSER; + TEST_END; +} + +int test_simple_concatenation() { + TEST_BEGIN; + + char* regex = "ab"; + ALLOCATE_PARSER; + + ASTNode expected = { + .type = CONCAT_NODE, + .child1 = &(ASTNode){.type = CHAR_NODE, .extra = {.character = 'a'}}, + .extra.child2 = &(ASTNode){.type = CHAR_NODE, .extra = {.character = 'b'}} + }; + + int success = compare_ast(result, &expected); + assert_equals_int(success, 1); + + DEALLOCATE_PARSER; + TEST_END; +} + +int test_simple_alternation() { + TEST_BEGIN; + + char* regex = "a|b"; + ALLOCATE_PARSER; + + ASTNode expected = { + .type = OR_NODE, + .child1 = &(ASTNode){.type = CHAR_NODE, .extra = {.character = 'a'}}, + .extra.child2 = &(ASTNode){.type = CHAR_NODE, .extra = {.character = 'b'}} + }; + + int success = compare_ast(result, &expected); + assert_equals_int(success, 1); + + DEALLOCATE_PARSER; + TEST_END; +} + +int test_complex_expression() { + TEST_BEGIN; + + char* regex = "(a|b)*c+d?"; + ALLOCATE_PARSER; + + ASTNode expected = { + .type = CONCAT_NODE, + .child1 = &(ASTNode){ + .type = CONCAT_NODE, + .child1 = &(ASTNode){ + .type = STAR_NODE, + .child1 = &(ASTNode){ + .type = OR_NODE, + .child1 = &(ASTNode){.type = CHAR_NODE, .extra = {.character = 'a'}}, + .extra.child2 = &(ASTNode){.type = CHAR_NODE, .extra = {.character = 'b'}} + }, + .extra = {0}, + }, + .extra.child2 = &(ASTNode){ + .type = PLUS_NODE, + .child1 = &(ASTNode){.type = CHAR_NODE, .extra = {.character = 'c'}}, + .extra = {0} + } + }, + .extra.child2 = &(ASTNode){ + .type = QUESTION_NODE, + .child1 = &(ASTNode){.type = CHAR_NODE, .extra = {.character = 'd'}}, + .extra = {0} + } + }; + + int success = compare_ast(result, &expected); + assert_equals_int(success, 1); + + DEALLOCATE_PARSER; + TEST_END; +} + +int test_nested_parentheses() { + TEST_BEGIN; + + char* regex = "((a|b)c|(d|e)f)g"; + ALLOCATE_PARSER; + + ASTNode expected = { + .type = CONCAT_NODE, + .child1 = &(ASTNode){ + .type = OR_NODE, + .child1 = &(ASTNode){ + .type = CONCAT_NODE, + .child1 = &(ASTNode){ + .type = OR_NODE, + .child1 = &(ASTNode){.type = CHAR_NODE, .extra = {.character = 'a'}}, + .extra.child2 = &(ASTNode){.type = CHAR_NODE, .extra = {.character = 'b'}} + }, + .extra.child2 = &(ASTNode){.type = CHAR_NODE, .extra = {.character = 'c'}} + }, + .extra.child2 = &(ASTNode){ + .type = CONCAT_NODE, + .child1 = &(ASTNode){ + .type = OR_NODE, + .child1 = &(ASTNode){.type = CHAR_NODE, .extra = {.character = 'd'}}, + .extra.child2 = &(ASTNode){.type = CHAR_NODE, .extra = {.character = 'e'}} + }, + .extra.child2 = &(ASTNode){.type = CHAR_NODE, .extra = {.character = 'f'}} + } + }, + .extra.child2 = &(ASTNode){.type = CHAR_NODE, .extra = {.character = 'g'}} + }; + + int success = compare_ast(result, &expected); + assert_equals_int(success, 1); + + DEALLOCATE_PARSER; + TEST_END; +} + +int test_multiple_alternations() { + TEST_BEGIN; + + char* regex = "a|b|c|d|e"; + ALLOCATE_PARSER; + + ASTNode expected = { + .type = OR_NODE, + .child1 = &(ASTNode){ + .type = OR_NODE, + .child1 = &(ASTNode){ + .type = OR_NODE, + .child1 = &(ASTNode){ + .type = OR_NODE, + .child1 = &(ASTNode){.type = CHAR_NODE, .extra = {.character = 'a'}}, + .extra.child2 = &(ASTNode){.type = CHAR_NODE, .extra = {.character = 'b'}} + }, + .extra.child2 = &(ASTNode){.type = CHAR_NODE, .extra = {.character = 'c'}} + }, + .extra.child2 = &(ASTNode){.type = CHAR_NODE, .extra = {.character = 'd'}} + }, + .extra.child2 = &(ASTNode){.type = CHAR_NODE, .extra = {.character = 'e'}} + }; + + int success = compare_ast(result, &expected); + assert_equals_int(success, 1); + + DEALLOCATE_PARSER; + TEST_END; +} + +int test_empty_parentheses() { + TEST_BEGIN; + + char* regex = "a()b"; + ALLOCATE_PARSER; + + assert_is_null(result); + + DEALLOCATE_PARSER; + TEST_END; +} + +int test_unmatched_parentheses() { + TEST_BEGIN; + + char* regex = "a(b"; + ALLOCATE_PARSER; + + assert_is_null(result); + + DEALLOCATE_PARSER; + TEST_END; +} + +int test_consecutive_operators() { + TEST_BEGIN; + + char* regex = "a**+?"; + ALLOCATE_PARSER; + + ASTNode expected = { + .type = QUESTION_NODE, + .child1 = &(ASTNode){ + .type = PLUS_NODE, + .child1 = &(ASTNode){ + .type = STAR_NODE, + .child1 = &(ASTNode){ + .type = STAR_NODE, + .child1 = &(ASTNode){.type = CHAR_NODE, .extra = {.character = 'a'}}, + .extra={0} + }, + .extra={0} + }, + .extra={0} + }, + .extra={0} + }; + + int success = compare_ast(result, &expected); + assert_equals_int(success, 1); + + DEALLOCATE_PARSER; + TEST_END; +} + +Test tests[] = { + {.name="test_simple_char", .func=test_simple_char}, + {.name="test_simple_concatenation", .func=test_simple_concatenation}, + {.name="test_simple_alternation", .func=test_simple_alternation}, + {.name="test_complex_expression", .func=test_complex_expression}, + {.name="test_nested_parentheses", .func=test_nested_parentheses}, + {.name="test_multiple_alternations", .func=test_multiple_alternations}, + {.name="test_empty_parentheses", .func=test_empty_parentheses}, + {.name="test_unmatched_parentheses", .func=test_unmatched_parentheses}, + {.name="test_consecutive_operators", .func=test_consecutive_operators}, + {.name=NULL}, + {.name=NULL}, +}; + +int main(int argc, char* argv[]) { + return default_main(&argv[1], argc - 1); +}