diff options
author | Bad Diode <bd@badd10de.dev> | 2021-10-29 19:11:40 +0200 |
---|---|---|
committer | Bad Diode <bd@badd10de.dev> | 2021-10-29 19:11:40 +0200 |
commit | 5ed73b695e6b463149ab0c9ae3eccb26a4ec5807 (patch) | |
tree | 01aa089934d1b49fe515fe86fffca01c471c69e9 | |
parent | e73a4c16a2269cdb2f5e7d66fb9839e4c44e14de (diff) | |
download | bdl-5ed73b695e6b463149ab0c9ae3eccb26a4ec5807.tar.gz bdl-5ed73b695e6b463149ab0c9ae3eccb26a4ec5807.zip |
Add parser for tokens->ast conversion
-rwxr-xr-x | src/lexer.c | 37 | ||||
-rwxr-xr-x | src/lexer.h | 9 | ||||
-rwxr-xr-x | src/main.c | 23 | ||||
-rw-r--r-- | src/parser.c | 257 | ||||
-rwxr-xr-x | src/parser.h | 84 |
5 files changed, 379 insertions, 31 deletions
diff --git a/src/lexer.c b/src/lexer.c index 6a417e4..ac93a5c 100755 --- a/src/lexer.c +++ b/src/lexer.c | |||
@@ -16,7 +16,7 @@ static const char* token_str[] = { | |||
16 | 16 | ||
17 | void | 17 | void |
18 | print_token(Token tok) { | 18 | print_token(Token tok) { |
19 | printf("[%4ld:%-4ld] ", tok.line, tok.column); | 19 | printf("[%4ld:%-4ld] ", tok.line, tok.col); |
20 | printf("%s", token_str[tok.type]); | 20 | printf("%s", token_str[tok.type]); |
21 | switch (tok.type) { | 21 | switch (tok.type) { |
22 | case TOKEN_FIXNUM: { | 22 | case TOKEN_FIXNUM: { |
@@ -128,11 +128,10 @@ find_primitive_type(const StringView value) { | |||
128 | return TOKEN_SYMBOL; | 128 | return TOKEN_SYMBOL; |
129 | } | 129 | } |
130 | 130 | ||
131 | Tokens | 131 | Token * |
132 | tokenize(const StringView *sv) { | 132 | tokenize(const StringView *sv, Errors *errors) { |
133 | Tokens tokens = {0}; | 133 | Token *tokens = NULL; |
134 | tokens.tokens = NULL; | 134 | array_init(tokens, 1); |
135 | array_init(tokens.tokens, 1); | ||
136 | Scanner scanner = (Scanner){ | 135 | Scanner scanner = (Scanner){ |
137 | .current = *sv, | 136 | .current = *sv, |
138 | .line_number = 1, | 137 | .line_number = 1, |
@@ -163,7 +162,7 @@ tokenize(const StringView *sv) { | |||
163 | n++; | 162 | n++; |
164 | } | 163 | } |
165 | if (!found) { | 164 | if (!found) { |
166 | error_push(&tokens.errors, (Error){ | 165 | error_push(errors, (Error){ |
167 | .type = ERR_TYPE_LEXER, | 166 | .type = ERR_TYPE_LEXER, |
168 | .value = ERR_UNMATCHED_STRING, | 167 | .value = ERR_UNMATCHED_STRING, |
169 | .line = line, | 168 | .line = line, |
@@ -178,9 +177,9 @@ tokenize(const StringView *sv) { | |||
178 | }, | 177 | }, |
179 | .type = TOKEN_STRING, | 178 | .type = TOKEN_STRING, |
180 | .line = line, | 179 | .line = line, |
181 | .column = col, | 180 | .col = col, |
182 | }; | 181 | }; |
183 | array_push(tokens.tokens, token); | 182 | array_push(tokens, token); |
184 | } break; | 183 | } break; |
185 | case '(': { | 184 | case '(': { |
186 | if (scan_peek(&scanner) == ')') { | 185 | if (scan_peek(&scanner) == ')') { |
@@ -188,25 +187,25 @@ tokenize(const StringView *sv) { | |||
188 | Token token = (Token){ | 187 | Token token = (Token){ |
189 | .type = TOKEN_NIL, | 188 | .type = TOKEN_NIL, |
190 | .line = line, | 189 | .line = line, |
191 | .column = col, | 190 | .col = col, |
192 | }; | 191 | }; |
193 | array_push(tokens.tokens, token); | 192 | array_push(tokens, token); |
194 | } else { | 193 | } else { |
195 | Token token = (Token){ | 194 | Token token = (Token){ |
196 | .type = TOKEN_LPAREN, | 195 | .type = TOKEN_LPAREN, |
197 | .line = line, | 196 | .line = line, |
198 | .column = col, | 197 | .col = col, |
199 | }; | 198 | }; |
200 | array_push(tokens.tokens, token); | 199 | array_push(tokens, token); |
201 | } | 200 | } |
202 | } break; | 201 | } break; |
203 | case ')': { | 202 | case ')': { |
204 | Token token = (Token){ | 203 | Token token = (Token){ |
205 | .type = TOKEN_RPAREN, | 204 | .type = TOKEN_RPAREN, |
206 | .line = line, | 205 | .line = line, |
207 | .column = col, | 206 | .col = col, |
208 | }; | 207 | }; |
209 | array_push(tokens.tokens, token); | 208 | array_push(tokens, token); |
210 | } break; | 209 | } break; |
211 | default: { | 210 | default: { |
212 | size_t n = 1; | 211 | size_t n = 1; |
@@ -224,10 +223,10 @@ tokenize(const StringView *sv) { | |||
224 | }, | 223 | }, |
225 | .type = TOKEN_SYMBOL, | 224 | .type = TOKEN_SYMBOL, |
226 | .line = line, | 225 | .line = line, |
227 | .column = col, | 226 | .col = col, |
228 | }; | 227 | }; |
229 | token.type = find_primitive_type(token.value); | 228 | token.type = find_primitive_type(token.value); |
230 | array_push(tokens.tokens, token); | 229 | array_push(tokens, token); |
231 | } break; | 230 | } break; |
232 | } | 231 | } |
233 | } | 232 | } |
@@ -236,9 +235,9 @@ tokenize(const StringView *sv) { | |||
236 | Token token = (Token){ | 235 | Token token = (Token){ |
237 | .type = TOKEN_EOF, | 236 | .type = TOKEN_EOF, |
238 | .line = scanner.line_number, | 237 | .line = scanner.line_number, |
239 | .column = 1, | 238 | .col = 1, |
240 | }; | 239 | }; |
241 | array_push(tokens.tokens, token); | 240 | array_push(tokens, token); |
242 | 241 | ||
243 | return tokens; | 242 | return tokens; |
244 | } | 243 | } |
diff --git a/src/lexer.h b/src/lexer.h index e56f5f2..43898bd 100755 --- a/src/lexer.h +++ b/src/lexer.h | |||
@@ -26,14 +26,9 @@ typedef struct Token { | |||
26 | TokenType type; | 26 | TokenType type; |
27 | StringView value; | 27 | StringView value; |
28 | size_t line; | 28 | size_t line; |
29 | size_t column; | 29 | size_t col; |
30 | } Token; | 30 | } Token; |
31 | 31 | ||
32 | typedef struct Tokens { | ||
33 | Token *tokens; | ||
34 | Errors errors; | ||
35 | } Tokens; | ||
36 | |||
37 | typedef struct Scanner { | 32 | typedef struct Scanner { |
38 | StringView current; | 33 | StringView current; |
39 | size_t line_number; | 34 | size_t line_number; |
@@ -62,6 +57,6 @@ bool is_delimiter(char c); | |||
62 | TokenType find_primitive_type(const StringView value); | 57 | TokenType find_primitive_type(const StringView value); |
63 | 58 | ||
64 | // Generate a list of tokens from the given string. | 59 | // Generate a list of tokens from the given string. |
65 | Tokens tokenize(const StringView *sv); | 60 | Token * tokenize(const StringView *sv, Errors *errors); |
66 | 61 | ||
67 | #endif // BDL_LEXER_H | 62 | #endif // BDL_LEXER_H |
@@ -7,6 +7,7 @@ | |||
7 | #include "string_view.c" | 7 | #include "string_view.c" |
8 | #include "errors.c" | 8 | #include "errors.c" |
9 | #include "lexer.c" | 9 | #include "lexer.c" |
10 | #include "parser.c" | ||
10 | 11 | ||
11 | void | 12 | void |
12 | init(void) { | 13 | init(void) { |
@@ -20,20 +21,32 @@ halt(void) { | |||
20 | 21 | ||
21 | void | 22 | void |
22 | process_source(const StringView *source, const char *file_name) { | 23 | process_source(const StringView *source, const char *file_name) { |
24 | Errors errors = {0}; | ||
25 | |||
23 | // Read tokens. | 26 | // Read tokens. |
24 | Tokens tokens = tokenize(source); | 27 | Token *tokens = tokenize(source, &errors); |
25 | if (tokens.errors.n != 0) { | 28 | if (errors.n != 0) { |
26 | report_errors(&tokens.errors, file_name); | 29 | report_errors(&errors, file_name); |
30 | array_free(tokens); | ||
31 | exit(EXIT_FAILURE); | ||
32 | } | ||
33 | |||
34 | // Parser. | ||
35 | Root *roots = parse(tokens, &errors); | ||
36 | if (errors.n != 0) { | ||
37 | report_errors(&errors, file_name); | ||
38 | free_roots(roots); | ||
39 | array_free(tokens); | ||
27 | exit(EXIT_FAILURE); | 40 | exit(EXIT_FAILURE); |
28 | } | 41 | } |
42 | array_free(tokens); | ||
29 | 43 | ||
30 | // TODO: Parser. | ||
31 | // TODO: Semantic analysis. | 44 | // TODO: Semantic analysis. |
32 | // TODO: Optimization. | 45 | // TODO: Optimization. |
33 | // TODO: Compilation. | 46 | // TODO: Compilation. |
34 | 47 | ||
35 | // Free resources. | 48 | // Free resources. |
36 | array_free(tokens.tokens); | 49 | free_roots(roots); |
37 | } | 50 | } |
38 | 51 | ||
39 | void | 52 | void |
diff --git a/src/parser.c b/src/parser.c new file mode 100644 index 0000000..2d70c9d --- /dev/null +++ b/src/parser.c | |||
@@ -0,0 +1,257 @@ | |||
1 | #include "parser.h" | ||
2 | #include "darray.h" | ||
3 | |||
4 | Token | ||
5 | peek_token(const Parser *parser) { | ||
6 | return parser->tokens[parser->current]; | ||
7 | } | ||
8 | |||
9 | Token | ||
10 | next_token(Parser *parser) { | ||
11 | return parser->tokens[parser->current++]; | ||
12 | } | ||
13 | |||
14 | Token | ||
15 | previous_token(Parser *parser) { | ||
16 | return parser->tokens[parser->current - 1]; | ||
17 | } | ||
18 | |||
19 | Token | ||
20 | rewind_token(Parser *parser) { | ||
21 | return parser->tokens[--parser->current]; | ||
22 | } | ||
23 | |||
24 | bool | ||
25 | has_next_token(const Parser *parser) { | ||
26 | return parser->current < array_size(parser->tokens) | ||
27 | && peek_token(parser).type != TOKEN_EOF; | ||
28 | } | ||
29 | |||
30 | Object * | ||
31 | parse_fixnum(Token tok) { | ||
32 | ssize_t num = 0; | ||
33 | ssize_t sign = 1; | ||
34 | for (size_t i = 0; i < tok.value.n; i++) { | ||
35 | char c = tok.value.start[i]; | ||
36 | if (c == '-') { | ||
37 | sign = -1; | ||
38 | continue; | ||
39 | } | ||
40 | num = num * 10 + (c - '0'); | ||
41 | } | ||
42 | Object *ret = object_alloc(tok, OBJ_TYPE_FIXNUM); | ||
43 | ret->fixnum = num * sign; | ||
44 | return ret; | ||
45 | } | ||
46 | |||
47 | Object * | ||
48 | parse_bool(Token tok) { | ||
49 | ObjectType type = tok.type == TOKEN_TRUE ? OBJ_TYPE_TRUE : OBJ_TYPE_FALSE; | ||
50 | Object *ret = object_alloc(tok, type); | ||
51 | return ret; | ||
52 | } | ||
53 | |||
54 | Object * | ||
55 | parse_string(Token tok) { | ||
56 | Object *ret = object_alloc(tok, OBJ_TYPE_STRING); | ||
57 | array_init(ret->text, tok.value.n); | ||
58 | array_insert(ret->text, tok.value.start, tok.value.n); | ||
59 | return ret; | ||
60 | } | ||
61 | |||
62 | Object * | ||
63 | parse_symbol(Token tok) { | ||
64 | Object *ret = object_alloc(tok, OBJ_TYPE_SYMBOL); | ||
65 | array_init(ret->text, tok.value.n); | ||
66 | array_insert(ret->text, tok.value.start, tok.value.n); | ||
67 | return ret; | ||
68 | } | ||
69 | |||
70 | Object * | ||
71 | parse_list(Parser *parser, Errors *errors) { | ||
72 | if (errors->n != 0) { | ||
73 | return NULL; | ||
74 | } | ||
75 | Token start = previous_token(parser); | ||
76 | Object *root = object_alloc(start, OBJ_TYPE_PAIR); | ||
77 | root->car = NULL; | ||
78 | root->cdr = NULL; | ||
79 | Object *current = root; | ||
80 | while (has_next_token(parser)) { | ||
81 | current->car = parse_tree(parser, errors); | ||
82 | if (errors->n != 0 || current->car == NULL) { | ||
83 | object_free(root); | ||
84 | return NULL; | ||
85 | } | ||
86 | |||
87 | Token tok = peek_token(parser); | ||
88 | if (tok.type == TOKEN_RPAREN) { | ||
89 | next_token(parser); | ||
90 | return root; | ||
91 | } | ||
92 | |||
93 | Object *next = object_alloc(start, OBJ_TYPE_PAIR); | ||
94 | next->car = NULL; | ||
95 | next->cdr = NULL; | ||
96 | current->cdr = next; | ||
97 | current = current->cdr; | ||
98 | } | ||
99 | object_free(root); | ||
100 | error_push(errors, (Error){ | ||
101 | .type = ERR_TYPE_PARSER, | ||
102 | .value = ERR_UNBALANCED_PAREN, | ||
103 | .line = start.line, | ||
104 | .col = start.col, | ||
105 | }); | ||
106 | return NULL; | ||
107 | } | ||
108 | |||
109 | Object * | ||
110 | parse_tree(Parser *parser, Errors *errors) { | ||
111 | Token tok = next_token(parser); | ||
112 | if (errors->n != 0) { | ||
113 | return NULL; | ||
114 | } | ||
115 | switch (tok.type) { | ||
116 | case TOKEN_FIXNUM: { | ||
117 | return parse_fixnum(tok); | ||
118 | } break; | ||
119 | case TOKEN_TRUE: | ||
120 | case TOKEN_FALSE: { | ||
121 | return parse_bool(tok); | ||
122 | } break; | ||
123 | case TOKEN_RPAREN: { | ||
124 | error_push(errors, (Error){ | ||
125 | .type = ERR_TYPE_PARSER, | ||
126 | .value = ERR_UNBALANCED_PAREN, | ||
127 | .line = tok.line, | ||
128 | .col = tok.col, | ||
129 | }); | ||
130 | return NULL; | ||
131 | } break; | ||
132 | case TOKEN_LPAREN: { | ||
133 | return parse_list(parser, errors); | ||
134 | } break; | ||
135 | case TOKEN_STRING: { | ||
136 | return parse_string(tok); | ||
137 | } break; | ||
138 | case TOKEN_SYMBOL: { | ||
139 | return parse_symbol(tok); | ||
140 | } break; | ||
141 | case TOKEN_NIL: { | ||
142 | return object_alloc(tok, OBJ_TYPE_NIL); | ||
143 | } break; | ||
144 | default: { | ||
145 | break; | ||
146 | } break; | ||
147 | } | ||
148 | error_push(errors, (Error){ | ||
149 | .type = ERR_TYPE_PARSER, | ||
150 | .value = ERR_EOF_REACHED, | ||
151 | .line = tok.line, | ||
152 | .col = tok.col, | ||
153 | }); | ||
154 | return NULL; | ||
155 | } | ||
156 | |||
157 | Root * | ||
158 | parse(Token *tokens, Errors *errors) { | ||
159 | Root *roots = NULL; | ||
160 | array_init(roots, 0); | ||
161 | Parser parser = { | ||
162 | .tokens = tokens, | ||
163 | .current = 0, | ||
164 | }; | ||
165 | while (has_next_token(&parser)) { | ||
166 | Object *root = parse_tree(&parser, errors); | ||
167 | OBJ_PRINT(root); | ||
168 | if (errors->n != 0) { | ||
169 | break; | ||
170 | } | ||
171 | array_push(roots, root); | ||
172 | } | ||
173 | return roots; | ||
174 | } | ||
175 | |||
176 | Object * | ||
177 | object_alloc(Token tok, ObjectType type) { | ||
178 | Object *node = malloc(sizeof(Object)); | ||
179 | node->line = tok.line; | ||
180 | node->col = tok.col; | ||
181 | node->type = type; | ||
182 | return node; | ||
183 | } | ||
184 | |||
185 | void | ||
186 | object_free(Object *node) { | ||
187 | if (node == NULL) { | ||
188 | return; | ||
189 | } | ||
190 | if (IS_STRING(node) || IS_SYMBOL(node)) { | ||
191 | array_free(node->text); | ||
192 | } | ||
193 | if (IS_PAIR(node)) { | ||
194 | object_free(node->car); | ||
195 | object_free(node->cdr); | ||
196 | } | ||
197 | free(node); | ||
198 | } | ||
199 | |||
200 | void | ||
201 | free_roots(Root *roots) { | ||
202 | if (roots != NULL) { | ||
203 | for (size_t i = 0; i < array_size(roots); i++) { | ||
204 | object_free(roots[i]); | ||
205 | } | ||
206 | array_free(roots); | ||
207 | } | ||
208 | } | ||
209 | |||
210 | void | ||
211 | display_pair(Object *obj) { | ||
212 | object_display(obj->car); | ||
213 | if (obj->cdr == NULL) { | ||
214 | return; | ||
215 | } | ||
216 | if (IS_PAIR(obj->cdr)) { | ||
217 | printf(" "); | ||
218 | display_pair(obj->cdr); | ||
219 | } else { | ||
220 | printf(" . "); | ||
221 | object_display(obj->cdr); | ||
222 | } | ||
223 | } | ||
224 | |||
225 | void | ||
226 | object_display(Object *obj) { | ||
227 | if (obj == NULL) { | ||
228 | printf("#{error}"); | ||
229 | return; | ||
230 | } | ||
231 | switch (obj->type) { | ||
232 | case OBJ_TYPE_FIXNUM: { | ||
233 | printf("%zd", obj->fixnum); | ||
234 | } break; | ||
235 | case OBJ_TYPE_TRUE: { | ||
236 | printf("true"); | ||
237 | } break; | ||
238 | case OBJ_TYPE_FALSE: { | ||
239 | printf("false"); | ||
240 | } break; | ||
241 | case OBJ_TYPE_NIL: { | ||
242 | printf("()"); | ||
243 | } break; | ||
244 | case OBJ_TYPE_STRING: { | ||
245 | printf("\"%.*s\"", (int)array_size(obj->text), obj->text); | ||
246 | } break; | ||
247 | case OBJ_TYPE_SYMBOL: { | ||
248 | printf(":%.*s", (int)array_size(obj->text), obj->text); | ||
249 | } break; | ||
250 | case OBJ_TYPE_PAIR: { | ||
251 | printf("("); | ||
252 | display_pair(obj); | ||
253 | printf(")"); | ||
254 | } break; | ||
255 | } | ||
256 | return; | ||
257 | } | ||
diff --git a/src/parser.h b/src/parser.h new file mode 100755 index 0000000..ee5febe --- /dev/null +++ b/src/parser.h | |||
@@ -0,0 +1,84 @@ | |||
1 | #ifndef BDL_PARSER_H | ||
2 | #define BDL_PARSER_H | ||
3 | |||
4 | #include "lexer.h" | ||
5 | |||
6 | typedef enum ObjectType { | ||
7 | OBJ_TYPE_NIL, | ||
8 | OBJ_TYPE_TRUE, | ||
9 | OBJ_TYPE_FALSE, | ||
10 | OBJ_TYPE_FIXNUM, | ||
11 | OBJ_TYPE_SYMBOL, | ||
12 | OBJ_TYPE_STRING, | ||
13 | OBJ_TYPE_PAIR, | ||
14 | } ObjectType; | ||
15 | |||
16 | typedef struct Object { | ||
17 | ObjectType type; | ||
18 | union { | ||
19 | // OBJ_TYPE_FIXNUM | ||
20 | ssize_t fixnum; | ||
21 | |||
22 | // OBJ_TYPE_STRING | ||
23 | // OBJ_TYPE_SYMBOL | ||
24 | char *text; | ||
25 | |||
26 | // OBJ_TYPE_PAIR | ||
27 | struct { | ||
28 | struct Object *car; | ||
29 | struct Object *cdr; | ||
30 | }; | ||
31 | }; | ||
32 | |||
33 | size_t line; | ||
34 | size_t col; | ||
35 | } Object; | ||
36 | |||
37 | typedef struct Parser { | ||
38 | Token *tokens; | ||
39 | size_t current; | ||
40 | } Parser; | ||
41 | |||
42 | typedef Object* Root; | ||
43 | |||
44 | // Mimics the functionality in the Scanner functions, but for tokens. | ||
45 | Token next_token(Parser *parser); | ||
46 | Token previous_token(Parser *parser); | ||
47 | Token rewind_token(Parser *parser); | ||
48 | Token peek_token(const Parser *parser); | ||
49 | bool has_next_token(const Parser *parser); | ||
50 | |||
51 | // Parsing operations. | ||
52 | Object * parse_tree(Parser *parser, Errors *errors); | ||
53 | Object * parse_symbol(Token tok); | ||
54 | Object * parse_string(Token tok); | ||
55 | Object * parse_bool(Token tok); | ||
56 | Object * parse_fixnum(Token tok); | ||
57 | Object * parse_list(Parser *parser, Errors *errors); | ||
58 | Root * parse(Token *tokens, Errors *errors); | ||
59 | |||
60 | // Object operations. | ||
61 | void object_display(Object *obj); | ||
62 | |||
63 | // Manage resources. | ||
64 | Object * object_alloc(Token tok, ObjectType type); | ||
65 | void object_free(Object *node); | ||
66 | void free_roots(Root *roots); | ||
67 | |||
68 | // | ||
69 | // Helper macros. | ||
70 | // | ||
71 | |||
72 | // Type checking. | ||
73 | #define IS_NIL(VAL) ((VAL)->type == OBJ_TYPE_NIL) | ||
74 | #define IS_TRUE(VAL) ((VAL)->type != OBJ_TYPE_FALSE) | ||
75 | #define IS_FALSE(VAL) ((VAL)->type == OBJ_TYPE_FALSE) | ||
76 | #define IS_FIXNUM(VAL) ((VAL)->type == OBJ_TYPE_FIXNUM) | ||
77 | #define IS_STRING(VAL) ((VAL)->type == OBJ_TYPE_STRING) | ||
78 | #define IS_SYMBOL(VAL) ((VAL)->type == OBJ_TYPE_SYMBOL) | ||
79 | #define IS_PAIR(VAL) ((VAL)->type == OBJ_TYPE_PAIR) | ||
80 | |||
81 | // Debug. | ||
82 | #define OBJ_PRINT(OBJ) object_display(OBJ); printf("\n"); | ||
83 | |||
84 | #endif // BDL_PARSER_H | ||