diff options
author | Bad Diode <bd@badd10de.dev> | 2021-10-09 11:45:54 +0200 |
---|---|---|
committer | Bad Diode <bd@badd10de.dev> | 2021-10-09 11:45:54 +0200 |
commit | aa5db9c6d13f353cfb839f732eb81d02d74fc1b7 (patch) | |
tree | d3e352e3e49e70804d74e36ab54008fc4ef3a65c /src | |
parent | 33c9c0cd69f6a3f5954222704755771f03c2ea04 (diff) | |
download | bdl-aa5db9c6d13f353cfb839f732eb81d02d74fc1b7.tar.gz bdl-aa5db9c6d13f353cfb839f732eb81d02d74fc1b7.zip |
Refactor tokenizer/parser to assign/use token types
Diffstat (limited to 'src')
-rwxr-xr-x | src/bootstrap/main.c | 294 |
1 files changed, 170 insertions, 124 deletions
diff --git a/src/bootstrap/main.c b/src/bootstrap/main.c index 3c5663a..65a61f4 100755 --- a/src/bootstrap/main.c +++ b/src/bootstrap/main.c | |||
@@ -60,21 +60,65 @@ read_line(void) { | |||
60 | return (StringView){.start = (char *)&readline_buf, .n = n}; | 60 | return (StringView){.start = (char *)&readline_buf, .n = n}; |
61 | } | 61 | } |
62 | 62 | ||
63 | typedef enum TokenType { | ||
64 | TOKEN_UNKNOWN = 0, | ||
65 | TOKEN_LPAREN, | ||
66 | TOKEN_RPAREN, | ||
67 | TOKEN_FIXNUM, | ||
68 | TOKEN_SYMBOL, | ||
69 | TOKEN_BOOL, | ||
70 | TOKEN_STRING, | ||
71 | } TokenType; | ||
72 | |||
73 | typedef struct Token { | ||
74 | TokenType type; | ||
75 | StringView value; | ||
76 | } Token; | ||
77 | |||
63 | typedef struct Tokens { | 78 | typedef struct Tokens { |
64 | StringView *start; | 79 | Token *start; |
65 | size_t n; | 80 | size_t n; |
66 | } Tokens; | 81 | } Tokens; |
67 | 82 | ||
83 | #define TRUE_TOKEN (StringView){"true", 4} | ||
84 | #define FALSE_TOKEN (StringView){"false", 5} | ||
85 | #define LPAREN_TOKEN (StringView){"(", 1} | ||
86 | #define RPAREN_TOKEN (StringView){")", 1} | ||
87 | |||
88 | TokenType | ||
89 | find_token_type(StringView value) { | ||
90 | bool is_fixnum = true; | ||
91 | for (size_t i = 0; i < value.n; i++) { | ||
92 | char c = value.start[i]; | ||
93 | if (i == 0 && c == '-' && value.n > 1) { | ||
94 | continue; | ||
95 | } | ||
96 | if (!isdigit(c)) { | ||
97 | is_fixnum = false; | ||
98 | break; | ||
99 | } | ||
100 | } | ||
101 | if (is_fixnum) { | ||
102 | return TOKEN_FIXNUM; | ||
103 | } | ||
104 | |||
105 | if (sv_equal(value, TRUE_TOKEN) || sv_equal(value, FALSE_TOKEN)) { | ||
106 | return TOKEN_BOOL; | ||
107 | } | ||
108 | |||
109 | return TOKEN_SYMBOL; | ||
110 | } | ||
111 | |||
68 | Tokens | 112 | Tokens |
69 | tokenize(StringView sv) { | 113 | tokenize(StringView sv) { |
70 | // NOTE: Not allocating any memory for now, but we are limited by a maximum | 114 | // NOTE: Not allocating any memory for now, but we are limited by a maximum |
71 | // number of tokens we can process. | 115 | // number of tokens we can process. |
72 | #define TOKENS_BUF_SIZE 1024 | 116 | #define TOKENS_BUF_SIZE 1024 |
73 | static StringView tokens_buf[TOKENS_BUF_SIZE]; | 117 | static Token tokens_buf[TOKENS_BUF_SIZE]; |
74 | 118 | ||
75 | // Clear buffer. | 119 | // Clear buffer. |
76 | for (size_t i = 0; i < TOKENS_BUF_SIZE; i++) { | 120 | for (size_t i = 0; i < TOKENS_BUF_SIZE; i++) { |
77 | tokens_buf[i] = (StringView){0}; | 121 | tokens_buf[i] = (Token){0}; |
78 | } | 122 | } |
79 | 123 | ||
80 | size_t n = 0; | 124 | size_t n = 0; |
@@ -88,11 +132,15 @@ tokenize(StringView sv) { | |||
88 | case '\t': | 132 | case '\t': |
89 | case '\v': { | 133 | case '\v': { |
90 | if (token_n != 0) { | 134 | if (token_n != 0) { |
91 | // Push token. | 135 | Token token = (Token){ |
92 | tokens_buf[n++] = (StringView){ | 136 | .type = TOKEN_UNKNOWN, |
93 | .start = &sv.start[i - token_n], | 137 | .value = (StringView){ |
94 | .n = token_n, | 138 | .start = &sv.start[i - token_n], |
139 | .n = token_n, | ||
140 | } | ||
95 | }; | 141 | }; |
142 | token.type = find_token_type(token.value); | ||
143 | tokens_buf[n++] = token; | ||
96 | token_n = 0; | 144 | token_n = 0; |
97 | } | 145 | } |
98 | continue; | 146 | continue; |
@@ -117,19 +165,14 @@ tokenize(StringView sv) { | |||
117 | string_end++; | 165 | string_end++; |
118 | } | 166 | } |
119 | 167 | ||
120 | // Push string token. | 168 | Token token = (Token){ |
121 | tokens_buf[n++] = (StringView){ | 169 | .type = TOKEN_STRING, |
122 | .start = &sv.start[string_start - 1], | 170 | .value = (StringView){ |
123 | .n = 1, | 171 | .start = &sv.start[string_start], |
124 | }; | 172 | .n = string_end - string_start, |
125 | tokens_buf[n++] = (StringView){ | 173 | } |
126 | .start = &sv.start[string_start], | ||
127 | .n = string_end - string_start, | ||
128 | }; | ||
129 | tokens_buf[n++] = (StringView){ | ||
130 | .start = &sv.start[string_end], | ||
131 | .n = 1, | ||
132 | }; | 174 | }; |
175 | tokens_buf[n++] = token; | ||
133 | token_n = 0; | 176 | token_n = 0; |
134 | i += string_end - string_start + 1; | 177 | i += string_end - string_start + 1; |
135 | } break; | 178 | } break; |
@@ -146,11 +189,12 @@ tokenize(StringView sv) { | |||
146 | fprintf(stderr, "error: lparen delimiter within symbol name\n"); | 189 | fprintf(stderr, "error: lparen delimiter within symbol name\n"); |
147 | return (Tokens){0}; | 190 | return (Tokens){0}; |
148 | } | 191 | } |
149 | // Push paren token. | 192 | |
150 | tokens_buf[n++] = (StringView){ | 193 | Token token = (Token){ |
151 | .start = &sv.start[i], | 194 | .type = TOKEN_LPAREN, |
152 | .n = 1, | 195 | .value = LPAREN_TOKEN, |
153 | }; | 196 | }; |
197 | tokens_buf[n++] = token; | ||
154 | } break; | 198 | } break; |
155 | case ')': { | 199 | case ')': { |
156 | if ((i + 1) < sv.n) { | 200 | if ((i + 1) < sv.n) { |
@@ -163,20 +207,27 @@ tokenize(StringView sv) { | |||
163 | 207 | ||
164 | if (token_n != 0) { | 208 | if (token_n != 0) { |
165 | // Push previous token. | 209 | // Push previous token. |
166 | tokens_buf[n++] = (StringView){ | 210 | Token token = (Token){ |
167 | .start = &sv.start[i - token_n], | 211 | .type = TOKEN_UNKNOWN, |
168 | .n = token_n, | 212 | .value = (StringView){ |
213 | .start = &sv.start[i - token_n], | ||
214 | .n = token_n, | ||
215 | } | ||
169 | }; | 216 | }; |
217 | token.type = find_token_type(token.value); | ||
218 | tokens_buf[n++] = token; | ||
170 | token_n = 0; | 219 | token_n = 0; |
171 | } | 220 | } |
172 | 221 | ||
173 | // Push paren token. | 222 | Token token = (Token){ |
174 | tokens_buf[n++] = (StringView){ | 223 | .type = TOKEN_RPAREN, |
175 | .start = &sv.start[i], | 224 | .value = RPAREN_TOKEN, |
176 | .n = 1, | ||
177 | }; | 225 | }; |
226 | tokens_buf[n++] = token; | ||
227 | } break; | ||
228 | case EOF: { | ||
229 | break; | ||
178 | } break; | 230 | } break; |
179 | // TODO: Handle double quotes and escaped quotes. | ||
180 | default: { | 231 | default: { |
181 | token_n++; | 232 | token_n++; |
182 | } break; | 233 | } break; |
@@ -184,21 +235,26 @@ tokenize(StringView sv) { | |||
184 | } | 235 | } |
185 | if (token_n != 0) { | 236 | if (token_n != 0) { |
186 | // End of line encountered. | 237 | // End of line encountered. |
187 | tokens_buf[n++] = (StringView){ | 238 | Token token = (Token){ |
188 | .start = &sv.start[sv.n - token_n], | 239 | .type = TOKEN_UNKNOWN, |
189 | .n = token_n, | 240 | .value = (StringView){ |
241 | .start = &sv.start[sv.n - token_n], | ||
242 | .n = token_n, | ||
243 | } | ||
190 | }; | 244 | }; |
245 | token.type = find_token_type(token.value); | ||
246 | tokens_buf[n++] = token; | ||
191 | } | 247 | } |
192 | 248 | ||
193 | return (Tokens){.start = (StringView *)&tokens_buf, .n = n}; | 249 | return (Tokens){.start = (Token *)&tokens_buf, .n = n}; |
194 | } | 250 | } |
195 | 251 | ||
196 | StringView * | 252 | Token * |
197 | consume_token(Tokens *tokens) { | 253 | consume_token(Tokens *tokens) { |
198 | if (tokens->n == 0) { | 254 | if (tokens->n == 0) { |
199 | return NULL; | 255 | return NULL; |
200 | } | 256 | } |
201 | StringView *ret = tokens->start; | 257 | Token *ret = tokens->start; |
202 | tokens->start = &tokens->start[1]; | 258 | tokens->start = &tokens->start[1]; |
203 | tokens->n--; | 259 | tokens->n--; |
204 | return ret; | 260 | return ret; |
@@ -250,7 +306,9 @@ typedef struct Object { | |||
250 | // Singletons. | 306 | // Singletons. |
251 | // | 307 | // |
252 | 308 | ||
253 | Object *empty_list; | 309 | Object *obj_nil; |
310 | Object *obj_true; | ||
311 | Object *obj_false; | ||
254 | 312 | ||
255 | // | 313 | // |
256 | // Environment. | 314 | // Environment. |
@@ -330,7 +388,7 @@ make_procedure(Object *(*proc)(struct Object *args)) { | |||
330 | } | 388 | } |
331 | 389 | ||
332 | Object * | 390 | Object * |
333 | cons(Object *car, Object *cdr) { | 391 | make_pair(Object *car, Object *cdr) { |
334 | Object *obj = malloc(sizeof(Object)); | 392 | Object *obj = malloc(sizeof(Object)); |
335 | obj->type = OBJ_TYPE_PAIR; | 393 | obj->type = OBJ_TYPE_PAIR; |
336 | obj->car = car; | 394 | obj->car = car; |
@@ -338,98 +396,68 @@ cons(Object *car, Object *cdr) { | |||
338 | return obj; | 396 | return obj; |
339 | } | 397 | } |
340 | 398 | ||
341 | bool | ||
342 | token_is_fixnum(StringView token) { | ||
343 | for (size_t i = 0; i < token.n; i++) { | ||
344 | char c = token.start[i]; | ||
345 | if (i == 0 && c == '-' && token.n > 1) { | ||
346 | continue; | ||
347 | } | ||
348 | if (!isdigit(c)) { | ||
349 | return false; | ||
350 | } | ||
351 | } | ||
352 | return true; | ||
353 | } | ||
354 | |||
355 | #define TRUE_TOKEN (StringView){"true", 4} | ||
356 | #define FALSE_TOKEN (StringView){"false", 5} | ||
357 | |||
358 | Object * | 399 | Object * |
359 | build_ast(Tokens *tokens) { | 400 | parse(Tokens *tokens) { |
360 | // DEBUG: Printing tokens. | ||
361 | // printf("N_TOKENS: %ld\n", tokens->n); | ||
362 | // for (size_t i = 0; i < tokens->n; i++) { | ||
363 | // printf("TOKEN: "); | ||
364 | // sv_write(tokens->start[i]); | ||
365 | // printf("\tN: %ld", tokens->start[i].n); | ||
366 | // printf("\n"); | ||
367 | // } | ||
368 | |||
369 | // TODO: Report error if we haven't consumed all the tokens? | ||
370 | while (tokens->n > 0) { | 401 | while (tokens->n > 0) { |
371 | StringView *token = consume_token(tokens); | 402 | Token *token = consume_token(tokens); |
372 | if (token == NULL) { | 403 | if (token == NULL) { |
373 | return NULL; | 404 | return NULL; |
374 | } | 405 | } |
375 | 406 | ||
376 | // OBJ_TYPE_FIXNUM | 407 | switch (token->type) { |
377 | if (token_is_fixnum(*token)) { | 408 | case TOKEN_FIXNUM: { |
378 | // Convert token to fixnum. | 409 | ssize_t num = 0; |
379 | ssize_t num = 0; | 410 | int sign = 1; |
380 | int sign = 1; | 411 | for (size_t i = 0; i < token->value.n; i++) { |
381 | for (size_t i = 0; i < token->n; i++) { | 412 | char c = token->value.start[i]; |
382 | char c = token->start[i]; | 413 | if (c == '-') { |
383 | if (c == '-') { | 414 | sign = -1; |
384 | sign = -1; | 415 | continue; |
385 | continue; | 416 | } |
417 | num = num * 10 + (c - '0'); | ||
386 | } | 418 | } |
387 | num = num * 10 + (c - '0'); | 419 | return make_fixnum(num * sign); |
388 | } | 420 | } break; |
389 | return make_fixnum(num * sign); | 421 | case TOKEN_BOOL: { |
390 | } | 422 | if (sv_equal(token->value, TRUE_TOKEN)) { |
391 | 423 | return obj_true; | |
392 | // OBJ_TYPE_BOOL | 424 | } |
393 | if (sv_equal(*token, TRUE_TOKEN)) { | 425 | if (sv_equal(token->value, FALSE_TOKEN)) { |
394 | return make_boolean(true); | 426 | return obj_false; |
395 | } | 427 | } |
396 | if (sv_equal(*token, FALSE_TOKEN)) { | 428 | } break; |
397 | return make_boolean(false); | 429 | case TOKEN_RPAREN: { |
398 | } | ||
399 | |||
400 | // OBJ_TYPE_LIST | ||
401 | if (token->start[0] == ')') { | ||
402 | return NULL; | ||
403 | } | ||
404 | if (token->start[0] == '(') { | ||
405 | if (tokens->n > 0 && tokens->start[0].start[0] == ')') { | ||
406 | return empty_list; | ||
407 | } | ||
408 | |||
409 | Object *next_obj = build_ast(tokens); | ||
410 | if (next_obj == NULL) { | ||
411 | return NULL; | 430 | return NULL; |
412 | } | 431 | } break; |
413 | Object *root = cons(next_obj, empty_list); | 432 | case TOKEN_LPAREN: { |
414 | Object *list = root; | 433 | if (tokens->n > 0 && tokens->start[0].type == TOKEN_RPAREN) { |
415 | while (tokens->n > 0 && (next_obj = build_ast(tokens)) != NULL) { | 434 | return obj_nil; |
416 | list->cdr = cons(next_obj, empty_list); | 435 | } |
417 | list = list->cdr; | ||
418 | } | ||
419 | return root; | ||
420 | } | ||
421 | 436 | ||
422 | // OBJ_TYPE_STRING | 437 | Object *next_obj = parse(tokens); |
423 | if (token->start[0] == '"') { | 438 | if (next_obj == NULL) { |
424 | Object *obj = make_empty_string(); | 439 | return NULL; |
425 | token = consume_token(tokens); | 440 | } |
426 | append_string(obj, *token); | 441 | Object *root = make_pair(next_obj, obj_nil); |
427 | consume_token(tokens); | 442 | Object *list = root; |
428 | return obj; | 443 | while (tokens->n > 0 && (next_obj = parse(tokens)) != NULL) { |
444 | list->cdr = make_pair(next_obj, obj_nil); | ||
445 | list = list->cdr; | ||
446 | } | ||
447 | return root; | ||
448 | } break; | ||
449 | case TOKEN_STRING: { | ||
450 | Object *obj = make_empty_string(); | ||
451 | append_string(obj, token->value); | ||
452 | return obj; | ||
453 | } break; | ||
454 | case TOKEN_SYMBOL: { | ||
455 | return make_symbol(token->value.start, token->value.n); | ||
456 | } break; | ||
457 | default: { | ||
458 | fprintf(stderr, "error: unknown token\n"); | ||
459 | } break; | ||
429 | } | 460 | } |
430 | |||
431 | // OBJ_TYPE_SYMBOL | ||
432 | return make_symbol(token->start, token->n); | ||
433 | } | 461 | } |
434 | 462 | ||
435 | return NULL; | 463 | return NULL; |
@@ -601,7 +629,9 @@ init(void) { | |||
601 | } | 629 | } |
602 | 630 | ||
603 | // Initialize singletons. | 631 | // Initialize singletons. |
604 | empty_list = make_empty_list(); | 632 | obj_nil = make_empty_list(); |
633 | obj_true = make_boolean(true); | ||
634 | obj_false = make_boolean(false); | ||
605 | 635 | ||
606 | // Add primitive functions. | 636 | // Add primitive functions. |
607 | environment[env_n++] = (EnvSymbol){make_symbol("+", 1), make_procedure(proc_add)}; | 637 | environment[env_n++] = (EnvSymbol){make_symbol("+", 1), make_procedure(proc_add)}; |
@@ -639,7 +669,7 @@ eval(Object *root) { | |||
639 | } | 669 | } |
640 | } break; | 670 | } break; |
641 | default: { | 671 | default: { |
642 | printf("TYPE NOT IMPLEMENTED FOR EVAL.\n"); | 672 | printf("error: can't eval type %d.\n", root->type); |
643 | } break; | 673 | } break; |
644 | } | 674 | } |
645 | 675 | ||
@@ -654,8 +684,24 @@ main(void) { | |||
654 | printf(REPL_PROMPT); | 684 | printf(REPL_PROMPT); |
655 | StringView line = read_line(); | 685 | StringView line = read_line(); |
656 | Tokens tokens = tokenize(line); | 686 | Tokens tokens = tokenize(line); |
657 | Object *ast = build_ast(&tokens); | 687 | #if DEBUG |
688 | printf("N_TOKENS: %ld\n", tokens.n); | ||
689 | for (size_t i = 0; i < tokens.n; i++) { | ||
690 | printf("\tTYPE: %3d ", tokens.start[i].type); | ||
691 | printf("N: %3ld ", tokens.start[i].value.n); | ||
692 | printf("VALUE: "); | ||
693 | sv_write(tokens.start[i].value); | ||
694 | printf("\n"); | ||
695 | } | ||
696 | #endif | ||
697 | Object *ast = parse(&tokens); | ||
658 | if (ast) { | 698 | if (ast) { |
699 | #if DEBUG | ||
700 | printf("AST: "); | ||
701 | display(ast); | ||
702 | printf("\n"); | ||
703 | printf("EVAL: "); | ||
704 | #endif | ||
659 | display(eval(ast)); | 705 | display(eval(ast)); |
660 | printf("\n"); | 706 | printf("\n"); |
661 | } | 707 | } |