From aa5db9c6d13f353cfb839f732eb81d02d74fc1b7 Mon Sep 17 00:00:00 2001 From: Bad Diode Date: Sat, 9 Oct 2021 11:45:54 +0200 Subject: Refactor tokenizer/parser to assign/use token types --- src/bootstrap/main.c | 294 +++++++++++++++++++++++++++++---------------------- 1 file changed, 170 insertions(+), 124 deletions(-) diff --git a/src/bootstrap/main.c b/src/bootstrap/main.c index 3c5663a..65a61f4 100755 --- a/src/bootstrap/main.c +++ b/src/bootstrap/main.c @@ -60,21 +60,65 @@ read_line(void) { return (StringView){.start = (char *)&readline_buf, .n = n}; } +typedef enum TokenType { + TOKEN_UNKNOWN = 0, + TOKEN_LPAREN, + TOKEN_RPAREN, + TOKEN_FIXNUM, + TOKEN_SYMBOL, + TOKEN_BOOL, + TOKEN_STRING, +} TokenType; + +typedef struct Token { + TokenType type; + StringView value; +} Token; + typedef struct Tokens { - StringView *start; + Token *start; size_t n; } Tokens; +#define TRUE_TOKEN (StringView){"true", 4} +#define FALSE_TOKEN (StringView){"false", 5} +#define LPAREN_TOKEN (StringView){"(", 1} +#define RPAREN_TOKEN (StringView){")", 1} + +TokenType +find_token_type(StringView value) { + bool is_fixnum = true; + for (size_t i = 0; i < value.n; i++) { + char c = value.start[i]; + if (i == 0 && c == '-' && value.n > 1) { + continue; + } + if (!isdigit(c)) { + is_fixnum = false; + break; + } + } + if (is_fixnum) { + return TOKEN_FIXNUM; + } + + if (sv_equal(value, TRUE_TOKEN) || sv_equal(value, FALSE_TOKEN)) { + return TOKEN_BOOL; + } + + return TOKEN_SYMBOL; +} + Tokens tokenize(StringView sv) { // NOTE: Not allocating any memory for now, but we are limited by a maximum // number of tokens we can process. #define TOKENS_BUF_SIZE 1024 - static StringView tokens_buf[TOKENS_BUF_SIZE]; + static Token tokens_buf[TOKENS_BUF_SIZE]; // Clear buffer. for (size_t i = 0; i < TOKENS_BUF_SIZE; i++) { - tokens_buf[i] = (StringView){0}; + tokens_buf[i] = (Token){0}; } size_t n = 0; @@ -88,11 +132,15 @@ tokenize(StringView sv) { case '\t': case '\v': { if (token_n != 0) { - // Push token. - tokens_buf[n++] = (StringView){ - .start = &sv.start[i - token_n], - .n = token_n, + Token token = (Token){ + .type = TOKEN_UNKNOWN, + .value = (StringView){ + .start = &sv.start[i - token_n], + .n = token_n, + } }; + token.type = find_token_type(token.value); + tokens_buf[n++] = token; token_n = 0; } continue; @@ -117,19 +165,14 @@ tokenize(StringView sv) { string_end++; } - // Push string token. - tokens_buf[n++] = (StringView){ - .start = &sv.start[string_start - 1], - .n = 1, - }; - tokens_buf[n++] = (StringView){ - .start = &sv.start[string_start], - .n = string_end - string_start, - }; - tokens_buf[n++] = (StringView){ - .start = &sv.start[string_end], - .n = 1, + Token token = (Token){ + .type = TOKEN_STRING, + .value = (StringView){ + .start = &sv.start[string_start], + .n = string_end - string_start, + } }; + tokens_buf[n++] = token; token_n = 0; i += string_end - string_start + 1; } break; @@ -146,11 +189,12 @@ tokenize(StringView sv) { fprintf(stderr, "error: lparen delimiter within symbol name\n"); return (Tokens){0}; } - // Push paren token. - tokens_buf[n++] = (StringView){ - .start = &sv.start[i], - .n = 1, + + Token token = (Token){ + .type = TOKEN_LPAREN, + .value = LPAREN_TOKEN, }; + tokens_buf[n++] = token; } break; case ')': { if ((i + 1) < sv.n) { @@ -163,20 +207,27 @@ tokenize(StringView sv) { if (token_n != 0) { // Push previous token. - tokens_buf[n++] = (StringView){ - .start = &sv.start[i - token_n], - .n = token_n, + Token token = (Token){ + .type = TOKEN_UNKNOWN, + .value = (StringView){ + .start = &sv.start[i - token_n], + .n = token_n, + } }; + token.type = find_token_type(token.value); + tokens_buf[n++] = token; token_n = 0; } - // Push paren token. - tokens_buf[n++] = (StringView){ - .start = &sv.start[i], - .n = 1, + Token token = (Token){ + .type = TOKEN_RPAREN, + .value = RPAREN_TOKEN, }; + tokens_buf[n++] = token; + } break; + case EOF: { + break; } break; - // TODO: Handle double quotes and escaped quotes. default: { token_n++; } break; @@ -184,21 +235,26 @@ tokenize(StringView sv) { } if (token_n != 0) { // End of line encountered. - tokens_buf[n++] = (StringView){ - .start = &sv.start[sv.n - token_n], - .n = token_n, + Token token = (Token){ + .type = TOKEN_UNKNOWN, + .value = (StringView){ + .start = &sv.start[sv.n - token_n], + .n = token_n, + } }; + token.type = find_token_type(token.value); + tokens_buf[n++] = token; } - return (Tokens){.start = (StringView *)&tokens_buf, .n = n}; + return (Tokens){.start = (Token *)&tokens_buf, .n = n}; } -StringView * +Token * consume_token(Tokens *tokens) { if (tokens->n == 0) { return NULL; } - StringView *ret = tokens->start; + Token *ret = tokens->start; tokens->start = &tokens->start[1]; tokens->n--; return ret; @@ -250,7 +306,9 @@ typedef struct Object { // Singletons. // -Object *empty_list; +Object *obj_nil; +Object *obj_true; +Object *obj_false; // // Environment. @@ -330,7 +388,7 @@ make_procedure(Object *(*proc)(struct Object *args)) { } Object * -cons(Object *car, Object *cdr) { +make_pair(Object *car, Object *cdr) { Object *obj = malloc(sizeof(Object)); obj->type = OBJ_TYPE_PAIR; obj->car = car; @@ -338,98 +396,68 @@ cons(Object *car, Object *cdr) { return obj; } -bool -token_is_fixnum(StringView token) { - for (size_t i = 0; i < token.n; i++) { - char c = token.start[i]; - if (i == 0 && c == '-' && token.n > 1) { - continue; - } - if (!isdigit(c)) { - return false; - } - } - return true; -} - -#define TRUE_TOKEN (StringView){"true", 4} -#define FALSE_TOKEN (StringView){"false", 5} - Object * -build_ast(Tokens *tokens) { - // DEBUG: Printing tokens. - // printf("N_TOKENS: %ld\n", tokens->n); - // for (size_t i = 0; i < tokens->n; i++) { - // printf("TOKEN: "); - // sv_write(tokens->start[i]); - // printf("\tN: %ld", tokens->start[i].n); - // printf("\n"); - // } - - // TODO: Report error if we haven't consumed all the tokens? +parse(Tokens *tokens) { while (tokens->n > 0) { - StringView *token = consume_token(tokens); + Token *token = consume_token(tokens); if (token == NULL) { return NULL; } - // OBJ_TYPE_FIXNUM - if (token_is_fixnum(*token)) { - // Convert token to fixnum. - ssize_t num = 0; - int sign = 1; - for (size_t i = 0; i < token->n; i++) { - char c = token->start[i]; - if (c == '-') { - sign = -1; - continue; + switch (token->type) { + case TOKEN_FIXNUM: { + ssize_t num = 0; + int sign = 1; + for (size_t i = 0; i < token->value.n; i++) { + char c = token->value.start[i]; + if (c == '-') { + sign = -1; + continue; + } + num = num * 10 + (c - '0'); } - num = num * 10 + (c - '0'); - } - return make_fixnum(num * sign); - } - - // OBJ_TYPE_BOOL - if (sv_equal(*token, TRUE_TOKEN)) { - return make_boolean(true); - } - if (sv_equal(*token, FALSE_TOKEN)) { - return make_boolean(false); - } - - // OBJ_TYPE_LIST - if (token->start[0] == ')') { - return NULL; - } - if (token->start[0] == '(') { - if (tokens->n > 0 && tokens->start[0].start[0] == ')') { - return empty_list; - } - - Object *next_obj = build_ast(tokens); - if (next_obj == NULL) { + return make_fixnum(num * sign); + } break; + case TOKEN_BOOL: { + if (sv_equal(token->value, TRUE_TOKEN)) { + return obj_true; + } + if (sv_equal(token->value, FALSE_TOKEN)) { + return obj_false; + } + } break; + case TOKEN_RPAREN: { return NULL; - } - Object *root = cons(next_obj, empty_list); - Object *list = root; - while (tokens->n > 0 && (next_obj = build_ast(tokens)) != NULL) { - list->cdr = cons(next_obj, empty_list); - list = list->cdr; - } - return root; - } + } break; + case TOKEN_LPAREN: { + if (tokens->n > 0 && tokens->start[0].type == TOKEN_RPAREN) { + return obj_nil; + } - // OBJ_TYPE_STRING - if (token->start[0] == '"') { - Object *obj = make_empty_string(); - token = consume_token(tokens); - append_string(obj, *token); - consume_token(tokens); - return obj; + Object *next_obj = parse(tokens); + if (next_obj == NULL) { + return NULL; + } + Object *root = make_pair(next_obj, obj_nil); + Object *list = root; + while (tokens->n > 0 && (next_obj = parse(tokens)) != NULL) { + list->cdr = make_pair(next_obj, obj_nil); + list = list->cdr; + } + return root; + } break; + case TOKEN_STRING: { + Object *obj = make_empty_string(); + append_string(obj, token->value); + return obj; + } break; + case TOKEN_SYMBOL: { + return make_symbol(token->value.start, token->value.n); + } break; + default: { + fprintf(stderr, "error: unknown token\n"); + } break; } - - // OBJ_TYPE_SYMBOL - return make_symbol(token->start, token->n); } return NULL; @@ -601,7 +629,9 @@ init(void) { } // Initialize singletons. - empty_list = make_empty_list(); + obj_nil = make_empty_list(); + obj_true = make_boolean(true); + obj_false = make_boolean(false); // Add primitive functions. environment[env_n++] = (EnvSymbol){make_symbol("+", 1), make_procedure(proc_add)}; @@ -639,7 +669,7 @@ eval(Object *root) { } } break; default: { - printf("TYPE NOT IMPLEMENTED FOR EVAL.\n"); + printf("error: can't eval type %d.\n", root->type); } break; } @@ -654,8 +684,24 @@ main(void) { printf(REPL_PROMPT); StringView line = read_line(); Tokens tokens = tokenize(line); - Object *ast = build_ast(&tokens); +#if DEBUG + printf("N_TOKENS: %ld\n", tokens.n); + for (size_t i = 0; i < tokens.n; i++) { + printf("\tTYPE: %3d ", tokens.start[i].type); + printf("N: %3ld ", tokens.start[i].value.n); + printf("VALUE: "); + sv_write(tokens.start[i].value); + printf("\n"); + } +#endif + Object *ast = parse(&tokens); if (ast) { +#if DEBUG + printf("AST: "); + display(ast); + printf("\n"); + printf("EVAL: "); +#endif display(eval(ast)); printf("\n"); } -- cgit v1.2.1