From eeff5e273f22aa28e81ab080e9ffdce85ac394b8 Mon Sep 17 00:00:00 2001 From: Bad Diode Date: Fri, 22 Oct 2021 09:59:31 +0200 Subject: Prepare skeleton for bytecode interpreter --- src/treewalk/lexer.c | 257 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 257 insertions(+) create mode 100644 src/treewalk/lexer.c (limited to 'src/treewalk/lexer.c') diff --git a/src/treewalk/lexer.c b/src/treewalk/lexer.c new file mode 100644 index 0000000..38ca37c --- /dev/null +++ b/src/treewalk/lexer.c @@ -0,0 +1,257 @@ +#include "lexer.h" + +void +print_token(Token tok) { + printf("LINE: %3ld COL: %3ld ", tok.line, tok.column); + switch (tok.type) { + case TOKEN_LPAREN: { + printf("TOKEN_LPAREN"); + } break; + case TOKEN_RPAREN: { + printf("TOKEN_RPAREN"); + } break; + case TOKEN_QUOTE: { + printf("TOKEN_QUOTE"); + } break; + case TOKEN_TRUE: { + printf("TOKEN_TRUE"); + } break; + case TOKEN_FALSE: { + printf("TOKEN_FALSE"); + } break; + case TOKEN_NIL: { + printf("TOKEN_NIL"); + } break; + case TOKEN_FIXNUM: { + printf("TOKEN_FIXNUM -> "); + sv_write(&tok.value, stdout); + } break; + case TOKEN_SYMBOL: { + printf("TOKEN_SYMBOL -> "); + sv_write(&tok.value, stdout); + } break; + case TOKEN_STRING: { + printf("TOKEN_STRING -> "); + sv_write(&tok.value, stdout); + } break; + case TOKEN_EOF: { + printf("TOKEN_EOF"); + } break; + case TOKEN_UNKNOWN: { + printf("TOKEN_UNKNOWN"); + } break; + } + printf("\n"); +} + +char +scan_next(Scanner *scanner) { + char c = sv_next(&scanner->current); + if (c == '\n') { + scanner->line_number++; + scanner->col_number = 1; + } else { + scanner->col_number++; + } + scanner->offset++; + return c; +} + +char +scan_peek(const Scanner *scanner) { + return sv_peek(&scanner->current); +} + +bool +scan_has_next(const Scanner *scanner) { + return scanner->current.n != 0; +} + +void +skip_whitespace(Scanner *scanner) { + while (scan_has_next(scanner)) { + char c = scan_peek(scanner); + switch (c) { + case ' ': + case '\f': + case '\n': + case '\r': + case '\t': + case '\v': { + scan_next(scanner); + } break; + default: { + return; + } break; + } + } +} + +bool +is_delimiter(char c) { + switch (c) { + case EOF: + case '\0': + case ';': + case '"': + case '\'': + case '(': + case ')': + case ' ': + case '\f': + case '\n': + case '\r': + case '\t': + case '\v': { + return true; + } break; + } + return false; +} + +TokenType +find_primitive_type(const StringView value) { + bool is_fixnum = true; + for (size_t i = 0; i < value.n; i++) { + char c = value.start[i]; + if (i == 0 && c == '-' && value.n > 1) { + continue; + } + if (!(c >= '0' && c <= '9')) { + is_fixnum = false; + break; + } + } + if (is_fixnum) { + return TOKEN_FIXNUM; + } + if (sv_equal(&value, &(StringView){"true", 4})) { + return TOKEN_TRUE; + } + if (sv_equal(&value, &(StringView){"false", 5})) { + return TOKEN_FALSE; + } + return TOKEN_SYMBOL; +} + +Token * +tokenize(const StringView *sv) { + Token *tokens = NULL; + array_init(tokens, 1); + Scanner scanner = (Scanner){ + .current = *sv, + .line_number = 1, + .col_number = 1, + }; + + while (scan_has_next(&scanner)) { + skip_whitespace(&scanner); + size_t line = scanner.line_number; + size_t col = scanner.col_number; + size_t offset = scanner.offset; + char c = scan_next(&scanner); + switch (c) { + case ';': { + while ((c = scan_next(&scanner)) != '\n' && c != '\0') {} + } break; + case '"': { + char prev = c; + bool found = false; + size_t n = 0; + while (scan_has_next(&scanner)) { + c = scan_next(&scanner); + if (c == '"' && prev != '\\') { + found = true; + break; + } + prev = c; + n++; + } + if (!found) { + error_push((Error){ + .type = ERR_TYPE_LEXER, + .value = ERR_UNMATCHED_STRING, + .line = line, + .col = col, + }); + return tokens; + } + Token token = (Token){ + .value = (StringView){ + .start = &sv->start[offset + 1], + .n = n, + }, + .type = TOKEN_STRING, + .line = line, + .column = col, + }; + array_push(tokens, token); + } break; + case '\'': { + Token token = (Token){ + .type = TOKEN_QUOTE, + .line = line, + .column = col, + }; + array_push(tokens, token); + } break; + case '(': { + if (scan_peek(&scanner) == ')') { + scan_next(&scanner); + Token token = (Token){ + .type = TOKEN_NIL, + .line = line, + .column = col, + }; + array_push(tokens, token); + } else { + Token token = (Token){ + .type = TOKEN_LPAREN, + .line = line, + .column = col, + }; + array_push(tokens, token); + } + } break; + case ')': { + Token token = (Token){ + .type = TOKEN_RPAREN, + .line = line, + .column = col, + }; + array_push(tokens, token); + } break; + default: { + size_t n = 1; + while (!is_delimiter(scan_peek(&scanner))) { + scan_next(&scanner); + n++; + } + if (c == EOF || c == '\0') { + break; + } + Token token = (Token){ + .value = (StringView){ + .start = &sv->start[offset], + .n = n, + }, + .type = TOKEN_SYMBOL, + .line = line, + .column = col, + }; + token.type = find_primitive_type(token.value); + array_push(tokens, token); + } break; + } + } + + // Push EOF token. + Token token = (Token){ + .type = TOKEN_EOF, + .line = scanner.line_number, + .column = 1, + }; + array_push(tokens, token); + + return tokens; +} -- cgit v1.2.1