From 2627e81de26667b7bc9d88304473e2a234fee7fe Mon Sep 17 00:00:00 2001 From: Bad Diode Date: Mon, 11 Oct 2021 20:59:05 +0200 Subject: Simplify lexer code --- src/bootstrap/lexer.c | 215 ++++++++++++++++++++++++-------------------------- 1 file changed, 104 insertions(+), 111 deletions(-) diff --git a/src/bootstrap/lexer.c b/src/bootstrap/lexer.c index fc53d3c..1add4dc 100644 --- a/src/bootstrap/lexer.c +++ b/src/bootstrap/lexer.c @@ -85,24 +85,14 @@ push_token(Tokens *tokens, Token tok) { } typedef struct Scanner { - StringView orig; StringView current; size_t line_number; size_t col_number; size_t offset; - size_t lexeme_n; - size_t lexeme_offset; - size_t lexeme_line_number; - size_t lexeme_col_number; } Scanner; char scan_next(Scanner *scanner) { - if (scanner->lexeme_n == 0) { - scanner->lexeme_line_number = scanner->line_number; - scanner->lexeme_col_number = scanner->col_number; - scanner->lexeme_offset = scanner->offset; - } char c = sv_next(&scanner->current); if (c == '\n') { scanner->line_number++; @@ -124,27 +114,24 @@ scan_has_next(const Scanner *scanner) { return scanner->current.n != 0; } -bool -scan_has_lexeme(const Scanner * scanner) { - return scanner->lexeme_n != 0; -} - -Token -scan_get_lexeme(Scanner * scanner) { - Token token = (Token){ - .type = TOKEN_UNKNOWN, - .value = (StringView){ - .start = &scanner->orig.start[scanner->lexeme_offset], - .n = scanner->lexeme_n, - }, - .line = scanner->lexeme_line_number, - .column = scanner->lexeme_col_number, - }; - scanner->lexeme_n = 0; - scanner->lexeme_line_number = scanner->line_number; - scanner->lexeme_col_number = scanner->col_number; - scanner->lexeme_offset = scanner->offset; - return token; +void +skip_whitespace(Scanner *scanner) { + while (scan_has_next(scanner)) { + char c = scan_peek(scanner); + switch (c) { + case ' ': + case '\f': + case '\n': + case '\r': + case '\t': + case '\v': { + scan_next(scanner); + } break; + default: { + return; + } break; + } + } } TokenType @@ -172,52 +159,51 @@ find_primitive_type(StringView value) { return TOKEN_SYMBOL; } +bool +is_delimiter(char c) { + switch (c) { + case EOF: + case '\0': + case ';': + case '"': + case '\'': + case '(': + case ')': + case ' ': + case '\f': + case '\n': + case '\r': + case '\t': + case '\v': { + return true; + } break; + } + return false; +} + Tokens tokenize(const StringView *sv) { Tokens tokens = (Tokens){0}; Scanner scanner = (Scanner){ - .orig = *sv, .current = *sv, .line_number = 1, .col_number = 1, - .lexeme_line_number = 1, - .lexeme_col_number = 1, }; while (scan_has_next(&scanner)) { + skip_whitespace(&scanner); + size_t line = scanner.line_number; + size_t col = scanner.col_number; + size_t offset = scanner.offset; char c = scan_next(&scanner); switch (c) { - case ' ': - case '\f': - case '\n': - case '\r': - case '\t': - case '\v': { - if (scan_has_lexeme(&scanner)) { - Token token = scan_get_lexeme(&scanner); - token.type = find_primitive_type(token.value); - push_token(&tokens, token); - } - } break; case ';': { - if (scan_has_lexeme(&scanner)) { - Token token = scan_get_lexeme(&scanner); - token.type = find_primitive_type(token.value); - push_token(&tokens, token); - } while ((c = scan_next(&scanner)) != '\n' && c != '\0') {} } break; case '"': { - if (scan_has_lexeme(&scanner)) { - Token token = scan_get_lexeme(&scanner); - token.type = find_primitive_type(token.value); - push_token(&tokens, token); - scanner.lexeme_col_number--; - scanner.lexeme_offset--; - } - char prev = c; bool found = false; + size_t n = 0; while (scan_has_next(&scanner)) { c = scan_next(&scanner); if (c == '"' && prev != '\\') { @@ -225,82 +211,89 @@ tokenize(const StringView *sv) { break; } prev = c; - scanner.lexeme_n++; + n++; } - scanner.lexeme_col_number--; - if (found) { - Token token = scan_get_lexeme(&scanner); - token.type = TOKEN_STRING; - push_token(&tokens, token); - } else { + if (!found) { // TODO: Report error: couldn't find the closing quotes. } + Token token = (Token){ + .value = (StringView){ + .start = &sv->start[offset + 1], + .n = n, + }, + .type = TOKEN_STRING, + .line = line, + .column = col, + }; + push_token(&tokens, token); } break; case '\'': { - if (scan_has_lexeme(&scanner)) { - Token token = scan_get_lexeme(&scanner); - token.type = find_primitive_type(token.value); - push_token(&tokens, token); - scanner.lexeme_col_number--; - scanner.lexeme_offset--; - } - Token token = scan_get_lexeme(&scanner); - token.type = TOKEN_QUOTE; + Token token = (Token){ + .type = TOKEN_QUOTE, + .line = line, + .column = col, + }; push_token(&tokens, token); } break; case '(': { - if (scan_has_lexeme(&scanner)) { - Token token = scan_get_lexeme(&scanner); - token.type = find_primitive_type(token.value); - push_token(&tokens, token); - scanner.lexeme_col_number--; - scanner.lexeme_offset--; - } - scanner.lexeme_n++; if (scan_peek(&scanner) == ')') { - scanner.lexeme_n++; scan_next(&scanner); - Token token = scan_get_lexeme(&scanner); - token.type = TOKEN_NIL; + Token token = (Token){ + .type = TOKEN_NIL, + .line = line, + .column = col, + }; push_token(&tokens, token); } else { - Token token = scan_get_lexeme(&scanner); - token.type = TOKEN_LPAREN; + Token token = (Token){ + .type = TOKEN_LPAREN, + .line = line, + .column = col, + }; push_token(&tokens, token); } } break; case ')': { - if (scan_has_lexeme(&scanner)) { - Token token = scan_get_lexeme(&scanner); - token.type = find_primitive_type(token.value); - push_token(&tokens, token); - scanner.lexeme_col_number--; - scanner.lexeme_offset--; - } - scanner.lexeme_n++; - Token token = scan_get_lexeme(&scanner); - token.type = TOKEN_RPAREN; + Token token = (Token){ + .type = TOKEN_RPAREN, + .line = line, + .column = col, + }; push_token(&tokens, token); } break; - case EOF: { - break; - } break; default: { - scanner.lexeme_n++; + size_t n = 1; + while (scan_has_next(&scanner)) { + c = scan_next(&scanner); + if (is_delimiter(c)) { + break; + } + n++; + } + if (c == EOF || c == '\0') { + break; + } + Token token = (Token){ + .value = (StringView){ + .start = &sv->start[offset], + .n = n, + }, + .type = TOKEN_SYMBOL, + .line = line, + .column = col, + }; + token.type = find_primitive_type(token.value); + push_token(&tokens, token); } break; } } - // Push current lexeme if any. - if (scan_has_lexeme(&scanner)) { - Token token = scan_get_lexeme(&scanner); - token.type = find_primitive_type(token.value); - push_token(&tokens, token); - } - // Push EOF token. - Token token = scan_get_lexeme(&scanner); - token.type = TOKEN_EOF; + Token token = (Token){ + .type = TOKEN_EOF, + .line = scanner.line_number, + .column = 1, + }; push_token(&tokens, token); return tokens; -- cgit v1.2.1