From 463690390b45ddd96545ae958e2605e262966c9f Mon Sep 17 00:00:00 2001
From: Bad Diode
Date: Mon, 11 Oct 2021 19:33:29 +0200
Subject: Add a new version of the lexer for token gen

---
 src/bootstrap/lexer.c     | 357 ++++++++++++++++++++++++++++++++++++++++++++++
 src/bootstrap/main.c      |  17 ++-
 src/bootstrap/read_line.c |   1 -
 3 files changed, 370 insertions(+), 5 deletions(-)
 create mode 100644 src/bootstrap/lexer.c

diff --git a/src/bootstrap/lexer.c b/src/bootstrap/lexer.c
new file mode 100644
index 0000000..fc53d3c
--- /dev/null
+++ b/src/bootstrap/lexer.c
@@ -0,0 +1,357 @@
+// Kinds of token produced by the lexer.
+typedef enum TokenType {
+    TOKEN_UNKNOWN = 0,
+    TOKEN_LPAREN,
+    TOKEN_RPAREN,
+    TOKEN_QUOTE,
+    TOKEN_TRUE,
+    TOKEN_FALSE,
+    TOKEN_NIL,
+    TOKEN_FIXNUM,
+    TOKEN_SYMBOL,
+    TOKEN_STRING,
+    TOKEN_EOF,
+} TokenType;
+
+// A token: its kind, the slice of the source text it covers and the line and
+// column where it starts.
+typedef struct Token {
+    TokenType type;
+    StringView value;
+    size_t line;
+    size_t column;
+} Token;
+
+// Growable array of tokens. A zero-initialized Tokens is a valid empty array;
+// the buffer is allocated by the first call to push_token.
+typedef struct Tokens {
+    Token *buf;
+    size_t size;
+    size_t cap;
+} Tokens;
+
+// Print a one-line description of `tok` to stdout: its position, its type
+// and, for fixnums, symbols and strings, its text.
+void
+print_token(Token tok) {
+    // line and column are size_t, so they need the %zu conversion.
+    printf("LINE: %3zu COL: %3zu ", tok.line, tok.column);
+    switch (tok.type) {
+        case TOKEN_LPAREN: {
+            printf("TOKEN_LPAREN");
+        } break;
+        case TOKEN_RPAREN: {
+            printf("TOKEN_RPAREN");
+        } break;
+        case TOKEN_QUOTE: {
+            printf("TOKEN_QUOTE");
+        } break;
+        case TOKEN_TRUE: {
+            printf("TOKEN_TRUE");
+        } break;
+        case TOKEN_FALSE: {
+            printf("TOKEN_FALSE");
+        } break;
+        case TOKEN_NIL: {
+            printf("TOKEN_NIL");
+        } break;
+        case TOKEN_FIXNUM: {
+            printf("TOKEN_FIXNUM -> ");
+            sv_write(&tok.value, stdout);
+        } break;
+        case TOKEN_SYMBOL: {
+            printf("TOKEN_SYMBOL -> ");
+            sv_write(&tok.value, stdout);
+        } break;
+        case TOKEN_STRING: {
+            printf("TOKEN_STRING -> ");
+            sv_write(&tok.value, stdout);
+        } break;
+        case TOKEN_EOF: {
+            printf("TOKEN_EOF");
+        } break;
+        case TOKEN_UNKNOWN: {
+            printf("TOKEN_UNKNOWN");
+        } break;
+    }
+    printf("\n");
+}
+
+// Number of tokens the buffer can hold after its first allocation.
+#define TOK_BUF_CAP 256
+
+// Append `tok` to `tokens`, allocating the buffer on first use and doubling
+// its capacity whenever it is full. Running out of memory is fatal: a message
+// is written to stderr and the process exits.
+void
+push_token(Tokens *tokens, Token tok) {
+    if (tokens->buf == NULL) {
+        tokens->size = 0;
+        tokens->cap = TOK_BUF_CAP;
+        tokens->buf = malloc(tokens->cap * sizeof(Token));
+        if (tokens->buf == NULL) {
+            fprintf(stderr, "error: out of memory\n");
+            exit(EXIT_FAILURE);
+        }
+    } else if (tokens->size == tokens->cap) {
+        // Only replace the buffer once realloc is known to have succeeded.
+        size_t new_cap = tokens->cap * 2;
+        Token *new_buf = realloc(tokens->buf, new_cap * sizeof(Token));
+        if (new_buf == NULL) {
+            fprintf(stderr, "error: out of memory\n");
+            exit(EXIT_FAILURE);
+        }
+        tokens->buf = new_buf;
+        tokens->cap = new_cap;
+    }
+    tokens->buf[tokens->size++] = tok;
+}
+
+// Lexer state while walking a source buffer.
+typedef struct Scanner {
+    // The whole source; token values are slices of this view.
+    StringView orig;
+    // The input still to be read; scan_has_next tests its length.
+    StringView current;
+    // Line, column and offset into `orig` of the next character to read.
+    size_t line_number;
+    size_t col_number;
+    size_t offset;
+    // Length, start offset and start position of the lexeme being built.
+    size_t lexeme_n;
+    size_t lexeme_offset;
+    size_t lexeme_line_number;
+    size_t lexeme_col_number;
+} Scanner;
+
+// Consume and return the next character, advancing the line, column and
+// offset counters. When no lexeme is in progress the lexeme start is first
+// moved to the character about to be read.
+char
+scan_next(Scanner *scanner) {
+    if (scanner->lexeme_n == 0) {
+        scanner->lexeme_line_number = scanner->line_number;
+        scanner->lexeme_col_number = scanner->col_number;
+        scanner->lexeme_offset = scanner->offset;
+    }
+    char c = sv_next(&scanner->current);
+    if (c == '\n') {
+        scanner->line_number++;
+        scanner->col_number = 1;
+    } else {
+        scanner->col_number++;
+    }
+    scanner->offset++;
+    return c;
+}
+
+// Return the next character without consuming it.
+char
+scan_peek(const Scanner *scanner) {
+    return sv_peek(&scanner->current);
+}
+
+// Report whether there is any input left to read.
+bool
+scan_has_next(const Scanner *scanner) {
+    return scanner->current.n != 0;
+}
+
+// Report whether a lexeme is currently being built.
+bool
+scan_has_lexeme(const Scanner * scanner) {
+    return scanner->lexeme_n != 0;
+}
+
+// Return the lexeme in progress as a TOKEN_UNKNOWN token (the caller sets the
+// real type) and start a new, empty lexeme at the current position.
+Token
+scan_get_lexeme(Scanner * scanner) {
+    Token token = (Token){
+        .type = TOKEN_UNKNOWN,
+        .value = (StringView){
+            .start = &scanner->orig.start[scanner->lexeme_offset],
+            .n = scanner->lexeme_n,
+        },
+        .line = scanner->lexeme_line_number,
+        .column = scanner->lexeme_col_number,
+    };
+    scanner->lexeme_n = 0;
+    scanner->lexeme_line_number = scanner->line_number;
+    scanner->lexeme_col_number = scanner->col_number;
+    scanner->lexeme_offset = scanner->offset;
+    return token;
+}
+
+// Classify the text of a bare lexeme. An optional leading '-' followed by
+// decimal digits is a fixnum, the words "true" and "false" are booleans and
+// anything else is a symbol. An empty value yields TOKEN_UNKNOWN.
+TokenType
+find_primitive_type(StringView value) {
+    // Without this check the digit loop below would accept an empty value.
+    if (value.n == 0) {
+        return TOKEN_UNKNOWN;
+    }
+    bool is_fixnum = true;
+    for (size_t i = 0; i < value.n; i++) {
+        char c = value.start[i];
+        // The sign is only skipped when more characters follow it.
+        if (i == 0 && c == '-' && value.n > 1) {
+            continue;
+        }
+        if (!(c >= '0' && c <= '9')) {
+            is_fixnum = false;
+            break;
+        }
+    }
+    if (is_fixnum) {
+        return TOKEN_FIXNUM;
+    }
+    if (sv_equal(&value, &(StringView){"true", 4})) {
+        return TOKEN_TRUE;
+    }
+    if (sv_equal(&value, &(StringView){"false", 5})) {
+        return TOKEN_FALSE;
+    }
+    return TOKEN_SYMBOL;
+}
+
+// If a lexeme is in progress, classify it, push it to `tokens` and return
+// true. Otherwise do nothing and return false.
+static bool
+scan_push_lexeme(Scanner *scanner, Tokens *tokens) {
+    if (!scan_has_lexeme(scanner)) {
+        return false;
+    }
+    Token token = scan_get_lexeme(scanner);
+    token.type = find_primitive_type(token.value);
+    push_token(tokens, token);
+    return true;
+}
+
+// Push the lexeme, if any, that is ended by the delimiter just consumed.
+// scan_get_lexeme restarts the lexeme after that delimiter, so move the start
+// back by one character to make the delimiter's own token begin on it.
+static void
+scan_end_lexeme(Scanner *scanner, Tokens *tokens) {
+    if (scan_push_lexeme(scanner, tokens)) {
+        scanner->lexeme_col_number--;
+        scanner->lexeme_offset--;
+    }
+}
+
+// Split the source text `sv` into tokens. Whitespace separates lexemes, ';'
+// starts a comment that runs to the end of the line, and an unterminated
+// string produces a TOKEN_UNKNOWN token. The result always ends with a
+// TOKEN_EOF token; token values point into `sv`, and the caller must free the
+// returned buffer.
+Tokens
+tokenize(const StringView *sv) {
+    Tokens tokens = (Tokens){0};
+    Scanner scanner = (Scanner){
+        .orig = *sv,
+        .current = *sv,
+        .line_number = 1,
+        .col_number = 1,
+        .lexeme_line_number = 1,
+        .lexeme_col_number = 1,
+    };
+
+    while (scan_has_next(&scanner)) {
+        char c = scan_next(&scanner);
+        switch (c) {
+            case ' ':
+            case '\f':
+            case '\n':
+            case '\r':
+            case '\t':
+            case '\v': {
+                scan_push_lexeme(&scanner, &tokens);
+            } break;
+            case ';': {
+                scan_push_lexeme(&scanner, &tokens);
+                // Skip the rest of the line. Testing for remaining input
+                // first keeps a comment on the last line from reading past
+                // the end of the source.
+                while (scan_has_next(&scanner)) {
+                    c = scan_next(&scanner);
+                    if (c == '\n' || c == '\0') {
+                        break;
+                    }
+                }
+            } break;
+            case '"': {
+                scan_end_lexeme(&scanner, &tokens);
+                // The lexeme start is on the opening quote at this point;
+                // remember it, because reading the contents moves it.
+                size_t quote_line = scanner.lexeme_line_number;
+                size_t quote_col = scanner.lexeme_col_number;
+
+                // A backslash escapes the character that follows it, so \"
+                // stays inside the string while the quote after \\ ends it.
+                bool escaped = false;
+                bool found = false;
+                while (scan_has_next(&scanner)) {
+                    c = scan_next(&scanner);
+                    if (escaped) {
+                        escaped = false;
+                    } else if (c == '\\') {
+                        escaped = true;
+                    } else if (c == '"') {
+                        found = true;
+                        break;
+                    }
+                    scanner.lexeme_n++;
+                }
+
+                // TODO: Report error: couldn't find the closing quotes.
+                Token token = scan_get_lexeme(&scanner);
+                token.type = found ? TOKEN_STRING : TOKEN_UNKNOWN;
+                token.line = quote_line;
+                token.column = quote_col;
+                push_token(&tokens, token);
+            } break;
+            case '\'': {
+                scan_end_lexeme(&scanner, &tokens);
+                Token token = scan_get_lexeme(&scanner);
+                token.type = TOKEN_QUOTE;
+                push_token(&tokens, token);
+            } break;
+            case '(': {
+                scan_end_lexeme(&scanner, &tokens);
+                scanner.lexeme_n++;
+                // "()" is a single nil token rather than two parentheses.
+                TokenType type = TOKEN_LPAREN;
+                if (scan_has_next(&scanner) && scan_peek(&scanner) == ')') {
+                    scanner.lexeme_n++;
+                    scan_next(&scanner);
+                    type = TOKEN_NIL;
+                }
+                Token token = scan_get_lexeme(&scanner);
+                token.type = type;
+                push_token(&tokens, token);
+            } break;
+            case ')': {
+                scan_end_lexeme(&scanner, &tokens);
+                scanner.lexeme_n++;
+                Token token = scan_get_lexeme(&scanner);
+                token.type = TOKEN_RPAREN;
+                push_token(&tokens, token);
+            } break;
+            default: {
+                // Any other character is part of the current lexeme.
+                scanner.lexeme_n++;
+            } break;
+        }
+    }
+
+    // Push current lexeme if any.
+    scan_push_lexeme(&scanner, &tokens);
+
+    // Push EOF token.
+    Token token = scan_get_lexeme(&scanner);
+    token.type = TOKEN_EOF;
+    push_token(&tokens, token);
+
+    return tokens;
+}
diff --git a/src/bootstrap/main.c b/src/bootstrap/main.c
index 662831e..113ee48 100755
--- a/src/bootstrap/main.c
+++ b/src/bootstrap/main.c
@@ -5,10 +5,19 @@
 
 #include "string_view.c"
 #include "read_line.c"
+#include "lexer.c"
 
 void
 process_source(const StringView *source) {
-    sv_write(source, stdout);
+    Tokens tokens = tokenize(source);
+
+    // Print tokens.
+    for (size_t i = 0; i < tokens.size; i++) {
+        Token tok = tokens.buf[i];
+        print_token(tok);
+    }
+
+    free(tokens.buf);
 }
 
 #define REPL_PROMPT "bdl> "
@@ -57,13 +66,13 @@ run_file(char *file_name) {
     fclose(file);
 }
 
-#define STDIN_BUF_SIZE 16
+#define STDIN_BUF_CAP 16
 
 void
 run_stdin(void) {
     size_t buf_size = 0;
-    size_t buf_cap = STDIN_BUF_SIZE;
-    char *source = malloc(sizeof(char) * buf_cap);
+    size_t buf_cap = STDIN_BUF_CAP;
+    char *source = malloc(buf_cap * sizeof(char));
 
     char c;
     while ((c = getchar()) != EOF) {
diff --git a/src/bootstrap/read_line.c b/src/bootstrap/read_line.c
index 7612d05..603bfee 100644
--- a/src/bootstrap/read_line.c
+++ b/src/bootstrap/read_line.c
@@ -29,4 +29,3 @@ read_line(void) {
     };
     return sv;
 }
-
-- 
cgit v1.2.1