From 859c33f37f0174a7b9d76cdcbe889ff12047c99c Mon Sep 17 00:00:00 2001 From: Bad Diode Date: Sat, 9 Oct 2021 19:00:17 +0200 Subject: Split main into separate files --- src/bootstrap/lexer.c | 207 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 207 insertions(+) create mode 100644 src/bootstrap/lexer.c (limited to 'src/bootstrap/lexer.c') diff --git a/src/bootstrap/lexer.c b/src/bootstrap/lexer.c new file mode 100644 index 0000000..dd5c0f2 --- /dev/null +++ b/src/bootstrap/lexer.c @@ -0,0 +1,207 @@ +typedef enum TokenType { + TOKEN_UNKNOWN = 0, + TOKEN_LPAREN, + TOKEN_RPAREN, + TOKEN_FIXNUM, + TOKEN_SYMBOL, + TOKEN_BOOL, + TOKEN_STRING, +} TokenType; + +typedef struct Token { + TokenType type; + StringView value; +} Token; + +typedef struct Tokens { + Token *start; + size_t n; +} Tokens; + +#define TRUE_TOKEN (StringView){"true", 4} +#define FALSE_TOKEN (StringView){"false", 5} +#define LPAREN_TOKEN (StringView){"(", 1} +#define RPAREN_TOKEN (StringView){")", 1} + +TokenType +find_token_type(StringView value) { + bool is_fixnum = true; + for (size_t i = 0; i < value.n; i++) { + char c = value.start[i]; + if (i == 0 && c == '-' && value.n > 1) { + continue; + } + if (!isdigit(c)) { + is_fixnum = false; + break; + } + } + if (is_fixnum) { + return TOKEN_FIXNUM; + } + + if (sv_equal(value, TRUE_TOKEN) || sv_equal(value, FALSE_TOKEN)) { + return TOKEN_BOOL; + } + + return TOKEN_SYMBOL; +} + +Tokens +tokenize(StringView sv) { + // NOTE: Not allocating any memory for now, but we are limited by a maximum + // number of tokens we can process. + #define TOKENS_BUF_SIZE 1024 + static Token tokens_buf[TOKENS_BUF_SIZE]; + + // Clear buffer. + for (size_t i = 0; i < TOKENS_BUF_SIZE; i++) { + tokens_buf[i] = (Token){0}; + } + + size_t n = 0; + size_t token_n = 0; + for (size_t i = 0; i < sv.n; i++) { + switch (sv.start[i]) { + case ' ': + case '\f': + case '\n': + case '\r': + case '\t': + case '\v': { + if (token_n != 0) { + Token token = (Token){ + .type = TOKEN_UNKNOWN, + .value = (StringView){ + .start = &sv.start[i - token_n], + .n = token_n, + } + }; + token.type = find_token_type(token.value); + tokens_buf[n++] = token; + token_n = 0; + } + } break; + case ';': { + if (token_n != 0) { + Token token = (Token){ + .type = TOKEN_UNKNOWN, + .value = (StringView){ + .start = &sv.start[i - token_n], + .n = token_n, + } + }; + token.type = find_token_type(token.value); + tokens_buf[n++] = token; + token_n = 0; + } + + // Advance until the next newline. + do { + i++; + } while (i < sv.n && sv.start[(i + 1)] != '\n'); + } break; + case '"': { + if (token_n != 0) { + fprintf(stderr, "error: string started inside symbol\n"); + return (Tokens){0}; + } + + // Find end delimiter. + size_t string_start = i + 1; + size_t string_end = i + 1; + while (true) { + if (sv.start[string_end] == '"' && sv.start[string_end - 1] != '\\') { + break; + } + if (string_end >= sv.n) { + fprintf(stderr, "error: string delimiter not found\n"); + return (Tokens){0}; + } + string_end++; + } + + Token token = (Token){ + .type = TOKEN_STRING, + .value = (StringView){ + .start = &sv.start[string_start], + .n = string_end - string_start, + } + }; + tokens_buf[n++] = token; + token_n = 0; + i += string_end - string_start + 1; + } break; + case '(': { + if ((i + 1) < sv.n) { + char next_c = sv.start[i + 1]; + if (isspace(next_c)) { + fprintf(stderr, "error: lparen delimiter followed by space\n"); + return (Tokens){0}; + } + } + + if (token_n != 0) { + fprintf(stderr, "error: lparen delimiter within symbol name\n"); + return (Tokens){0}; + } + + Token token = (Token){ + .type = TOKEN_LPAREN, + .value = LPAREN_TOKEN, + }; + tokens_buf[n++] = token; + } break; + case ')': { + if ((i + 1) < sv.n) { + char next_c = sv.start[i + 1]; + if ((next_c != ')' && !isspace(next_c))) { + fprintf(stderr, "error: rparen delimiter within symbol name\n"); + return (Tokens){0}; + } + } + + if (token_n != 0) { + // Push previous token. + Token token = (Token){ + .type = TOKEN_UNKNOWN, + .value = (StringView){ + .start = &sv.start[i - token_n], + .n = token_n, + } + }; + token.type = find_token_type(token.value); + tokens_buf[n++] = token; + token_n = 0; + } + + Token token = (Token){ + .type = TOKEN_RPAREN, + .value = RPAREN_TOKEN, + }; + tokens_buf[n++] = token; + } break; + case EOF: { + break; + } break; + default: { + token_n++; + } break; + } + } + if (token_n != 0) { + // End of line encountered. + Token token = (Token){ + .type = TOKEN_UNKNOWN, + .value = (StringView){ + .start = &sv.start[sv.n - token_n], + .n = token_n, + } + }; + token.type = find_token_type(token.value); + tokens_buf[n++] = token; + } + + return (Tokens){.start = (Token *)&tokens_buf, .n = n}; +} + -- cgit v1.2.1