From e7cd0d47a603e4199b0ee7daa2434fc0db602bad Mon Sep 17 00:00:00 2001 From: Bad Diode Date: Sat, 15 Jun 2024 16:52:36 +0200 Subject: Move lexer code to lexer.c file --- Makefile | 3 - src/lexer.c | 734 ++++++++++++++++++++++++++++++++++++++++++------------------ src/lexer.h | 99 -------- src/main.c | 631 +-------------------------------------------------- 4 files changed, 532 insertions(+), 935 deletions(-) delete mode 100644 src/lexer.h diff --git a/Makefile b/Makefile index 45c1389..4e66983 100644 --- a/Makefile +++ b/Makefile @@ -43,9 +43,6 @@ $(BIN): $(SRC_MAIN) $(WATCH_SRC) $(BUILD_DIR) $(BUILD_DIR): mkdir -p $(BUILD_DIR) -tests: $(BIN) - ./$(BIN) tests/constants/numbers.bdl - run: $(BIN) $(BIN) tests/literals.bad diff --git a/src/lexer.c b/src/lexer.c index a6d7c74..df998f2 100644 --- a/src/lexer.c +++ b/src/lexer.c @@ -1,130 +1,192 @@ -#include "lexer.h" -#include "errors.h" - -static const char* token_str[] = { - [TOKEN_UNKNOWN] = "UNKNOWN", - [TOKEN_LPAREN] = "LPAREN", - [TOKEN_RPAREN] = "RPAREN", - [TOKEN_LSQUARE] = "LSQUARE", - [TOKEN_RSQUARE] = "RSQUARE", - [TOKEN_LCURLY] = "LCURLY", - [TOKEN_RCURLY] = "RCURLY", - [TOKEN_NUMBER] = "NUMBER", - [TOKEN_SYMBOL] = "SYMBOL", - [TOKEN_STRING] = "STRING", - [TOKEN_NIL] = "NIL", - [TOKEN_TRUE] = "TRUE", - [TOKEN_FALSE] = "FALSE", - [TOKEN_LAMBDA] = "LAMBDA", - [TOKEN_IF] = "IF", - [TOKEN_DEF] = "DEF", - [TOKEN_SET] = "SET", - [TOKEN_FUN] = "FUN", - [TOKEN_STRUCT] = "STRUCT", - [TOKEN_ADD] = "ADD", - [TOKEN_SUB] = "SUB", - [TOKEN_MUL] = "MUL", - [TOKEN_DIV] = "DIV", - [TOKEN_MOD] = "MOD", - [TOKEN_NOT] = "NOT", - [TOKEN_AND] = "AND", - [TOKEN_OR] = "OR", - [TOKEN_EQ] = "EQ", - [TOKEN_LT] = "LT", - [TOKEN_GT] = "GT", - [TOKEN_LE] = "LE", - [TOKEN_GE] = "GE", - [TOKEN_COLON] = "COLON", - [TOKEN_DOT] = "DOT", - [TOKEN_AT] = "AT", - [TOKEN_EOF] = "EOF", -}; +#define LEXER_MEM GB(2) + +typedef enum TokenType { + TOK_UNKNOWN = 0, + + // Parentheses. + TOK_LPAREN, // ( + TOK_RPAREN, // ) + TOK_LSQUARE, // [ + TOK_RSQUARE, // ] + TOK_LCURLY, // { + TOK_RCURLY, // } + + // Basic literals. + TOK_NUMBER, + TOK_SYMBOL, + TOK_STRING, + + // Keywords. + TOK_BREAK, // break + TOK_CASE, // case + TOK_CONTINUE, // continue + TOK_FALSE, // false + TOK_FUN, // fun + TOK_IF, // if + TOK_LET, // let + TOK_MATCH, // match + TOK_NIL, // nil + TOK_RETURN, // return + TOK_SET, // set + TOK_STRUCT, // struct + TOK_TRUE, // true + TOK_WHILE, // while + + // Arithmetic ops. + TOK_ADD, // + + TOK_SUB, // - + TOK_MUL, // * + TOK_DIV, // / + TOK_MOD, // % + + // Logical ops. + TOK_NOT, // ! + TOK_AND, // && + TOK_OR, // || + TOK_EQ, // == + TOK_NOTEQ, // != + TOK_LT, // < + TOK_GT, // > + TOK_LE, // <= + TOK_GE, // >= + + // Bitwise ops. + TOK_BITNOT, // ~ + TOK_BITAND, // & + TOK_BITOR, // | + TOK_BITLSHIFT, // << + TOK_BITRSHIFT, // >> + + // Special ops. + TOK_COLON, // : + TOK_DOT, // . + TOK_AT, // @ + TOK_ASSIGN, // = + + // End of file. + TOK_EOF, +} TokenType; + +Str token_str[] = { + [TOK_UNKNOWN] = cstr("UNKNOWN"), + + // Parentheses. + [TOK_LPAREN] = cstr("LPAREN"), + [TOK_RPAREN] = cstr("RPAREN"), + [TOK_LSQUARE] = cstr("LSQUARE"), + [TOK_RSQUARE] = cstr("RSQUARE"), + [TOK_LCURLY] = cstr("LCURLY"), + [TOK_RCURLY] = cstr("RCURLY"), + + // Basic literals. + [TOK_NUMBER] = cstr("NUMBER"), + [TOK_SYMBOL] = cstr("SYMBOL"), + [TOK_STRING] = cstr("STRING"), + + // Keywords. 
+ [TOK_BREAK] = cstr("BREAK"), + [TOK_CASE] = cstr("CASE"), + [TOK_CONTINUE] = cstr("CONTINUE"), + [TOK_FALSE] = cstr("FALSE"), + [TOK_FUN] = cstr("FUN"), + [TOK_IF] = cstr("IF"), + [TOK_LET] = cstr("LET"), + [TOK_MATCH] = cstr("MATCH"), + [TOK_NIL] = cstr("NIL"), + [TOK_RETURN] = cstr("RETURN"), + [TOK_SET] = cstr("SET"), + [TOK_STRUCT] = cstr("STRUCT"), + [TOK_TRUE] = cstr("TRUE"), + [TOK_WHILE] = cstr("WHILE"), + + // Arithmetic ops. + [TOK_ADD] = cstr("ADD"), + [TOK_SUB] = cstr("SUB"), + [TOK_MUL] = cstr("MUL"), + [TOK_DIV] = cstr("DIV"), + [TOK_MOD] = cstr("MOD"), -typedef struct Keyword { - char *str; - size_t n; - TokenType token; -} Keyword; - -#define KEYWORD(STR,TOK) {(STR), sizeof(STR) - 1, (TOK)} - -static const Keyword keywords[] = { - KEYWORD("nil", TOKEN_NIL), - KEYWORD("true", TOKEN_TRUE), - KEYWORD("false", TOKEN_FALSE), - KEYWORD("lambda", TOKEN_LAMBDA), - KEYWORD("if", TOKEN_IF), - KEYWORD("def", TOKEN_DEF), - KEYWORD("set", TOKEN_SET), - KEYWORD("fun", TOKEN_FUN), - KEYWORD("struct", TOKEN_STRUCT), - KEYWORD("+", TOKEN_ADD), - KEYWORD("-", TOKEN_SUB), - KEYWORD("*", TOKEN_MUL), - KEYWORD("/", TOKEN_DIV), - KEYWORD("%", TOKEN_MOD), - KEYWORD("not", TOKEN_NOT), - KEYWORD("and", TOKEN_AND), - KEYWORD("or", TOKEN_OR), - KEYWORD("=", TOKEN_EQ), - KEYWORD("<", TOKEN_LT), - KEYWORD(">", TOKEN_GT), - KEYWORD("<=", TOKEN_LE), - KEYWORD(">=", TOKEN_GE), + // Logical ops. + [TOK_NOT] = cstr("NOT"), + [TOK_AND] = cstr("AND"), + [TOK_OR] = cstr("OR"), + [TOK_EQ] = cstr("EQ"), + [TOK_NOTEQ] = cstr("NOTEQ"), + [TOK_LT] = cstr("LT"), + [TOK_GT] = cstr("GT"), + [TOK_LE] = cstr("LE"), + [TOK_GE] = cstr("GE"), + + // Bitwise ops. + [TOK_BITNOT] = cstr("BITNOT"), + [TOK_BITAND] = cstr("BITAND"), + [TOK_BITOR] = cstr("BITOR"), + [TOK_BITLSHIFT] = cstr("BITLSHIFT"), + [TOK_BITRSHIFT] = cstr("BITRSHIFT"), + + // Special ops. + [TOK_COLON] = cstr("COLON"), + [TOK_DOT] = cstr("DOT"), + [TOK_AT] = cstr("AT"), + [TOK_ASSIGN] = cstr("ASSIGN"), + + // End of file. 
+ [TOK_EOF] = cstr("EOF"), }; -void -print_token(Token tok) { - printf("[%4ld:%-4ld] ", tok.line, tok.col); - printf("%s", token_str[tok.type]); - switch (tok.type) { - case TOKEN_NUMBER: - case TOKEN_SYMBOL: - case TOKEN_STRING: { - printf(" -> "); - sv_write(&tok.value); - } break; - default: { - } break; - } - printf("\n"); -} +typedef struct Token { + TokenType type; + Str val; + sz line; + sz col; +} Token; + +typedef struct Scanner { + Str str; + sz line; + sz col; +} Scanner; char scan_next(Scanner *scanner) { - char c = sv_next(&scanner->current); + char c = str_next(&scanner->str); if (c == '\n') { - scanner->line_number++; - scanner->col_number = 1; + scanner->line++; + scanner->col = 0; } else { - scanner->col_number++; + scanner->col++; } - scanner->offset++; return c; } -void -scan_rewind(Scanner *scanner) { - sv_rewind(&scanner->current); - scanner->offset--; +bool +scan_has_next(Scanner *scanner) { + return scanner->str.size; } char -scan_peek(const Scanner *scanner) { - return sv_peek(&scanner->current); +scan_peek(Scanner *scanner) { + return str_peek(scanner->str); } -bool -scan_has_next(const Scanner *scanner) { - return scanner->current.n != 0; +void +scan_skip_line(Scanner *scanner) { + SearchResult newline = array_find_next(scanner->str, cstr("\n")); + if (newline.found) { + scanner->str.mem += newline.pos + 1; + scanner->str.size -= newline.pos + 1; + scanner->line++; + scanner->col = 0; + } } void -skip_whitespace(Scanner *scanner) { +scan_skip_whitespace(Scanner *scanner) { while (scan_has_next(scanner)) { char c = scan_peek(scanner); switch (c) { case ' ': + case ',': // Commas are just syntactic sugar. case '\f': case '\n': case '\r': @@ -132,6 +194,10 @@ skip_whitespace(Scanner *scanner) { case '\v': { scan_next(scanner); } break; + case ';': { + // Found a comment! (skip) + scan_skip_line(scanner); + } break; default: { return; } break; @@ -140,22 +206,33 @@ skip_whitespace(Scanner *scanner) { } bool -is_delimiter(char c) { +scan_is_valid_split(char c) { switch (c) { - case EOF: - case '\0': case ';': - case '"': - case '\'': case '(': case ')': case '[': case ']': case '{': case '}': + case '+': + case '-': + case '*': + case '/': + case '%': + case '!': + case '=': + case '<': + case '>': + case '~': + case '&': + case '|': case ':': + case '.': case '@': + case '"': case ' ': + case ',': case '\f': case '\n': case '\r': @@ -167,122 +244,351 @@ is_delimiter(char c) { return false; } -TokenType -find_token_type(const StringView value) { - for (size_t i = 0; i < sizeof(keywords) / sizeof(Keyword); i++) { - StringView keyword = (StringView){keywords[i].str, keywords[i].n}; - if (sv_equal(&value, &keyword)) { - return keywords[i].token; +void +scan_skip_until_valid(Scanner *scanner) { + while (scan_has_next(scanner)) { + char c = scan_peek(scanner); + if (scan_is_valid_split(c)) { + return; } + scan_next(scanner); } - return TOKEN_SYMBOL; } -void -print_tokens(Token *tokens) { - for (size_t i = 0; i < array_size(tokens); i++) { - print_token(tokens[i]); - } +Token +emit_token(Scanner current, Scanner *scanner, TokenType t) { + Str val = current.str; + val.size = current.str.size - scanner->str.size; + val.size = val.size < 0 ? 
0 : val.size; + return (Token){ + .val = val, + .line = current.line + 1, + .col = current.col + 1, + .type = t, + }; } -Token * -tokenize(const StringView *sv) { - Token *tokens = NULL; - array_init(tokens, 1); - Scanner scanner = (Scanner){ - .current = *sv, - .line_number = 1, - .col_number = 1, +Token +emit_token_err(Scanner *scanner, Str err_msg) { + return (Token){ + .line = scanner->line + 1, + .col = scanner->col + 1, + .val = err_msg, + .type = TOK_UNKNOWN, }; +} - while (scan_has_next(&scanner)) { - skip_whitespace(&scanner); - size_t line = scanner.line_number; - size_t col = scanner.col_number; - size_t offset = scanner.offset; - Token token = (Token){ - .type = TOKEN_UNKNOWN, - .line = line, - .col = col, - }; - char c = scan_next(&scanner); - switch (c) { - case ';': { - while ((c = scan_next(&scanner)) != '\n' && c != '\0') {} +Token +emit_token_number(Scanner *scanner) { + Scanner current = *scanner; + char c = scan_peek(scanner); + if (c == '+' || c == '-') { + scan_next(scanner); + if (str_has_prefix(scanner->str, cstr("0b")) || + str_has_prefix(scanner->str, cstr("0x"))) { + scan_skip_until_valid(scanner); + return emit_token_err( + ¤t, + cstr("malformed number: binary/hex numbers can't be signed")); + } + } + if (str_has_prefix(scanner->str, cstr("0b"))) { + scan_next(scanner); + scan_next(scanner); + while (scan_has_next(scanner)) { + c = scan_peek(scanner); + if (c == '0' || c == '1' || c == '_') { + scan_next(scanner); continue; - } break; - case '"': { - char prev = c; - bool found = false; - size_t n = 0; - while (scan_has_next(&scanner)) { - c = scan_next(&scanner); - if (c == '"' && prev != '\\') { - found = true; - break; - } - prev = c; - n++; - } - if (!found) { - push_error(ERR_TYPE_LEXER, ERR_UNMATCHED_STRING, line, col); - return tokens; - } - token.value = (StringView){ - .start = &sv->start[offset + 1], - .n = n, - }; - token.type = TOKEN_STRING; - } break; - case '(': { token.type = TOKEN_LPAREN; } break; - case ')': { token.type = TOKEN_RPAREN; } break; - case '[': { token.type = TOKEN_LSQUARE; } break; - case ']': { token.type = TOKEN_RSQUARE; } break; - case '{': { token.type = TOKEN_LCURLY; } break; - case '}': { token.type = TOKEN_RCURLY; } break; - case ':': { token.type = TOKEN_COLON; } break; - case '.': { token.type = TOKEN_DOT; } break; - case '@': { token.type = TOKEN_AT; } break; - default: { - if (c == EOF || c == '\0') { - token.type = TOKEN_EOF; - break; - } - size_t n = 1; - bool num = c == '-' && !is_delimiter(scan_peek(&scanner)); - num = num || (c == '+' && !is_delimiter(scan_peek(&scanner))); - num = num || (c >= '0' && c <= '9'); - if (num) { - while (!is_delimiter(scan_peek(&scanner))) { - c = scan_next(&scanner); - n++; - } - token.value = (StringView){ - .start = &sv->start[offset], - .n = n, - }; - token.type = TOKEN_NUMBER; - } else { - while (!is_delimiter(scan_peek(&scanner))) { - if (scan_peek(&scanner) == '.') { - break; - } - c = scan_next(&scanner); - n++; - } - token.value = (StringView){ - .start = &sv->start[offset], - .n = n, - }; - token.type = find_token_type(token.value); - } - } break; + } + if (scan_is_valid_split(c)) { + return emit_token(current, scanner, TOK_NUMBER); + } + scan_skip_until_valid(scanner); + return emit_token_err( + ¤t, cstr("malformed number: invalid binary number")); + } + } else if (str_has_prefix(scanner->str, cstr("0x"))) { + scan_next(scanner); + scan_next(scanner); + while (scan_has_next(scanner)) { + c = scan_peek(scanner); + if ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || + (c >= 
'A' && c <= 'F') || c == '_') { + scan_next(scanner); + continue; + } + if (scan_is_valid_split(c)) { + return emit_token(current, scanner, TOK_NUMBER); + } + scan_skip_until_valid(scanner); + return emit_token_err(¤t, + cstr("malformed number: invalid hex number")); + } + } else { + // Integral. + while (scan_has_next(scanner)) { + c = scan_peek(scanner); + if (c == '.') { + scan_next(scanner); + break; + } + if ((c >= '0' && c <= '9') || c == '_') { + scan_next(scanner); + continue; + } + if (scan_is_valid_split(c)) { + return emit_token(current, scanner, TOK_NUMBER); + } + scan_skip_until_valid(scanner); + return emit_token_err(¤t, cstr("malformed number")); + } + c = scan_peek(scanner); + if (!(c >= '0' && c <= '9')) { + return emit_token_err(¤t, + cstr("malformed number: no decimal digits")); + } + // Decimals. + while (scan_has_next(scanner)) { + c = scan_peek(scanner); + if (c == 'e' || c == 'E') { + scan_next(scanner); + break; + } + if ((c >= '0' && c <= '9') || c == '_') { + scan_next(scanner); + continue; + } + if (scan_is_valid_split(c)) { + return emit_token(current, scanner, TOK_NUMBER); + } + scan_skip_until_valid(scanner); + return emit_token_err(¤t, cstr("malformed number")); } - if (token.type == TOKEN_UNKNOWN) { - push_error(ERR_TYPE_LEXER, ERR_UNKNOWN_TOK_TYPE, line, col); - return tokens; + // Exponent. + c = scan_peek(scanner); + if (c == '+' || c == '-') { + scan_next(scanner); } - array_push(tokens, token); + while (scan_has_next(scanner)) { + c = scan_peek(scanner); + if ((c >= '0' && c <= '9') || c == '_') { + scan_next(scanner); + continue; + } + if (c == '.') { + scan_next(scanner); + return emit_token_err( + ¤t, + cstr("malformed number: decimals not allowed on exponent")); + } + if (scan_is_valid_split(c)) { + return emit_token(current, scanner, TOK_NUMBER); + } + scan_skip_until_valid(scanner); + return emit_token_err(¤t, cstr("malformed number")); + } + } + return emit_token_err(¤t, cstr("malformed number")); +} + +Token +scan_token(Scanner *scanner) { + assert(scanner); + + scan_skip_whitespace(scanner); + if (!scan_has_next(scanner)) { + return emit_token(*scanner, scanner, TOK_EOF); + } + + Scanner current = *scanner; + char c = scan_next(scanner); + switch (c) { + case '(': + return emit_token(current, scanner, TOK_LPAREN); + case ')': + return emit_token(current, scanner, TOK_RPAREN); + case '[': + return emit_token(current, scanner, TOK_LSQUARE); + case ']': + return emit_token(current, scanner, TOK_RSQUARE); + case '{': + return emit_token(current, scanner, TOK_LCURLY); + case '}': + return emit_token(current, scanner, TOK_RCURLY); + case '+': { + char p = scan_peek(scanner); + if (p >= '0' && p <= '9') { + *scanner = current; + return emit_token_number(scanner); + } + return emit_token(current, scanner, TOK_ADD); + }; + case '-': { + char p = scan_peek(scanner); + if (p >= '0' && p <= '9') { + *scanner = current; + return emit_token_number(scanner); + } + return emit_token(current, scanner, TOK_ADD); + }; + case '*': + return emit_token(current, scanner, TOK_MUL); + case '/': + return emit_token(current, scanner, TOK_DIV); + case '%': + return emit_token(current, scanner, TOK_MOD); + case '!': { + if (scan_peek(scanner) == '=') { + scan_next(scanner); + return emit_token(current, scanner, TOK_NOTEQ); + } + return emit_token(current, scanner, TOK_NOT); + }; + case '=': { + if (scan_peek(scanner) == '=') { + scan_next(scanner); + return emit_token(current, scanner, TOK_EQ); + } + return emit_token(current, scanner, TOK_ASSIGN); + }; + case '<': { + 
char p = scan_peek(scanner); + if (p == '=') { + scan_next(scanner); + return emit_token(current, scanner, TOK_LE); + } + if (p == '<') { + scan_next(scanner); + return emit_token(current, scanner, TOK_BITLSHIFT); + } + return emit_token(current, scanner, TOK_LT); + }; + case '>': { + char p = scan_peek(scanner); + if (p == '=') { + scan_next(scanner); + return emit_token(current, scanner, TOK_GE); + } + if (p == '>') { + scan_next(scanner); + return emit_token(current, scanner, TOK_BITRSHIFT); + } + return emit_token(current, scanner, TOK_GT); + }; + case '~': + return emit_token(current, scanner, TOK_BITNOT); + case '&': { + if (scan_peek(scanner) == '&') { + scan_next(scanner); + return emit_token(current, scanner, TOK_AND); + } + return emit_token(current, scanner, TOK_BITAND); + }; + case '|': { + if (scan_peek(scanner) == '|') { + scan_next(scanner); + return emit_token(current, scanner, TOK_OR); + } + return emit_token(current, scanner, TOK_BITOR); + }; + case ':': + return emit_token(current, scanner, TOK_COLON); + case '.': + return emit_token(current, scanner, TOK_DOT); + case '@': + return emit_token(current, scanner, TOK_AT); + case '"': { + while (scan_has_next(scanner)) { + c = scan_next(scanner); + if (c == '\\') { + scan_next(scanner); + continue; + } + if (c == '"') { + return emit_token(current, scanner, TOK_STRING); + } + } + return emit_token_err(¤t, cstr("mismatched string quotes")); + }; + } + if (c >= '0' && c <= '9') { + *scanner = current; + return emit_token_number(scanner); + } + + scan_skip_until_valid(scanner); + Str val = current.str; + val.size = current.str.size - scanner->str.size; + val.size = val.size < 0 ? 0 : val.size; + if (val.size == 0) { + return emit_token_err(¤t, cstr("unexpected character")); + } + switch (val.mem[0]) { + case 'b': { + if (str_has_prefix(val, cstr("break"))) { + return emit_token(current, scanner, TOK_BREAK); + } + } break; + case 'c': { + if (str_has_prefix(val, cstr("case"))) { + return emit_token(current, scanner, TOK_CASE); + } + if (str_has_prefix(val, cstr("continue"))) { + return emit_token(current, scanner, TOK_CONTINUE); + } + } break; + case 'f': { + if (str_has_prefix(val, cstr("false"))) { + return emit_token(current, scanner, TOK_FALSE); + } + if (str_has_prefix(val, cstr("fun"))) { + return emit_token(current, scanner, TOK_FUN); + } + } break; + case 'i': { + if (str_has_prefix(val, cstr("if"))) { + return emit_token(current, scanner, TOK_IF); + } + } break; + case 'l': { + if (str_has_prefix(val, cstr("let"))) { + return emit_token(current, scanner, TOK_LET); + } + } break; + case 'm': { + if (str_has_prefix(val, cstr("match"))) { + return emit_token(current, scanner, TOK_MATCH); + } + } break; + case 'n': { + if (str_has_prefix(val, cstr("nil"))) { + return emit_token(current, scanner, TOK_NIL); + } + } break; + case 'r': { + if (str_has_prefix(val, cstr("return"))) { + return emit_token(current, scanner, TOK_RETURN); + } + } break; + case 's': { + if (str_has_prefix(val, cstr("set"))) { + return emit_token(current, scanner, TOK_SET); + } + if (str_has_prefix(val, cstr("struct"))) { + return emit_token(current, scanner, TOK_STRUCT); + } + } break; + case 't': { + if (str_has_prefix(val, cstr("true"))) { + return emit_token(current, scanner, TOK_TRUE); + } + } break; + case 'w': { + if (str_has_prefix(val, cstr("while"))) { + return emit_token(current, scanner, TOK_WHILE); + } + } break; } - return tokens; + return emit_token(current, scanner, TOK_SYMBOL); } diff --git a/src/lexer.h b/src/lexer.h deleted file mode 
100644 index 949abaf..0000000 --- a/src/lexer.h +++ /dev/null @@ -1,99 +0,0 @@ -#ifndef BDL_LEXER_H -#define BDL_LEXER_H - -#include "string_view.h" - -typedef enum TokenType { - TOKEN_UNKNOWN = 0, - - // Parentheses. - TOKEN_LPAREN, - TOKEN_RPAREN, - TOKEN_LSQUARE, - TOKEN_RSQUARE, - TOKEN_LCURLY, - TOKEN_RCURLY, - - // Primitive types. - TOKEN_NUMBER, - TOKEN_SYMBOL, - TOKEN_STRING, - TOKEN_NIL, - TOKEN_TRUE, - TOKEN_FALSE, - - // Keywords. - TOKEN_LAMBDA, - TOKEN_IF, - TOKEN_DEF, - TOKEN_SET, - TOKEN_FUN, - TOKEN_STRUCT, - - // Arithmetic ops. - TOKEN_ADD, - TOKEN_SUB, - TOKEN_MUL, - TOKEN_DIV, - TOKEN_MOD, - - // Boolean operations. - TOKEN_NOT, - TOKEN_AND, - TOKEN_OR, - TOKEN_EQ, - TOKEN_LT, - TOKEN_GT, - TOKEN_LE, - TOKEN_GE, - - // Special operators. - TOKEN_COLON, - TOKEN_DOT, - TOKEN_AT, - - // End of file. - TOKEN_EOF, -} TokenType; - -typedef struct Token { - TokenType type; - StringView value; - size_t line; - size_t col; -} Token; - -typedef struct Scanner { - StringView current; - size_t line_number; - size_t col_number; - size_t offset; -} Scanner; - -// Print a token to standard output for debugging purposes. -void print_token(Token tok); - -// Same functionality as with StringView, but keeping track of line and column -// numbers. -char scan_next(Scanner *scanner); -char scan_peek(const Scanner *scanner); - -// Check if the current scanner still have characters left. -bool scan_has_next(const Scanner *scanner); - -// Advance the scanner until we ran out of whitespace. -void skip_whitespace(Scanner *scanner); - -// Check if a given character is a delimiter. -bool is_delimiter(char c); - -// Extract the token type from the current string. -TokenType find_token_type(const StringView value); - -// Generate a list of tokens from the given string. -Token * tokenize(const StringView *sv); - -// Display tokens from token list. -void print_tokens(Token *tokens); - -#endif // BDL_LEXER_H diff --git a/src/main.c b/src/main.c index edd70aa..9848b8b 100644 --- a/src/main.c +++ b/src/main.c @@ -3,6 +3,7 @@ #include #include "badlib.h" +#include "lexer.c" typedef enum ExecMode { RUN_NORMAL, @@ -14,607 +15,11 @@ typedef enum ExecMode { static ExecMode mode = RUN_NORMAL; -#define LEXER_MEM GB(2) - void init(void) { log_init_default(); } -typedef enum TokenType { - TOK_UNKNOWN = 0, - - // Parentheses. - TOK_LPAREN, // ( - TOK_RPAREN, // ) - TOK_LSQUARE, // [ - TOK_RSQUARE, // ] - TOK_LCURLY, // { - TOK_RCURLY, // } - - // Basic literals. - TOK_NUMBER, - TOK_SYMBOL, - TOK_STRING, - - // Keywords. - TOK_BREAK, // break - TOK_CASE, // case - TOK_CONTINUE, // continue - TOK_FALSE, // false - TOK_FUN, // fun - TOK_IF, // if - TOK_LET, // let - TOK_MATCH, // match - TOK_NIL, // nil - TOK_RETURN, // return - TOK_SET, // set - TOK_STRUCT, // struct - TOK_TRUE, // true - TOK_WHILE, // while - - // Arithmetic ops. - TOK_ADD, // + - TOK_SUB, // - - TOK_MUL, // * - TOK_DIV, // / - TOK_MOD, // % - - // Logical ops. - TOK_NOT, // ! - TOK_AND, // && - TOK_OR, // || - TOK_EQ, // == - TOK_NOTEQ, // != - TOK_LT, // < - TOK_GT, // > - TOK_LE, // <= - TOK_GE, // >= - - // Bitwise ops. - TOK_BITNOT, // ~ - TOK_BITAND, // & - TOK_BITOR, // | - TOK_BITLSHIFT, // << - TOK_BITRSHIFT, // >> - - // Special ops. - TOK_COLON, // : - TOK_DOT, // . - TOK_AT, // @ - TOK_ASSIGN, // = - - // End of file. - TOK_EOF, -} TokenType; - -Str token_str[] = { - [TOK_UNKNOWN] = cstr("UNKNOWN"), - - // Parentheses. 
- [TOK_LPAREN] = cstr("LPAREN"), - [TOK_RPAREN] = cstr("RPAREN"), - [TOK_LSQUARE] = cstr("LSQUARE"), - [TOK_RSQUARE] = cstr("RSQUARE"), - [TOK_LCURLY] = cstr("LCURLY"), - [TOK_RCURLY] = cstr("RCURLY"), - - // Basic literals. - [TOK_NUMBER] = cstr("NUMBER"), - [TOK_SYMBOL] = cstr("SYMBOL"), - [TOK_STRING] = cstr("STRING"), - - // Keywords. - [TOK_BREAK] = cstr("BREAK"), - [TOK_CASE] = cstr("CASE"), - [TOK_CONTINUE] = cstr("CONTINUE"), - [TOK_FALSE] = cstr("FALSE"), - [TOK_FUN] = cstr("FUN"), - [TOK_IF] = cstr("IF"), - [TOK_LET] = cstr("LET"), - [TOK_MATCH] = cstr("MATCH"), - [TOK_NIL] = cstr("NIL"), - [TOK_RETURN] = cstr("RETURN"), - [TOK_SET] = cstr("SET"), - [TOK_STRUCT] = cstr("STRUCT"), - [TOK_TRUE] = cstr("TRUE"), - [TOK_WHILE] = cstr("WHILE"), - - // Arithmetic ops. - [TOK_ADD] = cstr("ADD"), - [TOK_SUB] = cstr("SUB"), - [TOK_MUL] = cstr("MUL"), - [TOK_DIV] = cstr("DIV"), - [TOK_MOD] = cstr("MOD"), - - // Logical ops. - [TOK_NOT] = cstr("NOT"), - [TOK_AND] = cstr("AND"), - [TOK_OR] = cstr("OR"), - [TOK_EQ] = cstr("EQ"), - [TOK_NOTEQ] = cstr("NOTEQ"), - [TOK_LT] = cstr("LT"), - [TOK_GT] = cstr("GT"), - [TOK_LE] = cstr("LE"), - [TOK_GE] = cstr("GE"), - - // Bitwise ops. - [TOK_BITNOT] = cstr("BITNOT"), - [TOK_BITAND] = cstr("BITAND"), - [TOK_BITOR] = cstr("BITOR"), - [TOK_BITLSHIFT] = cstr("BITLSHIFT"), - [TOK_BITRSHIFT] = cstr("BITRSHIFT"), - - // Special ops. - [TOK_COLON] = cstr("COLON"), - [TOK_DOT] = cstr("DOT"), - [TOK_AT] = cstr("AT"), - [TOK_ASSIGN] = cstr("ASSIGN"), - - // End of file. - [TOK_EOF] = cstr("EOF"), -}; - -typedef struct Token { - TokenType type; - Str val; - sz line; - sz col; -} Token; - -typedef struct Scanner { - Str str; - sz line; - sz col; - Arena *storage; -} Scanner; - -char -scan_next(Scanner *scanner) { - char c = str_next(&scanner->str); - if (c == '\n') { - scanner->line++; - scanner->col = 0; - } else { - scanner->col++; - } - return c; -} - -bool -scan_has_next(Scanner *scanner) { - return scanner->str.size; -} - -char -scan_peek(Scanner *scanner) { - return str_peek(scanner->str); -} - -Token -emit_token(Scanner current, Scanner *scanner, TokenType t) { - Str val = current.str; - val.size = current.str.size - scanner->str.size; - val.size = val.size < 0 ? 0 : val.size; - return (Token){ - .val = val, - .line = current.line + 1, - .col = current.col + 1, - .type = t, - }; -} - -Token -emit_token_err(Scanner *scanner, Str err_msg) { - return (Token){ - .line = scanner->line + 1, - .col = scanner->col + 1, - .val = err_msg, - .type = TOK_UNKNOWN, - }; -} - -void -scan_skip_line(Scanner *scanner) { - SearchResult newline = array_find_next(scanner->str, cstr("\n")); - if (newline.found) { - scanner->str.mem += newline.pos + 1; - scanner->str.size -= newline.pos + 1; - scanner->line++; - scanner->col = 0; - } -} - -void -scan_skip_whitespace(Scanner *scanner) { - while (scan_has_next(scanner)) { - char c = scan_peek(scanner); - switch (c) { - case ' ': - case ',': // Commas are just syntactic sugar. - case '\f': - case '\n': - case '\r': - case '\t': - case '\v': { - scan_next(scanner); - } break; - case ';': { - // Found a comment! 
(skip) - scan_skip_line(scanner); - } break; - default: { - return; - } break; - } - } -} - -bool -is_valid_split(char c) { - switch (c) { - case ';': - case '(': - case ')': - case '[': - case ']': - case '{': - case '}': - case '+': - case '-': - case '*': - case '/': - case '%': - case '!': - case '=': - case '<': - case '>': - case '~': - case '&': - case '|': - case ':': - case '.': - case '@': - case '"': - case ' ': - case ',': - case '\f': - case '\n': - case '\r': - case '\t': - case '\v': { - return true; - } break; - } - return false; -} - -void -scan_skip_until_valid(Scanner *scanner) { - while (scan_has_next(scanner)) { - char c = scan_peek(scanner); - if (is_valid_split(c)) { - return; - } - scan_next(scanner); - } -} - -Token -emit_token_number(Scanner *scanner) { - Scanner current = *scanner; - char c = scan_peek(scanner); - if (c == '+' || c == '-') { - scan_next(scanner); - if (str_has_prefix(scanner->str, cstr("0b")) || - str_has_prefix(scanner->str, cstr("0x"))) { - scan_skip_until_valid(scanner); - return emit_token_err( - ¤t, - cstr("malformed number: binary/hex numbers can't be signed")); - } - } - if (str_has_prefix(scanner->str, cstr("0b"))) { - scan_next(scanner); - scan_next(scanner); - while (scan_has_next(scanner)) { - c = scan_peek(scanner); - if (c == '0' || c == '1' || c == '_') { - scan_next(scanner); - continue; - } - if (is_valid_split(c)) { - return emit_token(current, scanner, TOK_NUMBER); - } - scan_skip_until_valid(scanner); - return emit_token_err( - ¤t, cstr("malformed number: invalid binary number")); - } - } else if (str_has_prefix(scanner->str, cstr("0x"))) { - scan_next(scanner); - scan_next(scanner); - while (scan_has_next(scanner)) { - c = scan_peek(scanner); - if ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || - (c >= 'A' && c <= 'F') || c == '_') { - scan_next(scanner); - continue; - } - if (is_valid_split(c)) { - return emit_token(current, scanner, TOK_NUMBER); - } - scan_skip_until_valid(scanner); - return emit_token_err(¤t, - cstr("malformed number: invalid hex number")); - } - } else { - // Integral. - while (scan_has_next(scanner)) { - c = scan_peek(scanner); - if (c == '.') { - scan_next(scanner); - break; - } - if ((c >= '0' && c <= '9') || c == '_') { - scan_next(scanner); - continue; - } - if (is_valid_split(c)) { - return emit_token(current, scanner, TOK_NUMBER); - } - scan_skip_until_valid(scanner); - return emit_token_err(¤t, cstr("malformed number")); - } - c = scan_peek(scanner); - if (!(c >= '0' && c <= '9')) { - return emit_token_err(¤t, - cstr("malformed number: no decimal digits")); - } - // Decimals. - while (scan_has_next(scanner)) { - c = scan_peek(scanner); - if (c == 'e' || c == 'E') { - scan_next(scanner); - break; - } - if ((c >= '0' && c <= '9') || c == '_') { - scan_next(scanner); - continue; - } - if (is_valid_split(c)) { - return emit_token(current, scanner, TOK_NUMBER); - } - scan_skip_until_valid(scanner); - return emit_token_err(¤t, cstr("malformed number")); - } - // Exponent. 
- c = scan_peek(scanner); - if (c == '+' || c == '-') { - scan_next(scanner); - } - while (scan_has_next(scanner)) { - c = scan_peek(scanner); - if ((c >= '0' && c <= '9') || c == '_') { - scan_next(scanner); - continue; - } - if (c == '.') { - scan_next(scanner); - return emit_token_err( - ¤t, - cstr("malformed number: decimals not allowed on exponent")); - } - if (is_valid_split(c)) { - return emit_token(current, scanner, TOK_NUMBER); - } - scan_skip_until_valid(scanner); - return emit_token_err(¤t, cstr("malformed number")); - } - } - return emit_token_err(¤t, cstr("malformed number")); -} - -Token -scan_token(Scanner *scanner) { - assert(scanner); - - scan_skip_whitespace(scanner); - if (!scan_has_next(scanner)) { - return emit_token(*scanner, scanner, TOK_EOF); - } - - Scanner current = *scanner; - char c = scan_next(scanner); - switch (c) { - case '(': - return emit_token(current, scanner, TOK_LPAREN); - case ')': - return emit_token(current, scanner, TOK_RPAREN); - case '[': - return emit_token(current, scanner, TOK_LSQUARE); - case ']': - return emit_token(current, scanner, TOK_RSQUARE); - case '{': - return emit_token(current, scanner, TOK_LCURLY); - case '}': - return emit_token(current, scanner, TOK_RCURLY); - case '+': { - char p = scan_peek(scanner); - if (p >= '0' && p <= '9') { - *scanner = current; - return emit_token_number(scanner); - } - return emit_token(current, scanner, TOK_ADD); - }; - case '-': { - char p = scan_peek(scanner); - if (p >= '0' && p <= '9') { - *scanner = current; - return emit_token_number(scanner); - } - return emit_token(current, scanner, TOK_ADD); - }; - case '*': - return emit_token(current, scanner, TOK_MUL); - case '/': - return emit_token(current, scanner, TOK_DIV); - case '%': - return emit_token(current, scanner, TOK_MOD); - case '!': { - if (scan_peek(scanner) == '=') { - scan_next(scanner); - return emit_token(current, scanner, TOK_NOTEQ); - } - return emit_token(current, scanner, TOK_NOT); - }; - case '=': { - if (scan_peek(scanner) == '=') { - scan_next(scanner); - return emit_token(current, scanner, TOK_EQ); - } - return emit_token(current, scanner, TOK_ASSIGN); - }; - case '<': { - char p = scan_peek(scanner); - if (p == '=') { - scan_next(scanner); - return emit_token(current, scanner, TOK_LE); - } - if (p == '<') { - scan_next(scanner); - return emit_token(current, scanner, TOK_BITLSHIFT); - } - return emit_token(current, scanner, TOK_LT); - }; - case '>': { - char p = scan_peek(scanner); - if (p == '=') { - scan_next(scanner); - return emit_token(current, scanner, TOK_GE); - } - if (p == '>') { - scan_next(scanner); - return emit_token(current, scanner, TOK_BITRSHIFT); - } - return emit_token(current, scanner, TOK_GT); - }; - case '~': - return emit_token(current, scanner, TOK_BITNOT); - case '&': { - if (scan_peek(scanner) == '&') { - scan_next(scanner); - return emit_token(current, scanner, TOK_AND); - } - return emit_token(current, scanner, TOK_BITAND); - }; - case '|': { - if (scan_peek(scanner) == '|') { - scan_next(scanner); - return emit_token(current, scanner, TOK_OR); - } - return emit_token(current, scanner, TOK_BITOR); - }; - case ':': - return emit_token(current, scanner, TOK_COLON); - case '.': - return emit_token(current, scanner, TOK_DOT); - case '@': - return emit_token(current, scanner, TOK_AT); - case '"': { - while (scan_has_next(scanner)) { - c = scan_next(scanner); - if (c == '\\') { - scan_next(scanner); - continue; - } - if (c == '"') { - return emit_token(current, scanner, TOK_STRING); - } - } - return 
emit_token_err(¤t, cstr("mismatched string quotes")); - }; - } - if (c >= '0' && c <= '9') { - *scanner = current; - return emit_token_number(scanner); - } - - scan_skip_until_valid(scanner); - Str val = current.str; - val.size = current.str.size - scanner->str.size; - val.size = val.size < 0 ? 0 : val.size; - if (val.size == 0) { - return emit_token_err(¤t, cstr("unexpected character")); - } - switch (val.mem[0]) { - case 'b': { - if (str_has_prefix(val, cstr("break"))) { - return emit_token(current, scanner, TOK_BREAK); - } - } break; - case 'c': { - if (str_has_prefix(val, cstr("case"))) { - return emit_token(current, scanner, TOK_CASE); - } - if (str_has_prefix(val, cstr("continue"))) { - return emit_token(current, scanner, TOK_CONTINUE); - } - } break; - case 'f': { - if (str_has_prefix(val, cstr("false"))) { - return emit_token(current, scanner, TOK_FALSE); - } - if (str_has_prefix(val, cstr("fun"))) { - return emit_token(current, scanner, TOK_FUN); - } - } break; - case 'i': { - if (str_has_prefix(val, cstr("if"))) { - return emit_token(current, scanner, TOK_IF); - } - } break; - case 'l': { - if (str_has_prefix(val, cstr("let"))) { - return emit_token(current, scanner, TOK_LET); - } - } break; - case 'm': { - if (str_has_prefix(val, cstr("match"))) { - return emit_token(current, scanner, TOK_MATCH); - } - } break; - case 'n': { - if (str_has_prefix(val, cstr("nil"))) { - return emit_token(current, scanner, TOK_NIL); - } - } break; - case 'r': { - if (str_has_prefix(val, cstr("return"))) { - return emit_token(current, scanner, TOK_RETURN); - } - } break; - case 's': { - if (str_has_prefix(val, cstr("set"))) { - return emit_token(current, scanner, TOK_SET); - } - if (str_has_prefix(val, cstr("struct"))) { - return emit_token(current, scanner, TOK_STRUCT); - } - } break; - case 't': { - if (str_has_prefix(val, cstr("true"))) { - return emit_token(current, scanner, TOK_TRUE); - } - } break; - case 'w': { - if (str_has_prefix(val, cstr("while"))) { - return emit_token(current, scanner, TOK_WHILE); - } - } break; - } - return emit_token(current, scanner, TOK_SYMBOL); -} - void process_file(Str path) { Arena lexer_arena = arena_create(LEXER_MEM, os_allocator); @@ -628,36 +33,24 @@ process_file(Str path) { Scanner scanner = { .str = file.data, - .storage = &lexer_arena, }; Token tok = {0}; + sz errors = 0; while (tok.type != TOK_EOF) { tok = scan_token(&scanner); - eprintln("%s:%d:%d:%s %s", path, tok.line, tok.col, token_str[tok.type], - tok.val); + if (tok.type == TOK_UNKNOWN) { + eprintln("%s:%d:%d:%s %s", path, tok.line, tok.col, + token_str[tok.type], tok.val); + errors++; + } } - // while (true) { - // Token tok = scan_token(&scanner); - // println("%s:%d:%d:%s %s", path, tok.line, tok.col, - // token_str[tok.type], - // tok.val); - // if (tok.type == TOK_EOF) break; - // } - // Str scanner = file.data; - // // NOTE: Testing file read line by line. - // for (sz i = 0; scanner.size != 0; i++) { - // Str line = str_split(&scanner, cstr("\n")); - // println("%x{4} %s", i + 1, line); - // } - - // println("<<< %x{4} %b{4} %f{2} %s %{Arena} >>>", 123, 3, 1.345, - // cstr("BOOM!"), &logger_inf.storage); + // Only proceed if there are no errors. + if (errors) { + goto stop; + } - // println("%{Mem}", &(Array){lexer_arena.beg, lexer_arena.size}); - // eprintln("%s:%d:%d: %s -> %c", path, 1, 1, cstr("error: testing string - // logger"), 'X'); while (true) {} - // TODO: run lexer. +stop: // Free up resources. arena_destroy(&lexer_arena, os_allocator); } -- cgit v1.2.1