#include <stdio.h>

#include "lexer.h"
#include "errors.h"

// Human-readable names for each TokenType, indexed by enum value.
// Keep in sync with the TokenType enum in lexer.h.
static const char* token_str[] = {
    [TOKEN_UNKNOWN] = "TOKEN_UNKNOWN",
    [TOKEN_LPAREN]  = "TOKEN_LPAREN",
    [TOKEN_RPAREN]  = "TOKEN_RPAREN",
    [TOKEN_LSQUARE] = "TOKEN_LSQUARE",
    [TOKEN_RSQUARE] = "TOKEN_RSQUARE",
    [TOKEN_LCURLY]  = "TOKEN_LCURLY",
    [TOKEN_RCURLY]  = "TOKEN_RCURLY",
    [TOKEN_NUMBER]  = "TOKEN_NUMBER",
    [TOKEN_SYMBOL]  = "TOKEN_SYMBOL",
    [TOKEN_STRING]  = "TOKEN_STRING",
    [TOKEN_NIL]     = "TOKEN_NIL",
    [TOKEN_TRUE]    = "TOKEN_TRUE",
    [TOKEN_FALSE]   = "TOKEN_FALSE",
    [TOKEN_LAMBDA]  = "TOKEN_LAMBDA",
    [TOKEN_IF]      = "TOKEN_IF",
    [TOKEN_DEF]     = "TOKEN_DEF",
    [TOKEN_SET]     = "TOKEN_SET",
    [TOKEN_FUN]     = "TOKEN_FUN",
    [TOKEN_STRUCT]  = "TOKEN_STRUCT",
    [TOKEN_COLON]   = "TOKEN_COLON",
    [TOKEN_DOT]     = "TOKEN_DOT",
    [TOKEN_AT]      = "TOKEN_AT",
    [TOKEN_EOF]     = "TOKEN_EOF",
};

// Print a single token to stdout: "[line:col] TYPE" plus the raw text for
// number/symbol/string tokens. Debugging aid.
void print_token(Token tok) {
    // Cast to size_t and use %zu: the line/col fields are filled from
    // size_t values in tokenize(), and the original "%ld" is undefined
    // behavior on platforms where size_t is not long (e.g. 64-bit Windows).
    printf("[%4zu:%-4zu] ", (size_t)tok.line, (size_t)tok.col);
    printf("%s", token_str[tok.type]);
    switch (tok.type) {
        case TOKEN_NUMBER:
        case TOKEN_SYMBOL:
        case TOKEN_STRING: {
            printf(" -> ");
            sv_write(&tok.value);
        } break;
        default: {
        } break;
    }
    printf("\n");
}

// Consume and return the next character, updating line/column/offset
// bookkeeping. A '\n' advances the line counter and resets the column.
char scan_next(Scanner *scanner) {
    char c = sv_next(&scanner->current);
    if (c == '\n') {
        scanner->line_number++;
        scanner->col_number = 1;
    } else {
        scanner->col_number++;
    }
    scanner->offset++;
    return c;
}

// Push the last consumed character back onto the scanner.
void scan_rewind(Scanner *scanner) {
    sv_rewind(&scanner->current);
    scanner->offset--;
    // Keep the column in sync with the offset; previously only the offset
    // was rewound, drifting column reporting by +1 after every rewind.
    // The only rewound character is a digit (see tokenize), never '\n',
    // so undoing the column increment is always correct.
    scanner->col_number--;
}

// Peek at the next character without consuming it.
char scan_peek(const Scanner *scanner) {
    return sv_peek(&scanner->current);
}

// True while unread characters remain in the view.
bool scan_has_next(const Scanner *scanner) {
    return scanner->current.n != 0;
}

// Advance the scanner past any run of whitespace.
void skip_whitespace(Scanner *scanner) {
    while (scan_has_next(scanner)) {
        switch (scan_peek(scanner)) {
            case ' ':
            case '\f':
            case '\n':
            case '\r':
            case '\t':
            case '\v': {
                scan_next(scanner);
            } break;
            default: {
                return;
            } break;
        }
    }
}

// A delimiter terminates a number or symbol token: whitespace, the
// punctuation characters that form their own tokens, '"' and ';', or end
// of input.
bool is_delimiter(char c) {
    switch (c) {
        // NOTE(review): `case EOF` only matches when plain char is signed
        // (EOF is -1); kept for parity with the EOF checks in tokenize.
        case EOF:
        case '\0':
        case ';':
        case '"':
        case '\'':
        case '(':
        case ')':
        case '[':
        case ']':
        case '{':
        case '}':
        case ':':
        case '@':
        case ' ':
        case '\f':
        case '\n':
        case '\r':
        case '\t':
        case '\v': {
            return true;
        } break;
    }
    return false;
}

// True when VAL exactly matches the string literal KEYWORD.
#define TOKEN_IS_KEYWORD(VAL, KEYWORD) \
    sv_equal(&(VAL), &(StringView){.start = (KEYWORD), .n = sizeof(KEYWORD) - 1})

// Scan a number whose first character has already been consumed by the
// caller. Accepts decimal integers and floats (a single '.'), plus "0x"
// hex and "0b" binary integer constants. Returns the number of characters
// in the token, or 0 if the token is malformed.
size_t scan_number_token(Scanner *scanner) {
    char first = scan_next(scanner);
    char second = scan_peek(scanner);
    size_t n = 1;
    if (first == '0' && !is_delimiter(second)) {
        if (second == 'x') {
            // Hex constant: "0x" must be followed by at least one hex digit.
            scan_next(scanner);
            n++;
            if (is_delimiter(scan_peek(scanner))) {
                return 0;
            }
            while (!is_delimiter(scan_peek(scanner))) {
                char c = scan_next(scanner);
                if (!(c >= '0' && c <= '9') && !(c >= 'a' && c <= 'f') &&
                    !(c >= 'A' && c <= 'F')) {
                    return 0;
                }
                n++;
            }
            return n;
        } else if (second == 'b') {
            // Binary constant: "0b" must be followed by at least one bit.
            scan_next(scanner);
            n++;
            if (is_delimiter(scan_peek(scanner))) {
                return 0;
            }
            while (!is_delimiter(scan_peek(scanner))) {
                char c = scan_next(scanner);
                if (!(c == '0' || c == '1')) {
                    return 0;
                }
                n++;
            }
            // Previously this fell through into the decimal loop below.
            // That was harmless only by accident (the peek is a delimiter
            // here); return explicitly instead.
            return n;
        }
    }
    // Decimal integer or float. A leading '.' is reachable via the '-'
    // path in tokenize (e.g. "-.5"); count it so a second dot is rejected.
    // Any other non-digit first character is malformed — previously inputs
    // like "-a" slipped through unvalidated.
    bool has_dot = (first == '.');
    if (!has_dot && !(first >= '0' && first <= '9')) {
        return 0;
    }
    while (!is_delimiter(scan_peek(scanner))) {
        char c = scan_next(scanner);
        if (c == '.') {
            if (has_dot) {
                return 0;
            }
            has_dot = true;
        } else if (!(c >= '0' && c <= '9')) {
            return 0;
        }
        n++;
    }
    return n;
}

// Map a scanned word to its keyword token type, or TOKEN_SYMBOL when it
// is not a reserved word.
TokenType find_token_type(const StringView value) {
    if (TOKEN_IS_KEYWORD(value, "nil")) { return TOKEN_NIL; }
    if (TOKEN_IS_KEYWORD(value, "true")) { return TOKEN_TRUE; }
    if (TOKEN_IS_KEYWORD(value, "false")) { return TOKEN_FALSE; }
    if (TOKEN_IS_KEYWORD(value, "lambda")) { return TOKEN_LAMBDA; }
    if (TOKEN_IS_KEYWORD(value, "if")) { return TOKEN_IF; }
    if (TOKEN_IS_KEYWORD(value, "def")) { return TOKEN_DEF; }
    if (TOKEN_IS_KEYWORD(value, "set!")) { return TOKEN_SET; }
    if (TOKEN_IS_KEYWORD(value, "fun")) { return TOKEN_FUN; }
    if (TOKEN_IS_KEYWORD(value, "struct")) { return TOKEN_STRUCT; }
    return TOKEN_SYMBOL;
}

// Print every token in a dynamic token array (debugging aid).
void print_tokens(Token *tokens) {
    for (size_t i = 0; i < array_size(tokens); i++) {
        print_token(tokens[i]);
    }
}

// Tokenize the whole input view. Returns a dynamic array of tokens (the
// caller owns it); on a lexing error the error is recorded via push_error
// and the tokens scanned so far are returned. Token values are StringViews
// into the input — the input must outlive the tokens.
Token * tokenize(const StringView *sv) {
    Token *tokens = NULL;
    array_init(tokens, 1);
    Scanner scanner = (Scanner){
        .current = *sv,
        .line_number = 1,
        .col_number = 1,
    };
    while (scan_has_next(&scanner)) {
        skip_whitespace(&scanner);
        // The input may end in whitespace, in which case the scan_next
        // below would previously read past the end of the view.
        if (!scan_has_next(&scanner)) {
            break;
        }
        size_t line = scanner.line_number;
        size_t col = scanner.col_number;
        size_t offset = scanner.offset;
        Token token = (Token){
            .type = TOKEN_UNKNOWN,
            .line = line,
            .col = col,
        };
        char c = scan_next(&scanner);
        switch (c) {
            case ';': {
                // Line comment: skip to end of line. Guard with
                // scan_has_next so a comment on the final line without a
                // trailing newline cannot scan past the end of the view.
                while (scan_has_next(&scanner)) {
                    c = scan_next(&scanner);
                    if (c == '\n' || c == '\0') {
                        break;
                    }
                }
                continue;
            } break;
            case '"': {
                // String literal; the value excludes the quotes but keeps
                // escape sequences verbatim. Use an explicit escape toggle:
                // tracking only the previous character mis-handled strings
                // ending in an escaped backslash ("a\\"), where the closing
                // quote looked escaped.
                bool escaped = false;
                bool found = false;
                size_t n = 0;
                while (scan_has_next(&scanner)) {
                    c = scan_next(&scanner);
                    if (escaped) {
                        escaped = false;
                    } else if (c == '\\') {
                        escaped = true;
                    } else if (c == '"') {
                        found = true;
                        break;
                    }
                    n++;
                }
                if (!found) {
                    push_error(ERR_TYPE_LEXER, ERR_UNMATCHED_STRING, line, col);
                    return tokens;
                }
                token.value = (StringView){
                    .start = &sv->start[offset + 1],
                    .n = n,
                };
                token.type = TOKEN_STRING;
            } break;
            case '(': { token.type = TOKEN_LPAREN; } break;
            case ')': { token.type = TOKEN_RPAREN; } break;
            case '[': { token.type = TOKEN_LSQUARE; } break;
            case ']': { token.type = TOKEN_RSQUARE; } break;
            case '{': { token.type = TOKEN_LCURLY; } break;
            case '}': { token.type = TOKEN_RCURLY; } break;
            case ':': { token.type = TOKEN_COLON; } break;
            case '.': { token.type = TOKEN_DOT; } break;
            case '@': { token.type = TOKEN_AT; } break;
            default: {
                if (c == EOF || c == '\0') {
                    token.type = TOKEN_EOF;
                    break;
                }
                size_t n = 1;
                if (c == '-' && !is_delimiter(scan_peek(&scanner))) {
                    // Negative number: '-' directly followed by the literal.
                    size_t m = scan_number_token(&scanner);
                    // Check for scan failure like the non-negative branch
                    // does; previously inputs such as "-a" or "-1.2.3"
                    // produced a bogus NUMBER token instead of an error.
                    if (m == 0) {
                        push_error(ERR_TYPE_LEXER, ERR_MALFORMED_NUMBER, line, col);
                        return tokens;
                    }
                    n += m;
                    token.value = (StringView){
                        .start = &sv->start[offset],
                        .n = n,
                    };
                    token.type = TOKEN_NUMBER;
                } else if (c >= '0' && c <= '9') {
                    // Unsigned number: re-scan starting from the first digit.
                    scan_rewind(&scanner);
                    n = scan_number_token(&scanner);
                    if (n == 0) {
                        push_error(ERR_TYPE_LEXER, ERR_MALFORMED_NUMBER, line, col);
                        return tokens;
                    }
                    token.value = (StringView){
                        .start = &sv->start[offset],
                        .n = n,
                    };
                    token.type = TOKEN_NUMBER;
                } else {
                    // Symbol or keyword: read until a delimiter or '.'
                    // (so "foo.bar" lexes as SYMBOL DOT SYMBOL).
                    while (!is_delimiter(scan_peek(&scanner))) {
                        if (scan_peek(&scanner) == '.') {
                            break;
                        }
                        c = scan_next(&scanner);
                        n++;
                    }
                    token.value = (StringView){
                        .start = &sv->start[offset],
                        .n = n,
                    };
                    token.type = find_token_type(token.value);
                }
            } break;
        }
        if (token.type == TOKEN_UNKNOWN) {
            push_error(ERR_TYPE_LEXER, ERR_UNKNOWN_TOK_TYPE, line, col);
            return tokens;
        }
        array_push(tokens, token);
    }
    return tokens;
}