From ee1a5de91c875fb66724dc21c02333bfebe2a812 Mon Sep 17 00:00:00 2001 From: Bad Diode Date: Tue, 1 Feb 2022 18:36:52 +0100 Subject: Add new syntax to lexer and prepare refactor --- src/lexer.c | 224 ++++++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 144 insertions(+), 80 deletions(-) (limited to 'src/lexer.c') diff --git a/src/lexer.c b/src/lexer.c index 09c8f6c..56b670b 100644 --- a/src/lexer.c +++ b/src/lexer.c @@ -5,7 +5,11 @@ static const char* token_str[] = { [TOKEN_UNKNOWN] = "TOKEN_UNKNOWN", [TOKEN_LPAREN] = "TOKEN_LPAREN", [TOKEN_RPAREN] = "TOKEN_RPAREN", - [TOKEN_FIXNUM] = "TOKEN_FIXNUM", + [TOKEN_LSQUARE] = "TOKEN_LSQUARE", + [TOKEN_RSQUARE] = "TOKEN_RSQUARE", + [TOKEN_LCURLY] = "TOKEN_LCURLY", + [TOKEN_RCURLY] = "TOKEN_RCURLY", + [TOKEN_NUMBER] = "TOKEN_NUMBER", [TOKEN_SYMBOL] = "TOKEN_SYMBOL", [TOKEN_STRING] = "TOKEN_STRING", [TOKEN_NIL] = "TOKEN_NIL", @@ -16,6 +20,10 @@ static const char* token_str[] = { [TOKEN_DEF] = "TOKEN_DEF", [TOKEN_SET] = "TOKEN_SET", [TOKEN_FUN] = "TOKEN_FUN", + [TOKEN_STRUCT] = "TOKEN_STRUCT", + [TOKEN_COLON] = "TOKEN_COLON", + [TOKEN_DOT] = "TOKEN_DOT", + [TOKEN_AT] = "TOKEN_AT", [TOKEN_EOF] = "TOKEN_EOF", }; @@ -24,14 +32,8 @@ print_token(Token tok) { printf("[%4ld:%-4ld] ", tok.line, tok.col); printf("%s", token_str[tok.type]); switch (tok.type) { - case TOKEN_FIXNUM: { - printf(" -> "); - sv_write(&tok.value); - } break; - case TOKEN_SYMBOL: { - printf(" -> "); - sv_write(&tok.value); - } break; + case TOKEN_NUMBER: + case TOKEN_SYMBOL: case TOKEN_STRING: { printf(" -> "); sv_write(&tok.value); @@ -55,6 +57,12 @@ scan_next(Scanner *scanner) { return c; } +void +scan_rewind(Scanner *scanner) { + sv_rewind(&scanner->current); + scanner->offset--; +} + char scan_peek(const Scanner *scanner) { return sv_peek(&scanner->current); @@ -95,6 +103,12 @@ is_delimiter(char c) { case '\'': case '(': case ')': + case '[': + case ']': + case '{': + case '}': + case ':': + case '@': case ' ': case '\f': case '\n': @@ -110,22 +124,65 @@ is_delimiter(char c) { #define TOKEN_IS_KEYWORD(VAL, KEYWORD) \ sv_equal(&(VAL), &(StringView){(KEYWORD), sizeof(KEYWORD) - 1}) -TokenType -find_primitive_type(const StringView value) { - bool is_fixnum = true; - for (size_t i = 0; i < value.n; i++) { - char c = value.start[i]; - if (i == 0 && c == '-' && value.n > 1) { - continue; - } - if (!(c >= '0' && c <= '9')) { - is_fixnum = false; - break; +size_t +scan_number_token(Scanner *scanner) { + char first = scan_next(scanner); + char second = scan_peek(scanner); + size_t n = 1; + if (first == '0' && !is_delimiter(second)) { + if (second == 'x') { + // Hex constant. + scan_next(scanner); + n++; + if (is_delimiter(scan_peek(scanner))) { + return 0; + } + while (!is_delimiter(scan_peek(scanner))) { + char c = scan_next(scanner); + if (!(c >= '0' && c <= '9') && + !(c >= 'a' && c <= 'f') && + !(c >= 'A' && c <= 'F')) { + return 0; + } + n++; + } + return n; + } else if (second == 'b') { + // Binary constant. + scan_next(scanner); + n++; + if (is_delimiter(scan_peek(scanner))) { + return 0; + } + while (!is_delimiter(scan_peek(scanner))) { + char c = scan_next(scanner); + if (!(c == '0' || c == '1')) { + return 0; + } + n++; + } } } - if (is_fixnum) { - return TOKEN_FIXNUM; + + // Decimal number or floating point. + bool has_dot = false; + while (!is_delimiter(scan_peek(scanner))) { + char c = scan_next(scanner); + if (c == '.') { + if (has_dot) { + return 0; + } + has_dot = true; + } else if (!(c >= '0' && c <= '9')) { + return 0; + } + n++; } + return n; +} + +TokenType +find_token_type(const StringView value) { if (TOKEN_IS_KEYWORD(value, "nil")) { return TOKEN_NIL; } if (TOKEN_IS_KEYWORD(value, "true")) { return TOKEN_TRUE; } if (TOKEN_IS_KEYWORD(value, "false")) { return TOKEN_FALSE; } @@ -134,12 +191,20 @@ find_primitive_type(const StringView value) { if (TOKEN_IS_KEYWORD(value, "def")) { return TOKEN_DEF; } if (TOKEN_IS_KEYWORD(value, "set!")) { return TOKEN_SET; } if (TOKEN_IS_KEYWORD(value, "fun")) { return TOKEN_FUN; } + if (TOKEN_IS_KEYWORD(value, "struct")) { return TOKEN_STRUCT; } return TOKEN_SYMBOL; } +void +print_tokens(Token *tokens) { + for (size_t i = 0; i < array_size(tokens); i++) { + print_token(tokens[i]); + } +} + Token * -tokenize(const StringView *sv, Errors *errors) { +tokenize(const StringView *sv) { Token *tokens = NULL; array_init(tokens, 1); Scanner scanner = (Scanner){ @@ -153,10 +218,16 @@ tokenize(const StringView *sv, Errors *errors) { size_t line = scanner.line_number; size_t col = scanner.col_number; size_t offset = scanner.offset; + Token token = (Token){ + .type = TOKEN_UNKNOWN, + .line = line, + .col = col, + }; char c = scan_next(&scanner); switch (c) { case ';': { while ((c = scan_next(&scanner)) != '\n' && c != '\0') {} + continue; } break; case '"': { char prev = c; @@ -172,73 +243,66 @@ tokenize(const StringView *sv, Errors *errors) { n++; } if (!found) { - error_push(errors, (Error){ - .type = ERR_TYPE_LEXER, - .value = ERR_UNMATCHED_STRING, - .line = line, - .col = col, - }); + push_error(ERR_TYPE_LEXER, ERR_UNMATCHED_STRING, line, col); return tokens; } - Token token = (Token){ - .value = (StringView){ - .start = &sv->start[offset + 1], - .n = n, - }, - .type = TOKEN_STRING, - .line = line, - .col = col, - }; - array_push(tokens, token); - } break; - case '(': { - if (scan_peek(&scanner) == ')') { - scan_next(&scanner); - Token token = (Token){ - .type = TOKEN_NIL, - .line = line, - .col = col, - }; - array_push(tokens, token); - } else { - Token token = (Token){ - .type = TOKEN_LPAREN, - .line = line, - .col = col, - }; - array_push(tokens, token); - } - } break; - case ')': { - Token token = (Token){ - .type = TOKEN_RPAREN, - .line = line, - .col = col, + token.value = (StringView){ + .start = &sv->start[offset + 1], + .n = n, }; - array_push(tokens, token); + token.type = TOKEN_STRING; } break; + case '(': { token.type = TOKEN_LPAREN; } break; + case ')': { token.type = TOKEN_RPAREN; } break; + case '[': { token.type = TOKEN_LSQUARE; } break; + case ']': { token.type = TOKEN_RSQUARE; } break; + case '{': { token.type = TOKEN_LCURLY; } break; + case '}': { token.type = TOKEN_RCURLY; } break; + case ':': { token.type = TOKEN_COLON; } break; + case '.': { token.type = TOKEN_DOT; } break; + case '@': { token.type = TOKEN_AT; } break; default: { size_t n = 1; - while (!is_delimiter(scan_peek(&scanner))) { - scan_next(&scanner); - n++; - } - if (c == EOF || c == '\0') { - break; - } - Token token = (Token){ - .value = (StringView){ + if (c == '-' && !is_delimiter(scan_peek(&scanner))) { + n += scan_number_token(&scanner); + token.value = (StringView){ .start = &sv->start[offset], .n = n, - }, - .type = TOKEN_SYMBOL, - .line = line, - .col = col, - }; - token.type = find_primitive_type(token.value); - array_push(tokens, token); + }; + token.type = TOKEN_NUMBER; + } else if (c >= '0' && c <= '9') { + scan_rewind(&scanner); + n = scan_number_token(&scanner); + if (n == 0) { + push_error(ERR_TYPE_LEXER, ERR_MALFORMED_NUMBER, line, col); + return tokens; + } + token.value = (StringView){ + .start = &sv->start[offset], + .n = n, + }; + token.type = TOKEN_NUMBER; + } else { + while (!is_delimiter(scan_peek(&scanner))) { + if (scan_peek(&scanner) == '.') { + break; + } + c = scan_next(&scanner); + n++; + } + token.value = (StringView){ + .start = &sv->start[offset], + .n = n, + }; + token.type = find_token_type(token.value); + } } break; } + if (token.type == TOKEN_UNKNOWN) { + push_error(ERR_TYPE_LEXER, ERR_UNKNOWN_TOK_TYPE, line, col); + return tokens; + } + array_push(tokens, token); } // Push EOF token. -- cgit v1.2.1