typedef enum TokenType { TOKEN_UNKNOWN = 0, TOKEN_LPAREN, TOKEN_RPAREN, TOKEN_QUOTE, TOKEN_TRUE, TOKEN_FALSE, TOKEN_NIL, TOKEN_FIXNUM, TOKEN_SYMBOL, TOKEN_STRING, TOKEN_EOF, } TokenType; typedef struct Token { TokenType type; StringView value; size_t line; size_t column; } Token; typedef struct Tokens { Token *buf; size_t size; size_t cap; } Tokens; void print_token(Token tok) { printf("LINE: %3ld COL: %3ld ", tok.line, tok.column); switch (tok.type) { case TOKEN_LPAREN: { printf("TOKEN_LPAREN"); } break; case TOKEN_RPAREN: { printf("TOKEN_RPAREN"); } break; case TOKEN_QUOTE: { printf("TOKEN_QUOTE"); } break; case TOKEN_TRUE: { printf("TOKEN_TRUE"); } break; case TOKEN_FALSE: { printf("TOKEN_FALSE"); } break; case TOKEN_NIL: { printf("TOKEN_NIL"); } break; case TOKEN_FIXNUM: { printf("TOKEN_FIXNUM -> "); sv_write(&tok.value, stdout); } break; case TOKEN_SYMBOL: { printf("TOKEN_SYMBOL -> "); sv_write(&tok.value, stdout); } break; case TOKEN_STRING: { printf("TOKEN_STRING -> "); sv_write(&tok.value, stdout); } break; case TOKEN_EOF: { printf("TOKEN_EOF"); } break; case TOKEN_UNKNOWN: { printf("TOKEN_UNKNOWN"); } break; } printf("\n"); } #define TOK_BUF_CAP 256 void push_token(Tokens *tokens, Token tok) { if (tokens->buf == NULL) { tokens->size = 0; tokens->cap = TOK_BUF_CAP; tokens->buf = malloc(tokens->cap * sizeof(Token)); } else if (tokens->size == tokens->cap) { tokens->cap *= 2; tokens->buf = realloc(tokens->buf, tokens->cap * sizeof(Token)); } tokens->buf[tokens->size++] = tok; } typedef struct Scanner { StringView orig; StringView current; size_t line_number; size_t col_number; size_t offset; size_t lexeme_n; size_t lexeme_offset; size_t lexeme_line_number; size_t lexeme_col_number; } Scanner; char scan_next(Scanner *scanner) { if (scanner->lexeme_n == 0) { scanner->lexeme_line_number = scanner->line_number; scanner->lexeme_col_number = scanner->col_number; scanner->lexeme_offset = scanner->offset; } char c = sv_next(&scanner->current); if (c == '\n') { scanner->line_number++; scanner->col_number = 1; } else { scanner->col_number++; } scanner->offset++; return c; } char scan_peek(const Scanner *scanner) { return sv_peek(&scanner->current); } bool scan_has_next(const Scanner *scanner) { return scanner->current.n != 0; } bool scan_has_lexeme(const Scanner * scanner) { return scanner->lexeme_n != 0; } Token scan_get_lexeme(Scanner * scanner) { Token token = (Token){ .type = TOKEN_UNKNOWN, .value = (StringView){ .start = &scanner->orig.start[scanner->lexeme_offset], .n = scanner->lexeme_n, }, .line = scanner->lexeme_line_number, .column = scanner->lexeme_col_number, }; scanner->lexeme_n = 0; scanner->lexeme_line_number = scanner->line_number; scanner->lexeme_col_number = scanner->col_number; scanner->lexeme_offset = scanner->offset; return token; } TokenType find_primitive_type(StringView value) { bool is_fixnum = true; for (size_t i = 0; i < value.n; i++) { char c = value.start[i]; if (i == 0 && c == '-' && value.n > 1) { continue; } if (!(c >= '0' && c <= '9')) { is_fixnum = false; break; } } if (is_fixnum) { return TOKEN_FIXNUM; } if (sv_equal(&value, &(StringView){"true", 4})) { return TOKEN_TRUE; } if (sv_equal(&value, &(StringView){"false", 5})) { return TOKEN_FALSE; } return TOKEN_SYMBOL; } Tokens tokenize(const StringView *sv) { Tokens tokens = (Tokens){0}; Scanner scanner = (Scanner){ .orig = *sv, .current = *sv, .line_number = 1, .col_number = 1, .lexeme_line_number = 1, .lexeme_col_number = 1, }; while (scan_has_next(&scanner)) { char c = scan_next(&scanner); switch (c) { case ' ': case '\f': case '\n': case '\r': case '\t': case '\v': { if (scan_has_lexeme(&scanner)) { Token token = scan_get_lexeme(&scanner); token.type = find_primitive_type(token.value); push_token(&tokens, token); } } break; case ';': { if (scan_has_lexeme(&scanner)) { Token token = scan_get_lexeme(&scanner); token.type = find_primitive_type(token.value); push_token(&tokens, token); } while ((c = scan_next(&scanner)) != '\n' && c != '\0') {} } break; case '"': { if (scan_has_lexeme(&scanner)) { Token token = scan_get_lexeme(&scanner); token.type = find_primitive_type(token.value); push_token(&tokens, token); scanner.lexeme_col_number--; scanner.lexeme_offset--; } char prev = c; bool found = false; while (scan_has_next(&scanner)) { c = scan_next(&scanner); if (c == '"' && prev != '\\') { found = true; break; } prev = c; scanner.lexeme_n++; } scanner.lexeme_col_number--; if (found) { Token token = scan_get_lexeme(&scanner); token.type = TOKEN_STRING; push_token(&tokens, token); } else { // TODO: Report error: couldn't find the closing quotes. } } break; case '\'': { if (scan_has_lexeme(&scanner)) { Token token = scan_get_lexeme(&scanner); token.type = find_primitive_type(token.value); push_token(&tokens, token); scanner.lexeme_col_number--; scanner.lexeme_offset--; } Token token = scan_get_lexeme(&scanner); token.type = TOKEN_QUOTE; push_token(&tokens, token); } break; case '(': { if (scan_has_lexeme(&scanner)) { Token token = scan_get_lexeme(&scanner); token.type = find_primitive_type(token.value); push_token(&tokens, token); scanner.lexeme_col_number--; scanner.lexeme_offset--; } scanner.lexeme_n++; if (scan_peek(&scanner) == ')') { scanner.lexeme_n++; scan_next(&scanner); Token token = scan_get_lexeme(&scanner); token.type = TOKEN_NIL; push_token(&tokens, token); } else { Token token = scan_get_lexeme(&scanner); token.type = TOKEN_LPAREN; push_token(&tokens, token); } } break; case ')': { if (scan_has_lexeme(&scanner)) { Token token = scan_get_lexeme(&scanner); token.type = find_primitive_type(token.value); push_token(&tokens, token); scanner.lexeme_col_number--; scanner.lexeme_offset--; } scanner.lexeme_n++; Token token = scan_get_lexeme(&scanner); token.type = TOKEN_RPAREN; push_token(&tokens, token); } break; case EOF: { break; } break; default: { scanner.lexeme_n++; } break; } } // Push current lexeme if any. if (scan_has_lexeme(&scanner)) { Token token = scan_get_lexeme(&scanner); token.type = find_primitive_type(token.value); push_token(&tokens, token); } // Push EOF token. Token token = scan_get_lexeme(&scanner); token.type = TOKEN_EOF; push_token(&tokens, token); return tokens; }