typedef enum TokenType {
  TOKEN_UNKNOWN = 0,
  TOKEN_LPAREN,
  TOKEN_RPAREN,
  TOKEN_FIXNUM,
  TOKEN_SYMBOL,
  TOKEN_BOOL,
  TOKEN_STRING,
} TokenType;

typedef struct Token {
  TokenType type;
  StringView value;
} Token;

typedef struct Tokens {
  Token *start;
  size_t n;
} Tokens;

#define TRUE_TOKEN (StringView){"true", 4}
#define FALSE_TOKEN (StringView){"false", 5}
#define LPAREN_TOKEN (StringView){"(", 1}
#define RPAREN_TOKEN (StringView){")", 1}

TokenType find_token_type(StringView value) {
  bool is_fixnum = true;
  for (size_t i = 0; i < value.n; i++) {
    char c = value.start[i];
    // A leading '-' is allowed as long as it is not the whole token.
    if (i == 0 && c == '-' && value.n > 1) {
      continue;
    }
    if (!isdigit(c)) {
      is_fixnum = false;
      break;
    }
  }
  if (is_fixnum) {
    return TOKEN_FIXNUM;
  }

  if (sv_equal(value, TRUE_TOKEN) || sv_equal(value, FALSE_TOKEN)) {
    return TOKEN_BOOL;
  }

  return TOKEN_SYMBOL;
}

Tokens tokenize(StringView sv) {
  // NOTE: Not allocating any memory for now, but we are limited by a maximum
  // number of tokens we can process.
#define TOKENS_BUF_SIZE KB(64)
  static Token tokens_buf[TOKENS_BUF_SIZE];

  // Clear buffer.
  for (size_t i = 0; i < TOKENS_BUF_SIZE; i++) {
    tokens_buf[i] = (Token){0};
  }

  size_t n = 0;
  size_t token_n = 0;
  for (size_t i = 0; i < sv.n; i++) {
    switch (sv.start[i]) {
    case ' ':
    case '\f':
    case '\n':
    case '\r':
    case '\t':
    case '\v': {
      // Whitespace terminates the token accumulated so far, if any.
      if (token_n != 0) {
        Token token = (Token){
            .type = TOKEN_UNKNOWN,
            .value = (StringView){
                .start = &sv.start[i - token_n],
                .n = token_n,
            },
        };
        token.type = find_token_type(token.value);
        tokens_buf[n++] = token;
        token_n = 0;
      }
    } break;
    case ';': {
      if (token_n != 0) {
        Token token = (Token){
            .type = TOKEN_UNKNOWN,
            .value = (StringView){
                .start = &sv.start[i - token_n],
                .n = token_n,
            },
        };
        token.type = find_token_type(token.value);
        tokens_buf[n++] = token;
        token_n = 0;
      }

      // Advance until the next newline.
      do {
        i++;
      } while ((i + 1) < sv.n && sv.start[i + 1] != '\n');
    } break;
    case '"': {
      if (token_n != 0) {
        fprintf(stderr, "error: string started inside symbol\n");
        return (Tokens){0};
      }

      // Find end delimiter, skipping escaped quotes.
      size_t string_start = i + 1;
      size_t string_end = i + 1;
      while (true) {
        if (string_end >= sv.n) {
          fprintf(stderr, "error: string delimiter not found\n");
          return (Tokens){0};
        }
        if (sv.start[string_end] == '"' && sv.start[string_end - 1] != '\\') {
          break;
        }
        string_end++;
      }

      Token token = (Token){
          .type = TOKEN_STRING,
          .value = (StringView){
              .start = &sv.start[string_start],
              .n = string_end - string_start,
          },
      };
      tokens_buf[n++] = token;
      token_n = 0;

      // Skip past the closing delimiter.
      i += string_end - string_start + 1;
    } break;
    case '(': {
      if ((i + 1) < sv.n) {
        char next_c = sv.start[i + 1];
        if (isspace(next_c)) {
          fprintf(stderr, "error: lparen delimiter followed by space\n");
          return (Tokens){0};
        }
      }
      if (token_n != 0) {
        fprintf(stderr, "error: lparen delimiter within symbol name\n");
        return (Tokens){0};
      }

      Token token = (Token){
          .type = TOKEN_LPAREN,
          .value = LPAREN_TOKEN,
      };
      tokens_buf[n++] = token;
    } break;
    case ')': {
      if ((i + 1) < sv.n) {
        char next_c = sv.start[i + 1];
        if (next_c != ')' && !isspace(next_c)) {
          fprintf(stderr, "error: rparen delimiter within symbol name\n");
          return (Tokens){0};
        }
      }
      if (token_n != 0) {
        // Push previous token.
        Token token = (Token){
            .type = TOKEN_UNKNOWN,
            .value = (StringView){
                .start = &sv.start[i - token_n],
                .n = token_n,
            },
        };
        token.type = find_token_type(token.value);
        tokens_buf[n++] = token;
        token_n = 0;
      }

      Token token = (Token){
          .type = TOKEN_RPAREN,
          .value = RPAREN_TOKEN,
      };
      tokens_buf[n++] = token;
    } break;
    default: {
      // Part of a symbol, fixnum or boolean: keep accumulating.
      token_n++;
    } break;
    }
  }

  if (token_n != 0) {
    // End of input reached with a pending token.
    Token token = (Token){
        .type = TOKEN_UNKNOWN,
        .value = (StringView){
            .start = &sv.start[sv.n - token_n],
            .n = token_n,
        },
    };
    token.type = find_token_type(token.value);
    tokens_buf[n++] = token;
  }

  return (Tokens){.start = tokens_buf, .n = n};
}
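As a quick sanity check, the tokenizer might be driven as in the sketch below. This is only an illustration, not part of the tokenizer: it assumes StringView is the {start, n} pair used above and that sv_equal and the KB() macro are already in scope; the token_type_name helper is hypothetical and exists purely for printing.

#include <stdio.h>
#include <string.h>

// Hypothetical debugging helper (not part of the tokenizer): map a TokenType
// to a printable name.
static const char *token_type_name(TokenType type) {
  switch (type) {
  case TOKEN_LPAREN: return "lparen";
  case TOKEN_RPAREN: return "rparen";
  case TOKEN_FIXNUM: return "fixnum";
  case TOKEN_SYMBOL: return "symbol";
  case TOKEN_BOOL:   return "bool";
  case TOKEN_STRING: return "string";
  default:           return "unknown";
  }
}

int main(void) {
  char input[] = "(+ 1 (- 20 3)) ; a comment\n\"hi\" true";
  Tokens tokens = tokenize((StringView){input, strlen(input)});
  for (size_t i = 0; i < tokens.n; i++) {
    Token t = tokens.start[i];
    printf("%-7s %.*s\n", token_type_name(t.type), (int)t.value.n, t.value.start);
  }
  return 0;
}

For this input the loop should print the parentheses, the "+" and "-" symbols, the fixnums 1, 20 and 3, then the string token hi and the boolean true, with the comment skipped entirely.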