diff options
author | Bad Diode <bd@badd10de.dev> | 2024-06-15 16:52:36 +0200 |
---|---|---|
committer | Bad Diode <bd@badd10de.dev> | 2024-06-15 16:52:36 +0200 |
commit | e7cd0d47a603e4199b0ee7daa2434fc0db602bad (patch) | |
tree | 511cfbe2cea66e45b4ca7669ed9a101763ae3537 | |
parent | 893b52223d274c675272cee55768a9d5853420fb (diff) | |
download | bdl-e7cd0d47a603e4199b0ee7daa2434fc0db602bad.tar.gz bdl-e7cd0d47a603e4199b0ee7daa2434fc0db602bad.zip |
Move lexer code to lexer.c file
-rw-r--r-- | Makefile | 3 | ||||
-rw-r--r-- | src/lexer.c | 734 | ||||
-rw-r--r-- | src/lexer.h | 99 | ||||
-rw-r--r-- | src/main.c | 631 |
4 files changed, 532 insertions, 935 deletions
@@ -43,9 +43,6 @@ $(BIN): $(SRC_MAIN) $(WATCH_SRC) $(BUILD_DIR) | |||
43 | $(BUILD_DIR): | 43 | $(BUILD_DIR): |
44 | mkdir -p $(BUILD_DIR) | 44 | mkdir -p $(BUILD_DIR) |
45 | 45 | ||
46 | tests: $(BIN) | ||
47 | ./$(BIN) tests/constants/numbers.bdl | ||
48 | |||
49 | run: $(BIN) | 46 | run: $(BIN) |
50 | $(BIN) tests/literals.bad | 47 | $(BIN) tests/literals.bad |
51 | 48 | ||
diff --git a/src/lexer.c b/src/lexer.c index a6d7c74..df998f2 100644 --- a/src/lexer.c +++ b/src/lexer.c | |||
@@ -1,130 +1,192 @@ | |||
1 | #include "lexer.h" | 1 | #define LEXER_MEM GB(2) |
2 | #include "errors.h" | 2 | |
3 | 3 | typedef enum TokenType { | |
4 | static const char* token_str[] = { | 4 | TOK_UNKNOWN = 0, |
5 | [TOKEN_UNKNOWN] = "UNKNOWN", | 5 | |
6 | [TOKEN_LPAREN] = "LPAREN", | 6 | // Parentheses. |
7 | [TOKEN_RPAREN] = "RPAREN", | 7 | TOK_LPAREN, // ( |
8 | [TOKEN_LSQUARE] = "LSQUARE", | 8 | TOK_RPAREN, // ) |
9 | [TOKEN_RSQUARE] = "RSQUARE", | 9 | TOK_LSQUARE, // [ |
10 | [TOKEN_LCURLY] = "LCURLY", | 10 | TOK_RSQUARE, // ] |
11 | [TOKEN_RCURLY] = "RCURLY", | 11 | TOK_LCURLY, // { |
12 | [TOKEN_NUMBER] = "NUMBER", | 12 | TOK_RCURLY, // } |
13 | [TOKEN_SYMBOL] = "SYMBOL", | 13 | |
14 | [TOKEN_STRING] = "STRING", | 14 | // Basic literals. |
15 | [TOKEN_NIL] = "NIL", | 15 | TOK_NUMBER, |
16 | [TOKEN_TRUE] = "TRUE", | 16 | TOK_SYMBOL, |
17 | [TOKEN_FALSE] = "FALSE", | 17 | TOK_STRING, |
18 | [TOKEN_LAMBDA] = "LAMBDA", | 18 | |
19 | [TOKEN_IF] = "IF", | 19 | // Keywords. |
20 | [TOKEN_DEF] = "DEF", | 20 | TOK_BREAK, // break |
21 | [TOKEN_SET] = "SET", | 21 | TOK_CASE, // case |
22 | [TOKEN_FUN] = "FUN", | 22 | TOK_CONTINUE, // continue |
23 | [TOKEN_STRUCT] = "STRUCT", | 23 | TOK_FALSE, // false |
24 | [TOKEN_ADD] = "ADD", | 24 | TOK_FUN, // fun |
25 | [TOKEN_SUB] = "SUB", | 25 | TOK_IF, // if |
26 | [TOKEN_MUL] = "MUL", | 26 | TOK_LET, // let |
27 | [TOKEN_DIV] = "DIV", | 27 | TOK_MATCH, // match |
28 | [TOKEN_MOD] = "MOD", | 28 | TOK_NIL, // nil |
29 | [TOKEN_NOT] = "NOT", | 29 | TOK_RETURN, // return |
30 | [TOKEN_AND] = "AND", | 30 | TOK_SET, // set |
31 | [TOKEN_OR] = "OR", | 31 | TOK_STRUCT, // struct |
32 | [TOKEN_EQ] = "EQ", | 32 | TOK_TRUE, // true |
33 | [TOKEN_LT] = "LT", | 33 | TOK_WHILE, // while |
34 | [TOKEN_GT] = "GT", | 34 | |
35 | [TOKEN_LE] = "LE", | 35 | // Arithmetic ops. |
36 | [TOKEN_GE] = "GE", | 36 | TOK_ADD, // + |
37 | [TOKEN_COLON] = "COLON", | 37 | TOK_SUB, // - |
38 | [TOKEN_DOT] = "DOT", | 38 | TOK_MUL, // * |
39 | [TOKEN_AT] = "AT", | 39 | TOK_DIV, // / |
40 | [TOKEN_EOF] = "EOF", | 40 | TOK_MOD, // % |
41 | }; | 41 | |
42 | // Logical ops. | ||
43 | TOK_NOT, // ! | ||
44 | TOK_AND, // && | ||
45 | TOK_OR, // || | ||
46 | TOK_EQ, // == | ||
47 | TOK_NOTEQ, // != | ||
48 | TOK_LT, // < | ||
49 | TOK_GT, // > | ||
50 | TOK_LE, // <= | ||
51 | TOK_GE, // >= | ||
52 | |||
53 | // Bitwise ops. | ||
54 | TOK_BITNOT, // ~ | ||
55 | TOK_BITAND, // & | ||
56 | TOK_BITOR, // | | ||
57 | TOK_BITLSHIFT, // << | ||
58 | TOK_BITRSHIFT, // >> | ||
59 | |||
60 | // Special ops. | ||
61 | TOK_COLON, // : | ||
62 | TOK_DOT, // . | ||
63 | TOK_AT, // @ | ||
64 | TOK_ASSIGN, // = | ||
65 | |||
66 | // End of file. | ||
67 | TOK_EOF, | ||
68 | } TokenType; | ||
69 | |||
70 | Str token_str[] = { | ||
71 | [TOK_UNKNOWN] = cstr("UNKNOWN"), | ||
72 | |||
73 | // Parentheses. | ||
74 | [TOK_LPAREN] = cstr("LPAREN"), | ||
75 | [TOK_RPAREN] = cstr("RPAREN"), | ||
76 | [TOK_LSQUARE] = cstr("LSQUARE"), | ||
77 | [TOK_RSQUARE] = cstr("RSQUARE"), | ||
78 | [TOK_LCURLY] = cstr("LCURLY"), | ||
79 | [TOK_RCURLY] = cstr("RCURLY"), | ||
80 | |||
81 | // Basic literals. | ||
82 | [TOK_NUMBER] = cstr("NUMBER"), | ||
83 | [TOK_SYMBOL] = cstr("SYMBOL"), | ||
84 | [TOK_STRING] = cstr("STRING"), | ||
85 | |||
86 | // Keywords. | ||
87 | [TOK_BREAK] = cstr("BREAK"), | ||
88 | [TOK_CASE] = cstr("CASE"), | ||
89 | [TOK_CONTINUE] = cstr("CONTINUE"), | ||
90 | [TOK_FALSE] = cstr("FALSE"), | ||
91 | [TOK_FUN] = cstr("FUN"), | ||
92 | [TOK_IF] = cstr("IF"), | ||
93 | [TOK_LET] = cstr("LET"), | ||
94 | [TOK_MATCH] = cstr("MATCH"), | ||
95 | [TOK_NIL] = cstr("NIL"), | ||
96 | [TOK_RETURN] = cstr("RETURN"), | ||
97 | [TOK_SET] = cstr("SET"), | ||
98 | [TOK_STRUCT] = cstr("STRUCT"), | ||
99 | [TOK_TRUE] = cstr("TRUE"), | ||
100 | [TOK_WHILE] = cstr("WHILE"), | ||
101 | |||
102 | // Arithmetic ops. | ||
103 | [TOK_ADD] = cstr("ADD"), | ||
104 | [TOK_SUB] = cstr("SUB"), | ||
105 | [TOK_MUL] = cstr("MUL"), | ||
106 | [TOK_DIV] = cstr("DIV"), | ||
107 | [TOK_MOD] = cstr("MOD"), | ||
42 | 108 | ||
43 | typedef struct Keyword { | 109 | // Logical ops. |
44 | char *str; | 110 | [TOK_NOT] = cstr("NOT"), |
45 | size_t n; | 111 | [TOK_AND] = cstr("AND"), |
46 | TokenType token; | 112 | [TOK_OR] = cstr("OR"), |
47 | } Keyword; | 113 | [TOK_EQ] = cstr("EQ"), |
48 | 114 | [TOK_NOTEQ] = cstr("NOTEQ"), | |
49 | #define KEYWORD(STR,TOK) {(STR), sizeof(STR) - 1, (TOK)} | 115 | [TOK_LT] = cstr("LT"), |
50 | 116 | [TOK_GT] = cstr("GT"), | |
51 | static const Keyword keywords[] = { | 117 | [TOK_LE] = cstr("LE"), |
52 | KEYWORD("nil", TOKEN_NIL), | 118 | [TOK_GE] = cstr("GE"), |
53 | KEYWORD("true", TOKEN_TRUE), | 119 | |
54 | KEYWORD("false", TOKEN_FALSE), | 120 | // Bitwise ops. |
55 | KEYWORD("lambda", TOKEN_LAMBDA), | 121 | [TOK_BITNOT] = cstr("BITNOT"), |
56 | KEYWORD("if", TOKEN_IF), | 122 | [TOK_BITAND] = cstr("BITAND"), |
57 | KEYWORD("def", TOKEN_DEF), | 123 | [TOK_BITOR] = cstr("BITOR"), |
58 | KEYWORD("set", TOKEN_SET), | 124 | [TOK_BITLSHIFT] = cstr("BITLSHIFT"), |
59 | KEYWORD("fun", TOKEN_FUN), | 125 | [TOK_BITRSHIFT] = cstr("BITRSHIFT"), |
60 | KEYWORD("struct", TOKEN_STRUCT), | 126 | |
61 | KEYWORD("+", TOKEN_ADD), | 127 | // Special ops. |
62 | KEYWORD("-", TOKEN_SUB), | 128 | [TOK_COLON] = cstr("COLON"), |
63 | KEYWORD("*", TOKEN_MUL), | 129 | [TOK_DOT] = cstr("DOT"), |
64 | KEYWORD("/", TOKEN_DIV), | 130 | [TOK_AT] = cstr("AT"), |
65 | KEYWORD("%", TOKEN_MOD), | 131 | [TOK_ASSIGN] = cstr("ASSIGN"), |
66 | KEYWORD("not", TOKEN_NOT), | 132 | |
67 | KEYWORD("and", TOKEN_AND), | 133 | // End of file. |
68 | KEYWORD("or", TOKEN_OR), | 134 | [TOK_EOF] = cstr("EOF"), |
69 | KEYWORD("=", TOKEN_EQ), | ||
70 | KEYWORD("<", TOKEN_LT), | ||
71 | KEYWORD(">", TOKEN_GT), | ||
72 | KEYWORD("<=", TOKEN_LE), | ||
73 | KEYWORD(">=", TOKEN_GE), | ||
74 | }; | 135 | }; |
75 | 136 | ||
76 | void | 137 | typedef struct Token { |
77 | print_token(Token tok) { | 138 | TokenType type; |
78 | printf("[%4ld:%-4ld] ", tok.line, tok.col); | 139 | Str val; |
79 | printf("%s", token_str[tok.type]); | 140 | sz line; |
80 | switch (tok.type) { | 141 | sz col; |
81 | case TOKEN_NUMBER: | 142 | } Token; |
82 | case TOKEN_SYMBOL: | 143 | |
83 | case TOKEN_STRING: { | 144 | typedef struct Scanner { |
84 | printf(" -> "); | 145 | Str str; |
85 | sv_write(&tok.value); | 146 | sz line; |
86 | } break; | 147 | sz col; |
87 | default: { | 148 | } Scanner; |
88 | } break; | ||
89 | } | ||
90 | printf("\n"); | ||
91 | } | ||
92 | 149 | ||
93 | char | 150 | char |
94 | scan_next(Scanner *scanner) { | 151 | scan_next(Scanner *scanner) { |
95 | char c = sv_next(&scanner->current); | 152 | char c = str_next(&scanner->str); |
96 | if (c == '\n') { | 153 | if (c == '\n') { |
97 | scanner->line_number++; | 154 | scanner->line++; |
98 | scanner->col_number = 1; | 155 | scanner->col = 0; |
99 | } else { | 156 | } else { |
100 | scanner->col_number++; | 157 | scanner->col++; |
101 | } | 158 | } |
102 | scanner->offset++; | ||
103 | return c; | 159 | return c; |
104 | } | 160 | } |
105 | 161 | ||
106 | void | 162 | bool |
107 | scan_rewind(Scanner *scanner) { | 163 | scan_has_next(Scanner *scanner) { |
108 | sv_rewind(&scanner->current); | 164 | return scanner->str.size; |
109 | scanner->offset--; | ||
110 | } | 165 | } |
111 | 166 | ||
112 | char | 167 | char |
113 | scan_peek(const Scanner *scanner) { | 168 | scan_peek(Scanner *scanner) { |
114 | return sv_peek(&scanner->current); | 169 | return str_peek(scanner->str); |
115 | } | 170 | } |
116 | 171 | ||
117 | bool | 172 | void |
118 | scan_has_next(const Scanner *scanner) { | 173 | scan_skip_line(Scanner *scanner) { |
119 | return scanner->current.n != 0; | 174 | SearchResult newline = array_find_next(scanner->str, cstr("\n")); |
175 | if (newline.found) { | ||
176 | scanner->str.mem += newline.pos + 1; | ||
177 | scanner->str.size -= newline.pos + 1; | ||
178 | scanner->line++; | ||
179 | scanner->col = 0; | ||
180 | } | ||
120 | } | 181 | } |
121 | 182 | ||
122 | void | 183 | void |
123 | skip_whitespace(Scanner *scanner) { | 184 | scan_skip_whitespace(Scanner *scanner) { |
124 | while (scan_has_next(scanner)) { | 185 | while (scan_has_next(scanner)) { |
125 | char c = scan_peek(scanner); | 186 | char c = scan_peek(scanner); |
126 | switch (c) { | 187 | switch (c) { |
127 | case ' ': | 188 | case ' ': |
189 | case ',': // Commas are just syntactic sugar. | ||
128 | case '\f': | 190 | case '\f': |
129 | case '\n': | 191 | case '\n': |
130 | case '\r': | 192 | case '\r': |
@@ -132,6 +194,10 @@ skip_whitespace(Scanner *scanner) { | |||
132 | case '\v': { | 194 | case '\v': { |
133 | scan_next(scanner); | 195 | scan_next(scanner); |
134 | } break; | 196 | } break; |
197 | case ';': { | ||
198 | // Found a comment! (skip) | ||
199 | scan_skip_line(scanner); | ||
200 | } break; | ||
135 | default: { | 201 | default: { |
136 | return; | 202 | return; |
137 | } break; | 203 | } break; |
@@ -140,22 +206,33 @@ skip_whitespace(Scanner *scanner) { | |||
140 | } | 206 | } |
141 | 207 | ||
142 | bool | 208 | bool |
143 | is_delimiter(char c) { | 209 | scan_is_valid_split(char c) { |
144 | switch (c) { | 210 | switch (c) { |
145 | case EOF: | ||
146 | case '\0': | ||
147 | case ';': | 211 | case ';': |
148 | case '"': | ||
149 | case '\'': | ||
150 | case '(': | 212 | case '(': |
151 | case ')': | 213 | case ')': |
152 | case '[': | 214 | case '[': |
153 | case ']': | 215 | case ']': |
154 | case '{': | 216 | case '{': |
155 | case '}': | 217 | case '}': |
218 | case '+': | ||
219 | case '-': | ||
220 | case '*': | ||
221 | case '/': | ||
222 | case '%': | ||
223 | case '!': | ||
224 | case '=': | ||
225 | case '<': | ||
226 | case '>': | ||
227 | case '~': | ||
228 | case '&': | ||
229 | case '|': | ||
156 | case ':': | 230 | case ':': |
231 | case '.': | ||
157 | case '@': | 232 | case '@': |
233 | case '"': | ||
158 | case ' ': | 234 | case ' ': |
235 | case ',': | ||
159 | case '\f': | 236 | case '\f': |
160 | case '\n': | 237 | case '\n': |
161 | case '\r': | 238 | case '\r': |
@@ -167,122 +244,351 @@ is_delimiter(char c) { | |||
167 | return false; | 244 | return false; |
168 | } | 245 | } |
169 | 246 | ||
170 | TokenType | 247 | void |
171 | find_token_type(const StringView value) { | 248 | scan_skip_until_valid(Scanner *scanner) { |
172 | for (size_t i = 0; i < sizeof(keywords) / sizeof(Keyword); i++) { | 249 | while (scan_has_next(scanner)) { |
173 | StringView keyword = (StringView){keywords[i].str, keywords[i].n}; | 250 | char c = scan_peek(scanner); |
174 | if (sv_equal(&value, &keyword)) { | 251 | if (scan_is_valid_split(c)) { |
175 | return keywords[i].token; | 252 | return; |
176 | } | 253 | } |
254 | scan_next(scanner); | ||
177 | } | 255 | } |
178 | return TOKEN_SYMBOL; | ||
179 | } | 256 | } |
180 | 257 | ||
181 | void | 258 | Token |
182 | print_tokens(Token *tokens) { | 259 | emit_token(Scanner current, Scanner *scanner, TokenType t) { |
183 | for (size_t i = 0; i < array_size(tokens); i++) { | 260 | Str val = current.str; |
184 | print_token(tokens[i]); | 261 | val.size = current.str.size - scanner->str.size; |
185 | } | 262 | val.size = val.size < 0 ? 0 : val.size; |
263 | return (Token){ | ||
264 | .val = val, | ||
265 | .line = current.line + 1, | ||
266 | .col = current.col + 1, | ||
267 | .type = t, | ||
268 | }; | ||
186 | } | 269 | } |
187 | 270 | ||
188 | Token * | 271 | Token |
189 | tokenize(const StringView *sv) { | 272 | emit_token_err(Scanner *scanner, Str err_msg) { |
190 | Token *tokens = NULL; | 273 | return (Token){ |
191 | array_init(tokens, 1); | 274 | .line = scanner->line + 1, |
192 | Scanner scanner = (Scanner){ | 275 | .col = scanner->col + 1, |
193 | .current = *sv, | 276 | .val = err_msg, |
194 | .line_number = 1, | 277 | .type = TOK_UNKNOWN, |
195 | .col_number = 1, | ||
196 | }; | 278 | }; |
279 | } | ||
197 | 280 | ||
198 | while (scan_has_next(&scanner)) { | 281 | Token |
199 | skip_whitespace(&scanner); | 282 | emit_token_number(Scanner *scanner) { |
200 | size_t line = scanner.line_number; | 283 | Scanner current = *scanner; |
201 | size_t col = scanner.col_number; | 284 | char c = scan_peek(scanner); |
202 | size_t offset = scanner.offset; | 285 | if (c == '+' || c == '-') { |
203 | Token token = (Token){ | 286 | scan_next(scanner); |
204 | .type = TOKEN_UNKNOWN, | 287 | if (str_has_prefix(scanner->str, cstr("0b")) || |
205 | .line = line, | 288 | str_has_prefix(scanner->str, cstr("0x"))) { |
206 | .col = col, | 289 | scan_skip_until_valid(scanner); |
207 | }; | 290 | return emit_token_err( |
208 | char c = scan_next(&scanner); | 291 | &current, |
209 | switch (c) { | 292 | cstr("malformed number: binary/hex numbers can't be signed")); |
210 | case ';': { | 293 | } |
211 | while ((c = scan_next(&scanner)) != '\n' && c != '\0') {} | 294 | } |
295 | if (str_has_prefix(scanner->str, cstr("0b"))) { | ||
296 | scan_next(scanner); | ||
297 | scan_next(scanner); | ||
298 | while (scan_has_next(scanner)) { | ||
299 | c = scan_peek(scanner); | ||
300 | if (c == '0' || c == '1' || c == '_') { | ||
301 | scan_next(scanner); | ||
212 | continue; | 302 | continue; |
213 | } break; | 303 | } |
214 | case '"': { | 304 | if (scan_is_valid_split(c)) { |
215 | char prev = c; | 305 | return emit_token(current, scanner, TOK_NUMBER); |
216 | bool found = false; | 306 | } |
217 | size_t n = 0; | 307 | scan_skip_until_valid(scanner); |
218 | while (scan_has_next(&scanner)) { | 308 | return emit_token_err( |
219 | c = scan_next(&scanner); | 309 | &current, cstr("malformed number: invalid binary number")); |
220 | if (c == '"' && prev != '\\') { | 310 | } |
221 | found = true; | 311 | } else if (str_has_prefix(scanner->str, cstr("0x"))) { |
222 | break; | 312 | scan_next(scanner); |
223 | } | 313 | scan_next(scanner); |
224 | prev = c; | 314 | while (scan_has_next(scanner)) { |
225 | n++; | 315 | c = scan_peek(scanner); |
226 | } | 316 | if ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || |
227 | if (!found) { | 317 | (c >= 'A' && c <= 'F') || c == '_') { |
228 | push_error(ERR_TYPE_LEXER, ERR_UNMATCHED_STRING, line, col); | 318 | scan_next(scanner); |
229 | return tokens; | 319 | continue; |
230 | } | 320 | } |
231 | token.value = (StringView){ | 321 | if (scan_is_valid_split(c)) { |
232 | .start = &sv->start[offset + 1], | 322 | return emit_token(current, scanner, TOK_NUMBER); |
233 | .n = n, | 323 | } |
234 | }; | 324 | scan_skip_until_valid(scanner); |
235 | token.type = TOKEN_STRING; | 325 | return emit_token_err(&current, |
236 | } break; | 326 | cstr("malformed number: invalid hex number")); |
237 | case '(': { token.type = TOKEN_LPAREN; } break; | 327 | } |
238 | case ')': { token.type = TOKEN_RPAREN; } break; | 328 | } else { |
239 | case '[': { token.type = TOKEN_LSQUARE; } break; | 329 | // Integral. |
240 | case ']': { token.type = TOKEN_RSQUARE; } break; | 330 | while (scan_has_next(scanner)) { |
241 | case '{': { token.type = TOKEN_LCURLY; } break; | 331 | c = scan_peek(scanner); |
242 | case '}': { token.type = TOKEN_RCURLY; } break; | 332 | if (c == '.') { |
243 | case ':': { token.type = TOKEN_COLON; } break; | 333 | scan_next(scanner); |
244 | case '.': { token.type = TOKEN_DOT; } break; | 334 | break; |
245 | case '@': { token.type = TOKEN_AT; } break; | 335 | } |
246 | default: { | 336 | if ((c >= '0' && c <= '9') || c == '_') { |
247 | if (c == EOF || c == '\0') { | 337 | scan_next(scanner); |
248 | token.type = TOKEN_EOF; | 338 | continue; |
249 | break; | 339 | } |
250 | } | 340 | if (scan_is_valid_split(c)) { |
251 | size_t n = 1; | 341 | return emit_token(current, scanner, TOK_NUMBER); |
252 | bool num = c == '-' && !is_delimiter(scan_peek(&scanner)); | 342 | } |
253 | num = num || (c == '+' && !is_delimiter(scan_peek(&scanner))); | 343 | scan_skip_until_valid(scanner); |
254 | num = num || (c >= '0' && c <= '9'); | 344 | return emit_token_err(&current, cstr("malformed number")); |
255 | if (num) { | 345 | } |
256 | while (!is_delimiter(scan_peek(&scanner))) { | 346 | c = scan_peek(scanner); |
257 | c = scan_next(&scanner); | 347 | if (!(c >= '0' && c <= '9')) { |
258 | n++; | 348 | return emit_token_err(&current, |
259 | } | 349 | cstr("malformed number: no decimal digits")); |
260 | token.value = (StringView){ | 350 | } |
261 | .start = &sv->start[offset], | 351 | // Decimals. |
262 | .n = n, | 352 | while (scan_has_next(scanner)) { |
263 | }; | 353 | c = scan_peek(scanner); |
264 | token.type = TOKEN_NUMBER; | 354 | if (c == 'e' || c == 'E') { |
265 | } else { | 355 | scan_next(scanner); |
266 | while (!is_delimiter(scan_peek(&scanner))) { | 356 | break; |
267 | if (scan_peek(&scanner) == '.') { | 357 | } |
268 | break; | 358 | if ((c >= '0' && c <= '9') || c == '_') { |
269 | } | 359 | scan_next(scanner); |
270 | c = scan_next(&scanner); | 360 | continue; |
271 | n++; | 361 | } |
272 | } | 362 | if (scan_is_valid_split(c)) { |
273 | token.value = (StringView){ | 363 | return emit_token(current, scanner, TOK_NUMBER); |
274 | .start = &sv->start[offset], | 364 | } |
275 | .n = n, | 365 | scan_skip_until_valid(scanner); |
276 | }; | 366 | return emit_token_err(&current, cstr("malformed number")); |
277 | token.type = find_token_type(token.value); | ||
278 | } | ||
279 | } break; | ||
280 | } | 367 | } |
281 | if (token.type == TOKEN_UNKNOWN) { | 368 | // Exponent. |
282 | push_error(ERR_TYPE_LEXER, ERR_UNKNOWN_TOK_TYPE, line, col); | 369 | c = scan_peek(scanner); |
283 | return tokens; | 370 | if (c == '+' || c == '-') { |
371 | scan_next(scanner); | ||
284 | } | 372 | } |
285 | array_push(tokens, token); | 373 | while (scan_has_next(scanner)) { |
374 | c = scan_peek(scanner); | ||
375 | if ((c >= '0' && c <= '9') || c == '_') { | ||
376 | scan_next(scanner); | ||
377 | continue; | ||
378 | } | ||
379 | if (c == '.') { | ||
380 | scan_next(scanner); | ||
381 | return emit_token_err( | ||
382 | &current, | ||
383 | cstr("malformed number: decimals not allowed on exponent")); | ||
384 | } | ||
385 | if (scan_is_valid_split(c)) { | ||
386 | return emit_token(current, scanner, TOK_NUMBER); | ||
387 | } | ||
388 | scan_skip_until_valid(scanner); | ||
389 | return emit_token_err(&current, cstr("malformed number")); | ||
390 | } | ||
391 | } | ||
392 | return emit_token_err(&current, cstr("malformed number")); | ||
393 | } | ||
394 | |||
395 | Token | ||
396 | scan_token(Scanner *scanner) { | ||
397 | assert(scanner); | ||
398 | |||
399 | scan_skip_whitespace(scanner); | ||
400 | if (!scan_has_next(scanner)) { | ||
401 | return emit_token(*scanner, scanner, TOK_EOF); | ||
402 | } | ||
403 | |||
404 | Scanner current = *scanner; | ||
405 | char c = scan_next(scanner); | ||
406 | switch (c) { | ||
407 | case '(': | ||
408 | return emit_token(current, scanner, TOK_LPAREN); | ||
409 | case ')': | ||
410 | return emit_token(current, scanner, TOK_RPAREN); | ||
411 | case '[': | ||
412 | return emit_token(current, scanner, TOK_LSQUARE); | ||
413 | case ']': | ||
414 | return emit_token(current, scanner, TOK_RSQUARE); | ||
415 | case '{': | ||
416 | return emit_token(current, scanner, TOK_LCURLY); | ||
417 | case '}': | ||
418 | return emit_token(current, scanner, TOK_RCURLY); | ||
419 | case '+': { | ||
420 | char p = scan_peek(scanner); | ||
421 | if (p >= '0' && p <= '9') { | ||
422 | *scanner = current; | ||
423 | return emit_token_number(scanner); | ||
424 | } | ||
425 | return emit_token(current, scanner, TOK_ADD); | ||
426 | }; | ||
427 | case '-': { | ||
428 | char p = scan_peek(scanner); | ||
429 | if (p >= '0' && p <= '9') { | ||
430 | *scanner = current; | ||
431 | return emit_token_number(scanner); | ||
432 | } | ||
433 | return emit_token(current, scanner, TOK_ADD); | ||
434 | }; | ||
435 | case '*': | ||
436 | return emit_token(current, scanner, TOK_MUL); | ||
437 | case '/': | ||
438 | return emit_token(current, scanner, TOK_DIV); | ||
439 | case '%': | ||
440 | return emit_token(current, scanner, TOK_MOD); | ||
441 | case '!': { | ||
442 | if (scan_peek(scanner) == '=') { | ||
443 | scan_next(scanner); | ||
444 | return emit_token(current, scanner, TOK_NOTEQ); | ||
445 | } | ||
446 | return emit_token(current, scanner, TOK_NOT); | ||
447 | }; | ||
448 | case '=': { | ||
449 | if (scan_peek(scanner) == '=') { | ||
450 | scan_next(scanner); | ||
451 | return emit_token(current, scanner, TOK_EQ); | ||
452 | } | ||
453 | return emit_token(current, scanner, TOK_ASSIGN); | ||
454 | }; | ||
455 | case '<': { | ||
456 | char p = scan_peek(scanner); | ||
457 | if (p == '=') { | ||
458 | scan_next(scanner); | ||
459 | return emit_token(current, scanner, TOK_LE); | ||
460 | } | ||
461 | if (p == '<') { | ||
462 | scan_next(scanner); | ||
463 | return emit_token(current, scanner, TOK_BITLSHIFT); | ||
464 | } | ||
465 | return emit_token(current, scanner, TOK_LT); | ||
466 | }; | ||
467 | case '>': { | ||
468 | char p = scan_peek(scanner); | ||
469 | if (p == '=') { | ||
470 | scan_next(scanner); | ||
471 | return emit_token(current, scanner, TOK_GE); | ||
472 | } | ||
473 | if (p == '>') { | ||
474 | scan_next(scanner); | ||
475 | return emit_token(current, scanner, TOK_BITRSHIFT); | ||
476 | } | ||
477 | return emit_token(current, scanner, TOK_GT); | ||
478 | }; | ||
479 | case '~': | ||
480 | return emit_token(current, scanner, TOK_BITNOT); | ||
481 | case '&': { | ||
482 | if (scan_peek(scanner) == '&') { | ||
483 | scan_next(scanner); | ||
484 | return emit_token(current, scanner, TOK_AND); | ||
485 | } | ||
486 | return emit_token(current, scanner, TOK_BITAND); | ||
487 | }; | ||
488 | case '|': { | ||
489 | if (scan_peek(scanner) == '|') { | ||
490 | scan_next(scanner); | ||
491 | return emit_token(current, scanner, TOK_OR); | ||
492 | } | ||
493 | return emit_token(current, scanner, TOK_BITOR); | ||
494 | }; | ||
495 | case ':': | ||
496 | return emit_token(current, scanner, TOK_COLON); | ||
497 | case '.': | ||
498 | return emit_token(current, scanner, TOK_DOT); | ||
499 | case '@': | ||
500 | return emit_token(current, scanner, TOK_AT); | ||
501 | case '"': { | ||
502 | while (scan_has_next(scanner)) { | ||
503 | c = scan_next(scanner); | ||
504 | if (c == '\\') { | ||
505 | scan_next(scanner); | ||
506 | continue; | ||
507 | } | ||
508 | if (c == '"') { | ||
509 | return emit_token(current, scanner, TOK_STRING); | ||
510 | } | ||
511 | } | ||
512 | return emit_token_err(&current, cstr("mismatched string quotes")); | ||
513 | }; | ||
514 | } | ||
515 | if (c >= '0' && c <= '9') { | ||
516 | *scanner = current; | ||
517 | return emit_token_number(scanner); | ||
518 | } | ||
519 | |||
520 | scan_skip_until_valid(scanner); | ||
521 | Str val = current.str; | ||
522 | val.size = current.str.size - scanner->str.size; | ||
523 | val.size = val.size < 0 ? 0 : val.size; | ||
524 | if (val.size == 0) { | ||
525 | return emit_token_err(&current, cstr("unexpected character")); | ||
526 | } | ||
527 | switch (val.mem[0]) { | ||
528 | case 'b': { | ||
529 | if (str_has_prefix(val, cstr("break"))) { | ||
530 | return emit_token(current, scanner, TOK_BREAK); | ||
531 | } | ||
532 | } break; | ||
533 | case 'c': { | ||
534 | if (str_has_prefix(val, cstr("case"))) { | ||
535 | return emit_token(current, scanner, TOK_CASE); | ||
536 | } | ||
537 | if (str_has_prefix(val, cstr("continue"))) { | ||
538 | return emit_token(current, scanner, TOK_CONTINUE); | ||
539 | } | ||
540 | } break; | ||
541 | case 'f': { | ||
542 | if (str_has_prefix(val, cstr("false"))) { | ||
543 | return emit_token(current, scanner, TOK_FALSE); | ||
544 | } | ||
545 | if (str_has_prefix(val, cstr("fun"))) { | ||
546 | return emit_token(current, scanner, TOK_FUN); | ||
547 | } | ||
548 | } break; | ||
549 | case 'i': { | ||
550 | if (str_has_prefix(val, cstr("if"))) { | ||
551 | return emit_token(current, scanner, TOK_IF); | ||
552 | } | ||
553 | } break; | ||
554 | case 'l': { | ||
555 | if (str_has_prefix(val, cstr("let"))) { | ||
556 | return emit_token(current, scanner, TOK_LET); | ||
557 | } | ||
558 | } break; | ||
559 | case 'm': { | ||
560 | if (str_has_prefix(val, cstr("match"))) { | ||
561 | return emit_token(current, scanner, TOK_MATCH); | ||
562 | } | ||
563 | } break; | ||
564 | case 'n': { | ||
565 | if (str_has_prefix(val, cstr("nil"))) { | ||
566 | return emit_token(current, scanner, TOK_NIL); | ||
567 | } | ||
568 | } break; | ||
569 | case 'r': { | ||
570 | if (str_has_prefix(val, cstr("return"))) { | ||
571 | return emit_token(current, scanner, TOK_RETURN); | ||
572 | } | ||
573 | } break; | ||
574 | case 's': { | ||
575 | if (str_has_prefix(val, cstr("set"))) { | ||
576 | return emit_token(current, scanner, TOK_SET); | ||
577 | } | ||
578 | if (str_has_prefix(val, cstr("struct"))) { | ||
579 | return emit_token(current, scanner, TOK_STRUCT); | ||
580 | } | ||
581 | } break; | ||
582 | case 't': { | ||
583 | if (str_has_prefix(val, cstr("true"))) { | ||
584 | return emit_token(current, scanner, TOK_TRUE); | ||
585 | } | ||
586 | } break; | ||
587 | case 'w': { | ||
588 | if (str_has_prefix(val, cstr("while"))) { | ||
589 | return emit_token(current, scanner, TOK_WHILE); | ||
590 | } | ||
591 | } break; | ||
286 | } | 592 | } |
287 | return tokens; | 593 | return emit_token(current, scanner, TOK_SYMBOL); |
288 | } | 594 | } |
diff --git a/src/lexer.h b/src/lexer.h deleted file mode 100644 index 949abaf..0000000 --- a/src/lexer.h +++ /dev/null | |||
@@ -1,99 +0,0 @@ | |||
1 | #ifndef BDL_LEXER_H | ||
2 | #define BDL_LEXER_H | ||
3 | |||
4 | #include "string_view.h" | ||
5 | |||
6 | typedef enum TokenType { | ||
7 | TOKEN_UNKNOWN = 0, | ||
8 | |||
9 | // Parentheses. | ||
10 | TOKEN_LPAREN, | ||
11 | TOKEN_RPAREN, | ||
12 | TOKEN_LSQUARE, | ||
13 | TOKEN_RSQUARE, | ||
14 | TOKEN_LCURLY, | ||
15 | TOKEN_RCURLY, | ||
16 | |||
17 | // Primitive types. | ||
18 | TOKEN_NUMBER, | ||
19 | TOKEN_SYMBOL, | ||
20 | TOKEN_STRING, | ||
21 | TOKEN_NIL, | ||
22 | TOKEN_TRUE, | ||
23 | TOKEN_FALSE, | ||
24 | |||
25 | // Keywords. | ||
26 | TOKEN_LAMBDA, | ||
27 | TOKEN_IF, | ||
28 | TOKEN_DEF, | ||
29 | TOKEN_SET, | ||
30 | TOKEN_FUN, | ||
31 | TOKEN_STRUCT, | ||
32 | |||
33 | // Arithmetic ops. | ||
34 | TOKEN_ADD, | ||
35 | TOKEN_SUB, | ||
36 | TOKEN_MUL, | ||
37 | TOKEN_DIV, | ||
38 | TOKEN_MOD, | ||
39 | |||
40 | // Boolean operations. | ||
41 | TOKEN_NOT, | ||
42 | TOKEN_AND, | ||
43 | TOKEN_OR, | ||
44 | TOKEN_EQ, | ||
45 | TOKEN_LT, | ||
46 | TOKEN_GT, | ||
47 | TOKEN_LE, | ||
48 | TOKEN_GE, | ||
49 | |||
50 | // Special operators. | ||
51 | TOKEN_COLON, | ||
52 | TOKEN_DOT, | ||
53 | TOKEN_AT, | ||
54 | |||
55 | // End of file. | ||
56 | TOKEN_EOF, | ||
57 | } TokenType; | ||
58 | |||
59 | typedef struct Token { | ||
60 | TokenType type; | ||
61 | StringView value; | ||
62 | size_t line; | ||
63 | size_t col; | ||
64 | } Token; | ||
65 | |||
66 | typedef struct Scanner { | ||
67 | StringView current; | ||
68 | size_t line_number; | ||
69 | size_t col_number; | ||
70 | size_t offset; | ||
71 | } Scanner; | ||
72 | |||
73 | // Print a token to standard output for debugging purposes. | ||
74 | void print_token(Token tok); | ||
75 | |||
76 | // Same functionality as with StringView, but keeping track of line and column | ||
77 | // numbers. | ||
78 | char scan_next(Scanner *scanner); | ||
79 | char scan_peek(const Scanner *scanner); | ||
80 | |||
81 | // Check if the current scanner still have characters left. | ||
82 | bool scan_has_next(const Scanner *scanner); | ||
83 | |||
84 | // Advance the scanner until we ran out of whitespace. | ||
85 | void skip_whitespace(Scanner *scanner); | ||
86 | |||
87 | // Check if a given character is a delimiter. | ||
88 | bool is_delimiter(char c); | ||
89 | |||
90 | // Extract the token type from the current string. | ||
91 | TokenType find_token_type(const StringView value); | ||
92 | |||
93 | // Generate a list of tokens from the given string. | ||
94 | Token * tokenize(const StringView *sv); | ||
95 | |||
96 | // Display tokens from token list. | ||
97 | void print_tokens(Token *tokens); | ||
98 | |||
99 | #endif // BDL_LEXER_H | ||
@@ -3,6 +3,7 @@ | |||
3 | #include <stdlib.h> | 3 | #include <stdlib.h> |
4 | 4 | ||
5 | #include "badlib.h" | 5 | #include "badlib.h" |
6 | #include "lexer.c" | ||
6 | 7 | ||
7 | typedef enum ExecMode { | 8 | typedef enum ExecMode { |
8 | RUN_NORMAL, | 9 | RUN_NORMAL, |
@@ -14,607 +15,11 @@ typedef enum ExecMode { | |||
14 | 15 | ||
15 | static ExecMode mode = RUN_NORMAL; | 16 | static ExecMode mode = RUN_NORMAL; |
16 | 17 | ||
17 | #define LEXER_MEM GB(2) | ||
18 | |||
19 | void | 18 | void |
20 | init(void) { | 19 | init(void) { |
21 | log_init_default(); | 20 | log_init_default(); |
22 | } | 21 | } |
23 | 22 | ||
24 | typedef enum TokenType { | ||
25 | TOK_UNKNOWN = 0, | ||
26 | |||
27 | // Parentheses. | ||
28 | TOK_LPAREN, // ( | ||
29 | TOK_RPAREN, // ) | ||
30 | TOK_LSQUARE, // [ | ||
31 | TOK_RSQUARE, // ] | ||
32 | TOK_LCURLY, // { | ||
33 | TOK_RCURLY, // } | ||
34 | |||
35 | // Basic literals. | ||
36 | TOK_NUMBER, | ||
37 | TOK_SYMBOL, | ||
38 | TOK_STRING, | ||
39 | |||
40 | // Keywords. | ||
41 | TOK_BREAK, // break | ||
42 | TOK_CASE, // case | ||
43 | TOK_CONTINUE, // continue | ||
44 | TOK_FALSE, // false | ||
45 | TOK_FUN, // fun | ||
46 | TOK_IF, // if | ||
47 | TOK_LET, // let | ||
48 | TOK_MATCH, // match | ||
49 | TOK_NIL, // nil | ||
50 | TOK_RETURN, // return | ||
51 | TOK_SET, // set | ||
52 | TOK_STRUCT, // struct | ||
53 | TOK_TRUE, // true | ||
54 | TOK_WHILE, // while | ||
55 | |||
56 | // Arithmetic ops. | ||
57 | TOK_ADD, // + | ||
58 | TOK_SUB, // - | ||
59 | TOK_MUL, // * | ||
60 | TOK_DIV, // / | ||
61 | TOK_MOD, // % | ||
62 | |||
63 | // Logical ops. | ||
64 | TOK_NOT, // ! | ||
65 | TOK_AND, // && | ||
66 | TOK_OR, // || | ||
67 | TOK_EQ, // == | ||
68 | TOK_NOTEQ, // != | ||
69 | TOK_LT, // < | ||
70 | TOK_GT, // > | ||
71 | TOK_LE, // <= | ||
72 | TOK_GE, // >= | ||
73 | |||
74 | // Bitwise ops. | ||
75 | TOK_BITNOT, // ~ | ||
76 | TOK_BITAND, // & | ||
77 | TOK_BITOR, // | | ||
78 | TOK_BITLSHIFT, // << | ||
79 | TOK_BITRSHIFT, // >> | ||
80 | |||
81 | // Special ops. | ||
82 | TOK_COLON, // : | ||
83 | TOK_DOT, // . | ||
84 | TOK_AT, // @ | ||
85 | TOK_ASSIGN, // = | ||
86 | |||
87 | // End of file. | ||
88 | TOK_EOF, | ||
89 | } TokenType; | ||
90 | |||
91 | Str token_str[] = { | ||
92 | [TOK_UNKNOWN] = cstr("UNKNOWN"), | ||
93 | |||
94 | // Parentheses. | ||
95 | [TOK_LPAREN] = cstr("LPAREN"), | ||
96 | [TOK_RPAREN] = cstr("RPAREN"), | ||
97 | [TOK_LSQUARE] = cstr("LSQUARE"), | ||
98 | [TOK_RSQUARE] = cstr("RSQUARE"), | ||
99 | [TOK_LCURLY] = cstr("LCURLY"), | ||
100 | [TOK_RCURLY] = cstr("RCURLY"), | ||
101 | |||
102 | // Basic literals. | ||
103 | [TOK_NUMBER] = cstr("NUMBER"), | ||
104 | [TOK_SYMBOL] = cstr("SYMBOL"), | ||
105 | [TOK_STRING] = cstr("STRING"), | ||
106 | |||
107 | // Keywords. | ||
108 | [TOK_BREAK] = cstr("BREAK"), | ||
109 | [TOK_CASE] = cstr("CASE"), | ||
110 | [TOK_CONTINUE] = cstr("CONTINUE"), | ||
111 | [TOK_FALSE] = cstr("FALSE"), | ||
112 | [TOK_FUN] = cstr("FUN"), | ||
113 | [TOK_IF] = cstr("IF"), | ||
114 | [TOK_LET] = cstr("LET"), | ||
115 | [TOK_MATCH] = cstr("MATCH"), | ||
116 | [TOK_NIL] = cstr("NIL"), | ||
117 | [TOK_RETURN] = cstr("RETURN"), | ||
118 | [TOK_SET] = cstr("SET"), | ||
119 | [TOK_STRUCT] = cstr("STRUCT"), | ||
120 | [TOK_TRUE] = cstr("TRUE"), | ||
121 | [TOK_WHILE] = cstr("WHILE"), | ||
122 | |||
123 | // Arithmetic ops. | ||
124 | [TOK_ADD] = cstr("ADD"), | ||
125 | [TOK_SUB] = cstr("SUB"), | ||
126 | [TOK_MUL] = cstr("MUL"), | ||
127 | [TOK_DIV] = cstr("DIV"), | ||
128 | [TOK_MOD] = cstr("MOD"), | ||
129 | |||
130 | // Logical ops. | ||
131 | [TOK_NOT] = cstr("NOT"), | ||
132 | [TOK_AND] = cstr("AND"), | ||
133 | [TOK_OR] = cstr("OR"), | ||
134 | [TOK_EQ] = cstr("EQ"), | ||
135 | [TOK_NOTEQ] = cstr("NOTEQ"), | ||
136 | [TOK_LT] = cstr("LT"), | ||
137 | [TOK_GT] = cstr("GT"), | ||
138 | [TOK_LE] = cstr("LE"), | ||
139 | [TOK_GE] = cstr("GE"), | ||
140 | |||
141 | // Bitwise ops. | ||
142 | [TOK_BITNOT] = cstr("BITNOT"), | ||
143 | [TOK_BITAND] = cstr("BITAND"), | ||
144 | [TOK_BITOR] = cstr("BITOR"), | ||
145 | [TOK_BITLSHIFT] = cstr("BITLSHIFT"), | ||
146 | [TOK_BITRSHIFT] = cstr("BITRSHIFT"), | ||
147 | |||
148 | // Special ops. | ||
149 | [TOK_COLON] = cstr("COLON"), | ||
150 | [TOK_DOT] = cstr("DOT"), | ||
151 | [TOK_AT] = cstr("AT"), | ||
152 | [TOK_ASSIGN] = cstr("ASSIGN"), | ||
153 | |||
154 | // End of file. | ||
155 | [TOK_EOF] = cstr("EOF"), | ||
156 | }; | ||
157 | |||
158 | typedef struct Token { | ||
159 | TokenType type; | ||
160 | Str val; | ||
161 | sz line; | ||
162 | sz col; | ||
163 | } Token; | ||
164 | |||
165 | typedef struct Scanner { | ||
166 | Str str; | ||
167 | sz line; | ||
168 | sz col; | ||
169 | Arena *storage; | ||
170 | } Scanner; | ||
171 | |||
172 | char | ||
173 | scan_next(Scanner *scanner) { | ||
174 | char c = str_next(&scanner->str); | ||
175 | if (c == '\n') { | ||
176 | scanner->line++; | ||
177 | scanner->col = 0; | ||
178 | } else { | ||
179 | scanner->col++; | ||
180 | } | ||
181 | return c; | ||
182 | } | ||
183 | |||
184 | bool | ||
185 | scan_has_next(Scanner *scanner) { | ||
186 | return scanner->str.size; | ||
187 | } | ||
188 | |||
189 | char | ||
190 | scan_peek(Scanner *scanner) { | ||
191 | return str_peek(scanner->str); | ||
192 | } | ||
193 | |||
194 | Token | ||
195 | emit_token(Scanner current, Scanner *scanner, TokenType t) { | ||
196 | Str val = current.str; | ||
197 | val.size = current.str.size - scanner->str.size; | ||
198 | val.size = val.size < 0 ? 0 : val.size; | ||
199 | return (Token){ | ||
200 | .val = val, | ||
201 | .line = current.line + 1, | ||
202 | .col = current.col + 1, | ||
203 | .type = t, | ||
204 | }; | ||
205 | } | ||
206 | |||
207 | Token | ||
208 | emit_token_err(Scanner *scanner, Str err_msg) { | ||
209 | return (Token){ | ||
210 | .line = scanner->line + 1, | ||
211 | .col = scanner->col + 1, | ||
212 | .val = err_msg, | ||
213 | .type = TOK_UNKNOWN, | ||
214 | }; | ||
215 | } | ||
216 | |||
217 | void | ||
218 | scan_skip_line(Scanner *scanner) { | ||
219 | SearchResult newline = array_find_next(scanner->str, cstr("\n")); | ||
220 | if (newline.found) { | ||
221 | scanner->str.mem += newline.pos + 1; | ||
222 | scanner->str.size -= newline.pos + 1; | ||
223 | scanner->line++; | ||
224 | scanner->col = 0; | ||
225 | } | ||
226 | } | ||
227 | |||
228 | void | ||
229 | scan_skip_whitespace(Scanner *scanner) { | ||
230 | while (scan_has_next(scanner)) { | ||
231 | char c = scan_peek(scanner); | ||
232 | switch (c) { | ||
233 | case ' ': | ||
234 | case ',': // Commas are just syntactic sugar. | ||
235 | case '\f': | ||
236 | case '\n': | ||
237 | case '\r': | ||
238 | case '\t': | ||
239 | case '\v': { | ||
240 | scan_next(scanner); | ||
241 | } break; | ||
242 | case ';': { | ||
243 | // Found a comment! (skip) | ||
244 | scan_skip_line(scanner); | ||
245 | } break; | ||
246 | default: { | ||
247 | return; | ||
248 | } break; | ||
249 | } | ||
250 | } | ||
251 | } | ||
252 | |||
253 | bool | ||
254 | is_valid_split(char c) { | ||
255 | switch (c) { | ||
256 | case ';': | ||
257 | case '(': | ||
258 | case ')': | ||
259 | case '[': | ||
260 | case ']': | ||
261 | case '{': | ||
262 | case '}': | ||
263 | case '+': | ||
264 | case '-': | ||
265 | case '*': | ||
266 | case '/': | ||
267 | case '%': | ||
268 | case '!': | ||
269 | case '=': | ||
270 | case '<': | ||
271 | case '>': | ||
272 | case '~': | ||
273 | case '&': | ||
274 | case '|': | ||
275 | case ':': | ||
276 | case '.': | ||
277 | case '@': | ||
278 | case '"': | ||
279 | case ' ': | ||
280 | case ',': | ||
281 | case '\f': | ||
282 | case '\n': | ||
283 | case '\r': | ||
284 | case '\t': | ||
285 | case '\v': { | ||
286 | return true; | ||
287 | } break; | ||
288 | } | ||
289 | return false; | ||
290 | } | ||
291 | |||
292 | void | ||
293 | scan_skip_until_valid(Scanner *scanner) { | ||
294 | while (scan_has_next(scanner)) { | ||
295 | char c = scan_peek(scanner); | ||
296 | if (is_valid_split(c)) { | ||
297 | return; | ||
298 | } | ||
299 | scan_next(scanner); | ||
300 | } | ||
301 | } | ||
302 | |||
303 | Token | ||
304 | emit_token_number(Scanner *scanner) { | ||
305 | Scanner current = *scanner; | ||
306 | char c = scan_peek(scanner); | ||
307 | if (c == '+' || c == '-') { | ||
308 | scan_next(scanner); | ||
309 | if (str_has_prefix(scanner->str, cstr("0b")) || | ||
310 | str_has_prefix(scanner->str, cstr("0x"))) { | ||
311 | scan_skip_until_valid(scanner); | ||
312 | return emit_token_err( | ||
313 | &current, | ||
314 | cstr("malformed number: binary/hex numbers can't be signed")); | ||
315 | } | ||
316 | } | ||
317 | if (str_has_prefix(scanner->str, cstr("0b"))) { | ||
318 | scan_next(scanner); | ||
319 | scan_next(scanner); | ||
320 | while (scan_has_next(scanner)) { | ||
321 | c = scan_peek(scanner); | ||
322 | if (c == '0' || c == '1' || c == '_') { | ||
323 | scan_next(scanner); | ||
324 | continue; | ||
325 | } | ||
326 | if (is_valid_split(c)) { | ||
327 | return emit_token(current, scanner, TOK_NUMBER); | ||
328 | } | ||
329 | scan_skip_until_valid(scanner); | ||
330 | return emit_token_err( | ||
331 | &current, cstr("malformed number: invalid binary number")); | ||
332 | } | ||
333 | } else if (str_has_prefix(scanner->str, cstr("0x"))) { | ||
334 | scan_next(scanner); | ||
335 | scan_next(scanner); | ||
336 | while (scan_has_next(scanner)) { | ||
337 | c = scan_peek(scanner); | ||
338 | if ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || | ||
339 | (c >= 'A' && c <= 'F') || c == '_') { | ||
340 | scan_next(scanner); | ||
341 | continue; | ||
342 | } | ||
343 | if (is_valid_split(c)) { | ||
344 | return emit_token(current, scanner, TOK_NUMBER); | ||
345 | } | ||
346 | scan_skip_until_valid(scanner); | ||
347 | return emit_token_err(&current, | ||
348 | cstr("malformed number: invalid hex number")); | ||
349 | } | ||
350 | } else { | ||
351 | // Integral. | ||
352 | while (scan_has_next(scanner)) { | ||
353 | c = scan_peek(scanner); | ||
354 | if (c == '.') { | ||
355 | scan_next(scanner); | ||
356 | break; | ||
357 | } | ||
358 | if ((c >= '0' && c <= '9') || c == '_') { | ||
359 | scan_next(scanner); | ||
360 | continue; | ||
361 | } | ||
362 | if (is_valid_split(c)) { | ||
363 | return emit_token(current, scanner, TOK_NUMBER); | ||
364 | } | ||
365 | scan_skip_until_valid(scanner); | ||
366 | return emit_token_err(&current, cstr("malformed number")); | ||
367 | } | ||
368 | c = scan_peek(scanner); | ||
369 | if (!(c >= '0' && c <= '9')) { | ||
370 | return emit_token_err(&current, | ||
371 | cstr("malformed number: no decimal digits")); | ||
372 | } | ||
373 | // Decimals. | ||
374 | while (scan_has_next(scanner)) { | ||
375 | c = scan_peek(scanner); | ||
376 | if (c == 'e' || c == 'E') { | ||
377 | scan_next(scanner); | ||
378 | break; | ||
379 | } | ||
380 | if ((c >= '0' && c <= '9') || c == '_') { | ||
381 | scan_next(scanner); | ||
382 | continue; | ||
383 | } | ||
384 | if (is_valid_split(c)) { | ||
385 | return emit_token(current, scanner, TOK_NUMBER); | ||
386 | } | ||
387 | scan_skip_until_valid(scanner); | ||
388 | return emit_token_err(&current, cstr("malformed number")); | ||
389 | } | ||
390 | // Exponent. | ||
391 | c = scan_peek(scanner); | ||
392 | if (c == '+' || c == '-') { | ||
393 | scan_next(scanner); | ||
394 | } | ||
395 | while (scan_has_next(scanner)) { | ||
396 | c = scan_peek(scanner); | ||
397 | if ((c >= '0' && c <= '9') || c == '_') { | ||
398 | scan_next(scanner); | ||
399 | continue; | ||
400 | } | ||
401 | if (c == '.') { | ||
402 | scan_next(scanner); | ||
403 | return emit_token_err( | ||
404 | &current, | ||
405 | cstr("malformed number: decimals not allowed on exponent")); | ||
406 | } | ||
407 | if (is_valid_split(c)) { | ||
408 | return emit_token(current, scanner, TOK_NUMBER); | ||
409 | } | ||
410 | scan_skip_until_valid(scanner); | ||
411 | return emit_token_err(&current, cstr("malformed number")); | ||
412 | } | ||
413 | } | ||
414 | return emit_token_err(&current, cstr("malformed number")); | ||
415 | } | ||
416 | |||
417 | Token | ||
418 | scan_token(Scanner *scanner) { | ||
419 | assert(scanner); | ||
420 | |||
421 | scan_skip_whitespace(scanner); | ||
422 | if (!scan_has_next(scanner)) { | ||
423 | return emit_token(*scanner, scanner, TOK_EOF); | ||
424 | } | ||
425 | |||
426 | Scanner current = *scanner; | ||
427 | char c = scan_next(scanner); | ||
428 | switch (c) { | ||
429 | case '(': | ||
430 | return emit_token(current, scanner, TOK_LPAREN); | ||
431 | case ')': | ||
432 | return emit_token(current, scanner, TOK_RPAREN); | ||
433 | case '[': | ||
434 | return emit_token(current, scanner, TOK_LSQUARE); | ||
435 | case ']': | ||
436 | return emit_token(current, scanner, TOK_RSQUARE); | ||
437 | case '{': | ||
438 | return emit_token(current, scanner, TOK_LCURLY); | ||
439 | case '}': | ||
440 | return emit_token(current, scanner, TOK_RCURLY); | ||
441 | case '+': { | ||
442 | char p = scan_peek(scanner); | ||
443 | if (p >= '0' && p <= '9') { | ||
444 | *scanner = current; | ||
445 | return emit_token_number(scanner); | ||
446 | } | ||
447 | return emit_token(current, scanner, TOK_ADD); | ||
448 | }; | ||
449 | case '-': { | ||
450 | char p = scan_peek(scanner); | ||
451 | if (p >= '0' && p <= '9') { | ||
452 | *scanner = current; | ||
453 | return emit_token_number(scanner); | ||
454 | } | ||
455 | return emit_token(current, scanner, TOK_ADD); | ||
456 | }; | ||
457 | case '*': | ||
458 | return emit_token(current, scanner, TOK_MUL); | ||
459 | case '/': | ||
460 | return emit_token(current, scanner, TOK_DIV); | ||
461 | case '%': | ||
462 | return emit_token(current, scanner, TOK_MOD); | ||
463 | case '!': { | ||
464 | if (scan_peek(scanner) == '=') { | ||
465 | scan_next(scanner); | ||
466 | return emit_token(current, scanner, TOK_NOTEQ); | ||
467 | } | ||
468 | return emit_token(current, scanner, TOK_NOT); | ||
469 | }; | ||
470 | case '=': { | ||
471 | if (scan_peek(scanner) == '=') { | ||
472 | scan_next(scanner); | ||
473 | return emit_token(current, scanner, TOK_EQ); | ||
474 | } | ||
475 | return emit_token(current, scanner, TOK_ASSIGN); | ||
476 | }; | ||
477 | case '<': { | ||
478 | char p = scan_peek(scanner); | ||
479 | if (p == '=') { | ||
480 | scan_next(scanner); | ||
481 | return emit_token(current, scanner, TOK_LE); | ||
482 | } | ||
483 | if (p == '<') { | ||
484 | scan_next(scanner); | ||
485 | return emit_token(current, scanner, TOK_BITLSHIFT); | ||
486 | } | ||
487 | return emit_token(current, scanner, TOK_LT); | ||
488 | }; | ||
489 | case '>': { | ||
490 | char p = scan_peek(scanner); | ||
491 | if (p == '=') { | ||
492 | scan_next(scanner); | ||
493 | return emit_token(current, scanner, TOK_GE); | ||
494 | } | ||
495 | if (p == '>') { | ||
496 | scan_next(scanner); | ||
497 | return emit_token(current, scanner, TOK_BITRSHIFT); | ||
498 | } | ||
499 | return emit_token(current, scanner, TOK_GT); | ||
500 | }; | ||
501 | case '~': | ||
502 | return emit_token(current, scanner, TOK_BITNOT); | ||
503 | case '&': { | ||
504 | if (scan_peek(scanner) == '&') { | ||
505 | scan_next(scanner); | ||
506 | return emit_token(current, scanner, TOK_AND); | ||
507 | } | ||
508 | return emit_token(current, scanner, TOK_BITAND); | ||
509 | }; | ||
510 | case '|': { | ||
511 | if (scan_peek(scanner) == '|') { | ||
512 | scan_next(scanner); | ||
513 | return emit_token(current, scanner, TOK_OR); | ||
514 | } | ||
515 | return emit_token(current, scanner, TOK_BITOR); | ||
516 | }; | ||
517 | case ':': | ||
518 | return emit_token(current, scanner, TOK_COLON); | ||
519 | case '.': | ||
520 | return emit_token(current, scanner, TOK_DOT); | ||
521 | case '@': | ||
522 | return emit_token(current, scanner, TOK_AT); | ||
523 | case '"': { | ||
524 | while (scan_has_next(scanner)) { | ||
525 | c = scan_next(scanner); | ||
526 | if (c == '\\') { | ||
527 | scan_next(scanner); | ||
528 | continue; | ||
529 | } | ||
530 | if (c == '"') { | ||
531 | return emit_token(current, scanner, TOK_STRING); | ||
532 | } | ||
533 | } | ||
534 | return emit_token_err(&current, cstr("mismatched string quotes")); | ||
535 | }; | ||
536 | } | ||
537 | if (c >= '0' && c <= '9') { | ||
538 | *scanner = current; | ||
539 | return emit_token_number(scanner); | ||
540 | } | ||
541 | |||
542 | scan_skip_until_valid(scanner); | ||
543 | Str val = current.str; | ||
544 | val.size = current.str.size - scanner->str.size; | ||
545 | val.size = val.size < 0 ? 0 : val.size; | ||
546 | if (val.size == 0) { | ||
547 | return emit_token_err(&current, cstr("unexpected character")); | ||
548 | } | ||
549 | switch (val.mem[0]) { | ||
550 | case 'b': { | ||
551 | if (str_has_prefix(val, cstr("break"))) { | ||
552 | return emit_token(current, scanner, TOK_BREAK); | ||
553 | } | ||
554 | } break; | ||
555 | case 'c': { | ||
556 | if (str_has_prefix(val, cstr("case"))) { | ||
557 | return emit_token(current, scanner, TOK_CASE); | ||
558 | } | ||
559 | if (str_has_prefix(val, cstr("continue"))) { | ||
560 | return emit_token(current, scanner, TOK_CONTINUE); | ||
561 | } | ||
562 | } break; | ||
563 | case 'f': { | ||
564 | if (str_has_prefix(val, cstr("false"))) { | ||
565 | return emit_token(current, scanner, TOK_FALSE); | ||
566 | } | ||
567 | if (str_has_prefix(val, cstr("fun"))) { | ||
568 | return emit_token(current, scanner, TOK_FUN); | ||
569 | } | ||
570 | } break; | ||
571 | case 'i': { | ||
572 | if (str_has_prefix(val, cstr("if"))) { | ||
573 | return emit_token(current, scanner, TOK_IF); | ||
574 | } | ||
575 | } break; | ||
576 | case 'l': { | ||
577 | if (str_has_prefix(val, cstr("let"))) { | ||
578 | return emit_token(current, scanner, TOK_LET); | ||
579 | } | ||
580 | } break; | ||
581 | case 'm': { | ||
582 | if (str_has_prefix(val, cstr("match"))) { | ||
583 | return emit_token(current, scanner, TOK_MATCH); | ||
584 | } | ||
585 | } break; | ||
586 | case 'n': { | ||
587 | if (str_has_prefix(val, cstr("nil"))) { | ||
588 | return emit_token(current, scanner, TOK_NIL); | ||
589 | } | ||
590 | } break; | ||
591 | case 'r': { | ||
592 | if (str_has_prefix(val, cstr("return"))) { | ||
593 | return emit_token(current, scanner, TOK_RETURN); | ||
594 | } | ||
595 | } break; | ||
596 | case 's': { | ||
597 | if (str_has_prefix(val, cstr("set"))) { | ||
598 | return emit_token(current, scanner, TOK_SET); | ||
599 | } | ||
600 | if (str_has_prefix(val, cstr("struct"))) { | ||
601 | return emit_token(current, scanner, TOK_STRUCT); | ||
602 | } | ||
603 | } break; | ||
604 | case 't': { | ||
605 | if (str_has_prefix(val, cstr("true"))) { | ||
606 | return emit_token(current, scanner, TOK_TRUE); | ||
607 | } | ||
608 | } break; | ||
609 | case 'w': { | ||
610 | if (str_has_prefix(val, cstr("while"))) { | ||
611 | return emit_token(current, scanner, TOK_WHILE); | ||
612 | } | ||
613 | } break; | ||
614 | } | ||
615 | return emit_token(current, scanner, TOK_SYMBOL); | ||
616 | } | ||
617 | |||
618 | void | 23 | void |
619 | process_file(Str path) { | 24 | process_file(Str path) { |
620 | Arena lexer_arena = arena_create(LEXER_MEM, os_allocator); | 25 | Arena lexer_arena = arena_create(LEXER_MEM, os_allocator); |
@@ -628,36 +33,24 @@ process_file(Str path) { | |||
628 | 33 | ||
629 | Scanner scanner = { | 34 | Scanner scanner = { |
630 | .str = file.data, | 35 | .str = file.data, |
631 | .storage = &lexer_arena, | ||
632 | }; | 36 | }; |
633 | Token tok = {0}; | 37 | Token tok = {0}; |
38 | sz errors = 0; | ||
634 | while (tok.type != TOK_EOF) { | 39 | while (tok.type != TOK_EOF) { |
635 | tok = scan_token(&scanner); | 40 | tok = scan_token(&scanner); |
636 | eprintln("%s:%d:%d:%s %s", path, tok.line, tok.col, token_str[tok.type], | 41 | if (tok.type == TOK_UNKNOWN) { |
637 | tok.val); | 42 | eprintln("%s:%d:%d:%s %s", path, tok.line, tok.col, |
43 | token_str[tok.type], tok.val); | ||
44 | errors++; | ||
45 | } | ||
638 | } | 46 | } |
639 | // while (true) { | ||
640 | // Token tok = scan_token(&scanner); | ||
641 | // println("%s:%d:%d:%s %s", path, tok.line, tok.col, | ||
642 | // token_str[tok.type], | ||
643 | // tok.val); | ||
644 | // if (tok.type == TOK_EOF) break; | ||
645 | // } | ||
646 | 47 | ||
647 | // Str scanner = file.data; | 48 | // Only proceed if there are no errors. |
648 | // // NOTE: Testing file read line by line. | 49 | if (errors) { |
649 | // for (sz i = 0; scanner.size != 0; i++) { | 50 | goto stop; |
650 | // Str line = str_split(&scanner, cstr("\n")); | 51 | } |
651 | // println("%x{4} %s", i + 1, line); | ||
652 | // } | ||
653 | |||
654 | // println("<<< %x{4} %b{4} %f{2} %s %{Arena} >>>", 123, 3, 1.345, | ||
655 | // cstr("BOOM!"), &logger_inf.storage); | ||
656 | 52 | ||
657 | // println("%{Mem}", &(Array){lexer_arena.beg, lexer_arena.size}); | 53 | stop: |
658 | // eprintln("%s:%d:%d: %s -> %c", path, 1, 1, cstr("error: testing string | ||
659 | // logger"), 'X'); while (true) {} | ||
660 | // TODO: run lexer. | ||
661 | // Free up resources. | 54 | // Free up resources. |
662 | arena_destroy(&lexer_arena, os_allocator); | 55 | arena_destroy(&lexer_arena, os_allocator); |
663 | } | 56 | } |