diff options
Diffstat (limited to 'src/lexer.c')
-rw-r--r-- | src/lexer.c | 734 |
1 files changed, 520 insertions, 214 deletions
diff --git a/src/lexer.c b/src/lexer.c index a6d7c74..df998f2 100644 --- a/src/lexer.c +++ b/src/lexer.c | |||
@@ -1,130 +1,192 @@ | |||
1 | #include "lexer.h" | 1 | #define LEXER_MEM GB(2) |
2 | #include "errors.h" | 2 | |
// Kinds of token produced by the lexer. The enumerators double as indices
// into the token_str name table, so their order must stay in sync with it.
typedef enum TokenType {
    TOK_UNKNOWN = 0, // also used for error tokens

    // Brackets.
    TOK_LPAREN,  // (
    TOK_RPAREN,  // )
    TOK_LSQUARE, // [
    TOK_RSQUARE, // ]
    TOK_LCURLY,  // {
    TOK_RCURLY,  // }

    // Literals and identifiers.
    TOK_NUMBER,
    TOK_SYMBOL,
    TOK_STRING,

    // Reserved words.
    TOK_BREAK,    // break
    TOK_CASE,     // case
    TOK_CONTINUE, // continue
    TOK_FALSE,    // false
    TOK_FUN,      // fun
    TOK_IF,       // if
    TOK_LET,      // let
    TOK_MATCH,    // match
    TOK_NIL,      // nil
    TOK_RETURN,   // return
    TOK_SET,      // set
    TOK_STRUCT,   // struct
    TOK_TRUE,     // true
    TOK_WHILE,    // while

    // Arithmetic operators.
    TOK_ADD, // +
    TOK_SUB, // -
    TOK_MUL, // *
    TOK_DIV, // /
    TOK_MOD, // %

    // Logical and comparison operators.
    TOK_NOT,   // !
    TOK_AND,   // &&
    TOK_OR,    // ||
    TOK_EQ,    // ==
    TOK_NOTEQ, // !=
    TOK_LT,    // <
    TOK_GT,    // >
    TOK_LE,    // <=
    TOK_GE,    // >=

    // Bitwise operators.
    TOK_BITNOT,    // ~
    TOK_BITAND,    // &
    TOK_BITOR,     // |
    TOK_BITLSHIFT, // <<
    TOK_BITRSHIFT, // >>

    // Punctuation-like operators.
    TOK_COLON,  // :
    TOK_DOT,    // .
    TOK_AT,     // @
    TOK_ASSIGN, // =

    // End of input.
    TOK_EOF,
} TokenType;
69 | |||
70 | Str token_str[] = { | ||
71 | [TOK_UNKNOWN] = cstr("UNKNOWN"), | ||
72 | |||
73 | // Parentheses. | ||
74 | [TOK_LPAREN] = cstr("LPAREN"), | ||
75 | [TOK_RPAREN] = cstr("RPAREN"), | ||
76 | [TOK_LSQUARE] = cstr("LSQUARE"), | ||
77 | [TOK_RSQUARE] = cstr("RSQUARE"), | ||
78 | [TOK_LCURLY] = cstr("LCURLY"), | ||
79 | [TOK_RCURLY] = cstr("RCURLY"), | ||
80 | |||
81 | // Basic literals. | ||
82 | [TOK_NUMBER] = cstr("NUMBER"), | ||
83 | [TOK_SYMBOL] = cstr("SYMBOL"), | ||
84 | [TOK_STRING] = cstr("STRING"), | ||
85 | |||
86 | // Keywords. | ||
87 | [TOK_BREAK] = cstr("BREAK"), | ||
88 | [TOK_CASE] = cstr("CASE"), | ||
89 | [TOK_CONTINUE] = cstr("CONTINUE"), | ||
90 | [TOK_FALSE] = cstr("FALSE"), | ||
91 | [TOK_FUN] = cstr("FUN"), | ||
92 | [TOK_IF] = cstr("IF"), | ||
93 | [TOK_LET] = cstr("LET"), | ||
94 | [TOK_MATCH] = cstr("MATCH"), | ||
95 | [TOK_NIL] = cstr("NIL"), | ||
96 | [TOK_RETURN] = cstr("RETURN"), | ||
97 | [TOK_SET] = cstr("SET"), | ||
98 | [TOK_STRUCT] = cstr("STRUCT"), | ||
99 | [TOK_TRUE] = cstr("TRUE"), | ||
100 | [TOK_WHILE] = cstr("WHILE"), | ||
101 | |||
102 | // Arithmetic ops. | ||
103 | [TOK_ADD] = cstr("ADD"), | ||
104 | [TOK_SUB] = cstr("SUB"), | ||
105 | [TOK_MUL] = cstr("MUL"), | ||
106 | [TOK_DIV] = cstr("DIV"), | ||
107 | [TOK_MOD] = cstr("MOD"), | ||
42 | 108 | ||
43 | typedef struct Keyword { | 109 | // Logical ops. |
44 | char *str; | 110 | [TOK_NOT] = cstr("NOT"), |
45 | size_t n; | 111 | [TOK_AND] = cstr("AND"), |
46 | TokenType token; | 112 | [TOK_OR] = cstr("OR"), |
47 | } Keyword; | 113 | [TOK_EQ] = cstr("EQ"), |
48 | 114 | [TOK_NOTEQ] = cstr("NOTEQ"), | |
49 | #define KEYWORD(STR,TOK) {(STR), sizeof(STR) - 1, (TOK)} | 115 | [TOK_LT] = cstr("LT"), |
50 | 116 | [TOK_GT] = cstr("GT"), | |
51 | static const Keyword keywords[] = { | 117 | [TOK_LE] = cstr("LE"), |
52 | KEYWORD("nil", TOKEN_NIL), | 118 | [TOK_GE] = cstr("GE"), |
53 | KEYWORD("true", TOKEN_TRUE), | 119 | |
54 | KEYWORD("false", TOKEN_FALSE), | 120 | // Bitwise ops. |
55 | KEYWORD("lambda", TOKEN_LAMBDA), | 121 | [TOK_BITNOT] = cstr("BITNOT"), |
56 | KEYWORD("if", TOKEN_IF), | 122 | [TOK_BITAND] = cstr("BITAND"), |
57 | KEYWORD("def", TOKEN_DEF), | 123 | [TOK_BITOR] = cstr("BITOR"), |
58 | KEYWORD("set", TOKEN_SET), | 124 | [TOK_BITLSHIFT] = cstr("BITLSHIFT"), |
59 | KEYWORD("fun", TOKEN_FUN), | 125 | [TOK_BITRSHIFT] = cstr("BITRSHIFT"), |
60 | KEYWORD("struct", TOKEN_STRUCT), | 126 | |
61 | KEYWORD("+", TOKEN_ADD), | 127 | // Special ops. |
62 | KEYWORD("-", TOKEN_SUB), | 128 | [TOK_COLON] = cstr("COLON"), |
63 | KEYWORD("*", TOKEN_MUL), | 129 | [TOK_DOT] = cstr("DOT"), |
64 | KEYWORD("/", TOKEN_DIV), | 130 | [TOK_AT] = cstr("AT"), |
65 | KEYWORD("%", TOKEN_MOD), | 131 | [TOK_ASSIGN] = cstr("ASSIGN"), |
66 | KEYWORD("not", TOKEN_NOT), | 132 | |
67 | KEYWORD("and", TOKEN_AND), | 133 | // End of file. |
68 | KEYWORD("or", TOKEN_OR), | 134 | [TOK_EOF] = cstr("EOF"), |
69 | KEYWORD("=", TOKEN_EQ), | ||
70 | KEYWORD("<", TOKEN_LT), | ||
71 | KEYWORD(">", TOKEN_GT), | ||
72 | KEYWORD("<=", TOKEN_LE), | ||
73 | KEYWORD(">=", TOKEN_GE), | ||
74 | }; | 135 | }; |
75 | 136 | ||
76 | void | 137 | typedef struct Token { |
77 | print_token(Token tok) { | 138 | TokenType type; |
78 | printf("[%4ld:%-4ld] ", tok.line, tok.col); | 139 | Str val; |
79 | printf("%s", token_str[tok.type]); | 140 | sz line; |
80 | switch (tok.type) { | 141 | sz col; |
81 | case TOKEN_NUMBER: | 142 | } Token; |
82 | case TOKEN_SYMBOL: | 143 | |
83 | case TOKEN_STRING: { | 144 | typedef struct Scanner { |
84 | printf(" -> "); | 145 | Str str; |
85 | sv_write(&tok.value); | 146 | sz line; |
86 | } break; | 147 | sz col; |
87 | default: { | 148 | } Scanner; |
88 | } break; | ||
89 | } | ||
90 | printf("\n"); | ||
91 | } | ||
92 | 149 | ||
93 | char | 150 | char |
94 | scan_next(Scanner *scanner) { | 151 | scan_next(Scanner *scanner) { |
95 | char c = sv_next(&scanner->current); | 152 | char c = str_next(&scanner->str); |
96 | if (c == '\n') { | 153 | if (c == '\n') { |
97 | scanner->line_number++; | 154 | scanner->line++; |
98 | scanner->col_number = 1; | 155 | scanner->col = 0; |
99 | } else { | 156 | } else { |
100 | scanner->col_number++; | 157 | scanner->col++; |
101 | } | 158 | } |
102 | scanner->offset++; | ||
103 | return c; | 159 | return c; |
104 | } | 160 | } |
105 | 161 | ||
106 | void | 162 | bool |
107 | scan_rewind(Scanner *scanner) { | 163 | scan_has_next(Scanner *scanner) { |
108 | sv_rewind(&scanner->current); | 164 | return scanner->str.size; |
109 | scanner->offset--; | ||
110 | } | 165 | } |
111 | 166 | ||
112 | char | 167 | char |
113 | scan_peek(const Scanner *scanner) { | 168 | scan_peek(Scanner *scanner) { |
114 | return sv_peek(&scanner->current); | 169 | return str_peek(scanner->str); |
115 | } | 170 | } |
116 | 171 | ||
117 | bool | 172 | void |
118 | scan_has_next(const Scanner *scanner) { | 173 | scan_skip_line(Scanner *scanner) { |
119 | return scanner->current.n != 0; | 174 | SearchResult newline = array_find_next(scanner->str, cstr("\n")); |
175 | if (newline.found) { | ||
176 | scanner->str.mem += newline.pos + 1; | ||
177 | scanner->str.size -= newline.pos + 1; | ||
178 | scanner->line++; | ||
179 | scanner->col = 0; | ||
180 | } | ||
120 | } | 181 | } |
121 | 182 | ||
122 | void | 183 | void |
123 | skip_whitespace(Scanner *scanner) { | 184 | scan_skip_whitespace(Scanner *scanner) { |
124 | while (scan_has_next(scanner)) { | 185 | while (scan_has_next(scanner)) { |
125 | char c = scan_peek(scanner); | 186 | char c = scan_peek(scanner); |
126 | switch (c) { | 187 | switch (c) { |
127 | case ' ': | 188 | case ' ': |
189 | case ',': // Commas are just syntactic sugar. | ||
128 | case '\f': | 190 | case '\f': |
129 | case '\n': | 191 | case '\n': |
130 | case '\r': | 192 | case '\r': |
@@ -132,6 +194,10 @@ skip_whitespace(Scanner *scanner) { | |||
132 | case '\v': { | 194 | case '\v': { |
133 | scan_next(scanner); | 195 | scan_next(scanner); |
134 | } break; | 196 | } break; |
197 | case ';': { | ||
198 | // Found a comment! (skip) | ||
199 | scan_skip_line(scanner); | ||
200 | } break; | ||
135 | default: { | 201 | default: { |
136 | return; | 202 | return; |
137 | } break; | 203 | } break; |
@@ -140,22 +206,33 @@ skip_whitespace(Scanner *scanner) { | |||
140 | } | 206 | } |
141 | 207 | ||
142 | bool | 208 | bool |
143 | is_delimiter(char c) { | 209 | scan_is_valid_split(char c) { |
144 | switch (c) { | 210 | switch (c) { |
145 | case EOF: | ||
146 | case '\0': | ||
147 | case ';': | 211 | case ';': |
148 | case '"': | ||
149 | case '\'': | ||
150 | case '(': | 212 | case '(': |
151 | case ')': | 213 | case ')': |
152 | case '[': | 214 | case '[': |
153 | case ']': | 215 | case ']': |
154 | case '{': | 216 | case '{': |
155 | case '}': | 217 | case '}': |
218 | case '+': | ||
219 | case '-': | ||
220 | case '*': | ||
221 | case '/': | ||
222 | case '%': | ||
223 | case '!': | ||
224 | case '=': | ||
225 | case '<': | ||
226 | case '>': | ||
227 | case '~': | ||
228 | case '&': | ||
229 | case '|': | ||
156 | case ':': | 230 | case ':': |
231 | case '.': | ||
157 | case '@': | 232 | case '@': |
233 | case '"': | ||
158 | case ' ': | 234 | case ' ': |
235 | case ',': | ||
159 | case '\f': | 236 | case '\f': |
160 | case '\n': | 237 | case '\n': |
161 | case '\r': | 238 | case '\r': |
@@ -167,122 +244,351 @@ is_delimiter(char c) { | |||
167 | return false; | 244 | return false; |
168 | } | 245 | } |
169 | 246 | ||
170 | TokenType | 247 | void |
171 | find_token_type(const StringView value) { | 248 | scan_skip_until_valid(Scanner *scanner) { |
172 | for (size_t i = 0; i < sizeof(keywords) / sizeof(Keyword); i++) { | 249 | while (scan_has_next(scanner)) { |
173 | StringView keyword = (StringView){keywords[i].str, keywords[i].n}; | 250 | char c = scan_peek(scanner); |
174 | if (sv_equal(&value, &keyword)) { | 251 | if (scan_is_valid_split(c)) { |
175 | return keywords[i].token; | 252 | return; |
176 | } | 253 | } |
254 | scan_next(scanner); | ||
177 | } | 255 | } |
178 | return TOKEN_SYMBOL; | ||
179 | } | 256 | } |
180 | 257 | ||
181 | void | 258 | Token |
182 | print_tokens(Token *tokens) { | 259 | emit_token(Scanner current, Scanner *scanner, TokenType t) { |
183 | for (size_t i = 0; i < array_size(tokens); i++) { | 260 | Str val = current.str; |
184 | print_token(tokens[i]); | 261 | val.size = current.str.size - scanner->str.size; |
185 | } | 262 | val.size = val.size < 0 ? 0 : val.size; |
263 | return (Token){ | ||
264 | .val = val, | ||
265 | .line = current.line + 1, | ||
266 | .col = current.col + 1, | ||
267 | .type = t, | ||
268 | }; | ||
186 | } | 269 | } |
187 | 270 | ||
188 | Token * | 271 | Token |
189 | tokenize(const StringView *sv) { | 272 | emit_token_err(Scanner *scanner, Str err_msg) { |
190 | Token *tokens = NULL; | 273 | return (Token){ |
191 | array_init(tokens, 1); | 274 | .line = scanner->line + 1, |
192 | Scanner scanner = (Scanner){ | 275 | .col = scanner->col + 1, |
193 | .current = *sv, | 276 | .val = err_msg, |
194 | .line_number = 1, | 277 | .type = TOK_UNKNOWN, |
195 | .col_number = 1, | ||
196 | }; | 278 | }; |
279 | } | ||
197 | 280 | ||
198 | while (scan_has_next(&scanner)) { | 281 | Token |
199 | skip_whitespace(&scanner); | 282 | emit_token_number(Scanner *scanner) { |
200 | size_t line = scanner.line_number; | 283 | Scanner current = *scanner; |
201 | size_t col = scanner.col_number; | 284 | char c = scan_peek(scanner); |
202 | size_t offset = scanner.offset; | 285 | if (c == '+' || c == '-') { |
203 | Token token = (Token){ | 286 | scan_next(scanner); |
204 | .type = TOKEN_UNKNOWN, | 287 | if (str_has_prefix(scanner->str, cstr("0b")) || |
205 | .line = line, | 288 | str_has_prefix(scanner->str, cstr("0x"))) { |
206 | .col = col, | 289 | scan_skip_until_valid(scanner); |
207 | }; | 290 | return emit_token_err( |
208 | char c = scan_next(&scanner); | 291 | ¤t, |
209 | switch (c) { | 292 | cstr("malformed number: binary/hex numbers can't be signed")); |
210 | case ';': { | 293 | } |
211 | while ((c = scan_next(&scanner)) != '\n' && c != '\0') {} | 294 | } |
295 | if (str_has_prefix(scanner->str, cstr("0b"))) { | ||
296 | scan_next(scanner); | ||
297 | scan_next(scanner); | ||
298 | while (scan_has_next(scanner)) { | ||
299 | c = scan_peek(scanner); | ||
300 | if (c == '0' || c == '1' || c == '_') { | ||
301 | scan_next(scanner); | ||
212 | continue; | 302 | continue; |
213 | } break; | 303 | } |
214 | case '"': { | 304 | if (scan_is_valid_split(c)) { |
215 | char prev = c; | 305 | return emit_token(current, scanner, TOK_NUMBER); |
216 | bool found = false; | 306 | } |
217 | size_t n = 0; | 307 | scan_skip_until_valid(scanner); |
218 | while (scan_has_next(&scanner)) { | 308 | return emit_token_err( |
219 | c = scan_next(&scanner); | 309 | ¤t, cstr("malformed number: invalid binary number")); |
220 | if (c == '"' && prev != '\\') { | 310 | } |
221 | found = true; | 311 | } else if (str_has_prefix(scanner->str, cstr("0x"))) { |
222 | break; | 312 | scan_next(scanner); |
223 | } | 313 | scan_next(scanner); |
224 | prev = c; | 314 | while (scan_has_next(scanner)) { |
225 | n++; | 315 | c = scan_peek(scanner); |
226 | } | 316 | if ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || |
227 | if (!found) { | 317 | (c >= 'A' && c <= 'F') || c == '_') { |
228 | push_error(ERR_TYPE_LEXER, ERR_UNMATCHED_STRING, line, col); | 318 | scan_next(scanner); |
229 | return tokens; | 319 | continue; |
230 | } | 320 | } |
231 | token.value = (StringView){ | 321 | if (scan_is_valid_split(c)) { |
232 | .start = &sv->start[offset + 1], | 322 | return emit_token(current, scanner, TOK_NUMBER); |
233 | .n = n, | 323 | } |
234 | }; | 324 | scan_skip_until_valid(scanner); |
235 | token.type = TOKEN_STRING; | 325 | return emit_token_err(¤t, |
236 | } break; | 326 | cstr("malformed number: invalid hex number")); |
237 | case '(': { token.type = TOKEN_LPAREN; } break; | 327 | } |
238 | case ')': { token.type = TOKEN_RPAREN; } break; | 328 | } else { |
239 | case '[': { token.type = TOKEN_LSQUARE; } break; | 329 | // Integral. |
240 | case ']': { token.type = TOKEN_RSQUARE; } break; | 330 | while (scan_has_next(scanner)) { |
241 | case '{': { token.type = TOKEN_LCURLY; } break; | 331 | c = scan_peek(scanner); |
242 | case '}': { token.type = TOKEN_RCURLY; } break; | 332 | if (c == '.') { |
243 | case ':': { token.type = TOKEN_COLON; } break; | 333 | scan_next(scanner); |
244 | case '.': { token.type = TOKEN_DOT; } break; | 334 | break; |
245 | case '@': { token.type = TOKEN_AT; } break; | 335 | } |
246 | default: { | 336 | if ((c >= '0' && c <= '9') || c == '_') { |
247 | if (c == EOF || c == '\0') { | 337 | scan_next(scanner); |
248 | token.type = TOKEN_EOF; | 338 | continue; |
249 | break; | 339 | } |
250 | } | 340 | if (scan_is_valid_split(c)) { |
251 | size_t n = 1; | 341 | return emit_token(current, scanner, TOK_NUMBER); |
252 | bool num = c == '-' && !is_delimiter(scan_peek(&scanner)); | 342 | } |
253 | num = num || (c == '+' && !is_delimiter(scan_peek(&scanner))); | 343 | scan_skip_until_valid(scanner); |
254 | num = num || (c >= '0' && c <= '9'); | 344 | return emit_token_err(¤t, cstr("malformed number")); |
255 | if (num) { | 345 | } |
256 | while (!is_delimiter(scan_peek(&scanner))) { | 346 | c = scan_peek(scanner); |
257 | c = scan_next(&scanner); | 347 | if (!(c >= '0' && c <= '9')) { |
258 | n++; | 348 | return emit_token_err(¤t, |
259 | } | 349 | cstr("malformed number: no decimal digits")); |
260 | token.value = (StringView){ | 350 | } |
261 | .start = &sv->start[offset], | 351 | // Decimals. |
262 | .n = n, | 352 | while (scan_has_next(scanner)) { |
263 | }; | 353 | c = scan_peek(scanner); |
264 | token.type = TOKEN_NUMBER; | 354 | if (c == 'e' || c == 'E') { |
265 | } else { | 355 | scan_next(scanner); |
266 | while (!is_delimiter(scan_peek(&scanner))) { | 356 | break; |
267 | if (scan_peek(&scanner) == '.') { | 357 | } |
268 | break; | 358 | if ((c >= '0' && c <= '9') || c == '_') { |
269 | } | 359 | scan_next(scanner); |
270 | c = scan_next(&scanner); | 360 | continue; |
271 | n++; | 361 | } |
272 | } | 362 | if (scan_is_valid_split(c)) { |
273 | token.value = (StringView){ | 363 | return emit_token(current, scanner, TOK_NUMBER); |
274 | .start = &sv->start[offset], | 364 | } |
275 | .n = n, | 365 | scan_skip_until_valid(scanner); |
276 | }; | 366 | return emit_token_err(¤t, cstr("malformed number")); |
277 | token.type = find_token_type(token.value); | ||
278 | } | ||
279 | } break; | ||
280 | } | 367 | } |
281 | if (token.type == TOKEN_UNKNOWN) { | 368 | // Exponent. |
282 | push_error(ERR_TYPE_LEXER, ERR_UNKNOWN_TOK_TYPE, line, col); | 369 | c = scan_peek(scanner); |
283 | return tokens; | 370 | if (c == '+' || c == '-') { |
371 | scan_next(scanner); | ||
284 | } | 372 | } |
285 | array_push(tokens, token); | 373 | while (scan_has_next(scanner)) { |
374 | c = scan_peek(scanner); | ||
375 | if ((c >= '0' && c <= '9') || c == '_') { | ||
376 | scan_next(scanner); | ||
377 | continue; | ||
378 | } | ||
379 | if (c == '.') { | ||
380 | scan_next(scanner); | ||
381 | return emit_token_err( | ||
382 | ¤t, | ||
383 | cstr("malformed number: decimals not allowed on exponent")); | ||
384 | } | ||
385 | if (scan_is_valid_split(c)) { | ||
386 | return emit_token(current, scanner, TOK_NUMBER); | ||
387 | } | ||
388 | scan_skip_until_valid(scanner); | ||
389 | return emit_token_err(¤t, cstr("malformed number")); | ||
390 | } | ||
391 | } | ||
392 | return emit_token_err(¤t, cstr("malformed number")); | ||
393 | } | ||
394 | |||
395 | Token | ||
396 | scan_token(Scanner *scanner) { | ||
397 | assert(scanner); | ||
398 | |||
399 | scan_skip_whitespace(scanner); | ||
400 | if (!scan_has_next(scanner)) { | ||
401 | return emit_token(*scanner, scanner, TOK_EOF); | ||
402 | } | ||
403 | |||
404 | Scanner current = *scanner; | ||
405 | char c = scan_next(scanner); | ||
406 | switch (c) { | ||
407 | case '(': | ||
408 | return emit_token(current, scanner, TOK_LPAREN); | ||
409 | case ')': | ||
410 | return emit_token(current, scanner, TOK_RPAREN); | ||
411 | case '[': | ||
412 | return emit_token(current, scanner, TOK_LSQUARE); | ||
413 | case ']': | ||
414 | return emit_token(current, scanner, TOK_RSQUARE); | ||
415 | case '{': | ||
416 | return emit_token(current, scanner, TOK_LCURLY); | ||
417 | case '}': | ||
418 | return emit_token(current, scanner, TOK_RCURLY); | ||
419 | case '+': { | ||
420 | char p = scan_peek(scanner); | ||
421 | if (p >= '0' && p <= '9') { | ||
422 | *scanner = current; | ||
423 | return emit_token_number(scanner); | ||
424 | } | ||
425 | return emit_token(current, scanner, TOK_ADD); | ||
426 | }; | ||
427 | case '-': { | ||
428 | char p = scan_peek(scanner); | ||
429 | if (p >= '0' && p <= '9') { | ||
430 | *scanner = current; | ||
431 | return emit_token_number(scanner); | ||
432 | } | ||
433 | return emit_token(current, scanner, TOK_ADD); | ||
434 | }; | ||
435 | case '*': | ||
436 | return emit_token(current, scanner, TOK_MUL); | ||
437 | case '/': | ||
438 | return emit_token(current, scanner, TOK_DIV); | ||
439 | case '%': | ||
440 | return emit_token(current, scanner, TOK_MOD); | ||
441 | case '!': { | ||
442 | if (scan_peek(scanner) == '=') { | ||
443 | scan_next(scanner); | ||
444 | return emit_token(current, scanner, TOK_NOTEQ); | ||
445 | } | ||
446 | return emit_token(current, scanner, TOK_NOT); | ||
447 | }; | ||
448 | case '=': { | ||
449 | if (scan_peek(scanner) == '=') { | ||
450 | scan_next(scanner); | ||
451 | return emit_token(current, scanner, TOK_EQ); | ||
452 | } | ||
453 | return emit_token(current, scanner, TOK_ASSIGN); | ||
454 | }; | ||
455 | case '<': { | ||
456 | char p = scan_peek(scanner); | ||
457 | if (p == '=') { | ||
458 | scan_next(scanner); | ||
459 | return emit_token(current, scanner, TOK_LE); | ||
460 | } | ||
461 | if (p == '<') { | ||
462 | scan_next(scanner); | ||
463 | return emit_token(current, scanner, TOK_BITLSHIFT); | ||
464 | } | ||
465 | return emit_token(current, scanner, TOK_LT); | ||
466 | }; | ||
467 | case '>': { | ||
468 | char p = scan_peek(scanner); | ||
469 | if (p == '=') { | ||
470 | scan_next(scanner); | ||
471 | return emit_token(current, scanner, TOK_GE); | ||
472 | } | ||
473 | if (p == '>') { | ||
474 | scan_next(scanner); | ||
475 | return emit_token(current, scanner, TOK_BITRSHIFT); | ||
476 | } | ||
477 | return emit_token(current, scanner, TOK_GT); | ||
478 | }; | ||
479 | case '~': | ||
480 | return emit_token(current, scanner, TOK_BITNOT); | ||
481 | case '&': { | ||
482 | if (scan_peek(scanner) == '&') { | ||
483 | scan_next(scanner); | ||
484 | return emit_token(current, scanner, TOK_AND); | ||
485 | } | ||
486 | return emit_token(current, scanner, TOK_BITAND); | ||
487 | }; | ||
488 | case '|': { | ||
489 | if (scan_peek(scanner) == '|') { | ||
490 | scan_next(scanner); | ||
491 | return emit_token(current, scanner, TOK_OR); | ||
492 | } | ||
493 | return emit_token(current, scanner, TOK_BITOR); | ||
494 | }; | ||
495 | case ':': | ||
496 | return emit_token(current, scanner, TOK_COLON); | ||
497 | case '.': | ||
498 | return emit_token(current, scanner, TOK_DOT); | ||
499 | case '@': | ||
500 | return emit_token(current, scanner, TOK_AT); | ||
501 | case '"': { | ||
502 | while (scan_has_next(scanner)) { | ||
503 | c = scan_next(scanner); | ||
504 | if (c == '\\') { | ||
505 | scan_next(scanner); | ||
506 | continue; | ||
507 | } | ||
508 | if (c == '"') { | ||
509 | return emit_token(current, scanner, TOK_STRING); | ||
510 | } | ||
511 | } | ||
512 | return emit_token_err(¤t, cstr("mismatched string quotes")); | ||
513 | }; | ||
514 | } | ||
515 | if (c >= '0' && c <= '9') { | ||
516 | *scanner = current; | ||
517 | return emit_token_number(scanner); | ||
518 | } | ||
519 | |||
520 | scan_skip_until_valid(scanner); | ||
521 | Str val = current.str; | ||
522 | val.size = current.str.size - scanner->str.size; | ||
523 | val.size = val.size < 0 ? 0 : val.size; | ||
524 | if (val.size == 0) { | ||
525 | return emit_token_err(¤t, cstr("unexpected character")); | ||
526 | } | ||
527 | switch (val.mem[0]) { | ||
528 | case 'b': { | ||
529 | if (str_has_prefix(val, cstr("break"))) { | ||
530 | return emit_token(current, scanner, TOK_BREAK); | ||
531 | } | ||
532 | } break; | ||
533 | case 'c': { | ||
534 | if (str_has_prefix(val, cstr("case"))) { | ||
535 | return emit_token(current, scanner, TOK_CASE); | ||
536 | } | ||
537 | if (str_has_prefix(val, cstr("continue"))) { | ||
538 | return emit_token(current, scanner, TOK_CONTINUE); | ||
539 | } | ||
540 | } break; | ||
541 | case 'f': { | ||
542 | if (str_has_prefix(val, cstr("false"))) { | ||
543 | return emit_token(current, scanner, TOK_FALSE); | ||
544 | } | ||
545 | if (str_has_prefix(val, cstr("fun"))) { | ||
546 | return emit_token(current, scanner, TOK_FUN); | ||
547 | } | ||
548 | } break; | ||
549 | case 'i': { | ||
550 | if (str_has_prefix(val, cstr("if"))) { | ||
551 | return emit_token(current, scanner, TOK_IF); | ||
552 | } | ||
553 | } break; | ||
554 | case 'l': { | ||
555 | if (str_has_prefix(val, cstr("let"))) { | ||
556 | return emit_token(current, scanner, TOK_LET); | ||
557 | } | ||
558 | } break; | ||
559 | case 'm': { | ||
560 | if (str_has_prefix(val, cstr("match"))) { | ||
561 | return emit_token(current, scanner, TOK_MATCH); | ||
562 | } | ||
563 | } break; | ||
564 | case 'n': { | ||
565 | if (str_has_prefix(val, cstr("nil"))) { | ||
566 | return emit_token(current, scanner, TOK_NIL); | ||
567 | } | ||
568 | } break; | ||
569 | case 'r': { | ||
570 | if (str_has_prefix(val, cstr("return"))) { | ||
571 | return emit_token(current, scanner, TOK_RETURN); | ||
572 | } | ||
573 | } break; | ||
574 | case 's': { | ||
575 | if (str_has_prefix(val, cstr("set"))) { | ||
576 | return emit_token(current, scanner, TOK_SET); | ||
577 | } | ||
578 | if (str_has_prefix(val, cstr("struct"))) { | ||
579 | return emit_token(current, scanner, TOK_STRUCT); | ||
580 | } | ||
581 | } break; | ||
582 | case 't': { | ||
583 | if (str_has_prefix(val, cstr("true"))) { | ||
584 | return emit_token(current, scanner, TOK_TRUE); | ||
585 | } | ||
586 | } break; | ||
587 | case 'w': { | ||
588 | if (str_has_prefix(val, cstr("while"))) { | ||
589 | return emit_token(current, scanner, TOK_WHILE); | ||
590 | } | ||
591 | } break; | ||
286 | } | 592 | } |
287 | return tokens; | 593 | return emit_token(current, scanner, TOK_SYMBOL); |
288 | } | 594 | } |