diff options
author | Bad Diode <bd@badd10de.dev> | 2024-06-15 15:13:46 +0200 |
---|---|---|
committer | Bad Diode <bd@badd10de.dev> | 2024-06-15 15:13:46 +0200 |
commit | 4de23d7e60ba018d6f9bf656f6f3fca4be2473be (patch) | |
tree | b359a023f256e00cf7dc9d106c7444ee9cd36212 /src | |
parent | 805efd71e0f5b10a6e78da08565407ec0a3649fe (diff) | |
download | bdl-4de23d7e60ba018d6f9bf656f6f3fca4be2473be.tar.gz bdl-4de23d7e60ba018d6f9bf656f6f3fca4be2473be.zip |
Add many scanning functions
Diffstat (limited to 'src')
-rw-r--r-- | src/main.c | 344 |
1 files changed, 301 insertions, 43 deletions
@@ -36,9 +36,6 @@ typedef enum TokenType { | |||
36 | TOK_NUMBER, | 36 | TOK_NUMBER, |
37 | TOK_SYMBOL, | 37 | TOK_SYMBOL, |
38 | TOK_STRING, | 38 | TOK_STRING, |
39 | TOK_NIL, | ||
40 | TOK_TRUE, | ||
41 | TOK_FALSE, | ||
42 | 39 | ||
43 | // Keywords. | 40 | // Keywords. |
44 | TOK_LET, | 41 | TOK_LET, |
@@ -52,36 +49,40 @@ typedef enum TokenType { | |||
52 | TOK_CONTINUE, | 49 | TOK_CONTINUE, |
53 | TOK_BREAK, | 50 | TOK_BREAK, |
54 | TOK_RETURN, | 51 | TOK_RETURN, |
52 | TOK_NIL, | ||
53 | TOK_TRUE, | ||
54 | TOK_FALSE, | ||
55 | 55 | ||
56 | // Arithmetic ops. | 56 | // Arithmetic ops. |
57 | TOK_ADD, | 57 | TOK_ADD, // + |
58 | TOK_SUB, | 58 | TOK_SUB, // - |
59 | TOK_MUL, | 59 | TOK_MUL, // * |
60 | TOK_DIV, | 60 | TOK_DIV, // / |
61 | TOK_MOD, | 61 | TOK_MOD, // % |
62 | 62 | ||
63 | // Boolean ops. | 63 | // Logical ops. |
64 | TOK_NOT, | 64 | TOK_NOT, // ! |
65 | TOK_AND, | 65 | TOK_AND, // && |
66 | TOK_OR, | 66 | TOK_OR, // || |
67 | TOK_EQ, | 67 | TOK_EQ, // == |
68 | TOK_LT, | 68 | TOK_NOTEQ, // != |
69 | TOK_GT, | 69 | TOK_LT, // < |
70 | TOK_LE, | 70 | TOK_GT, // > |
71 | TOK_GE, | 71 | TOK_LE, // <= |
72 | TOK_GE, // >= | ||
72 | 73 | ||
73 | // Bitwise ops. | 74 | // Bitwise ops. |
74 | TOK_BITNOT, | 75 | TOK_BITNOT, // ~ |
75 | TOK_BITAND, | 76 | TOK_BITAND, // & |
76 | TOK_BITOR, | 77 | TOK_BITOR, // | |
77 | TOK_BITLSHIFT, | 78 | TOK_BITLSHIFT, // << |
78 | TOK_BITRSHIFT, | 79 | TOK_BITRSHIFT, // >> |
79 | 80 | ||
80 | // Special ops. | 81 | // Special ops. |
81 | TOK_COLON, | 82 | TOK_COLON, // : |
82 | TOK_DOT, | 83 | TOK_DOT, // . |
83 | TOK_AT, | 84 | TOK_AT, // @ |
84 | TOK_ASSIGN, | 85 | TOK_ASSIGN, // = |
85 | 86 | ||
86 | // End of file. | 87 | // End of file. |
87 | TOK_EOF, | 88 | TOK_EOF, |
@@ -102,9 +103,6 @@ Str token_str[] = { | |||
102 | [TOK_NUMBER] = cstr("NUMBER"), | 103 | [TOK_NUMBER] = cstr("NUMBER"), |
103 | [TOK_SYMBOL] = cstr("SYMBOL"), | 104 | [TOK_SYMBOL] = cstr("SYMBOL"), |
104 | [TOK_STRING] = cstr("STRING"), | 105 | [TOK_STRING] = cstr("STRING"), |
105 | [TOK_NIL] = cstr("NIL"), | ||
106 | [TOK_TRUE] = cstr("TRUE"), | ||
107 | [TOK_FALSE] = cstr("FALSE"), | ||
108 | 106 | ||
109 | // Keywords. | 107 | // Keywords. |
110 | [TOK_LET] = cstr("LET"), | 108 | [TOK_LET] = cstr("LET"), |
@@ -118,6 +116,9 @@ Str token_str[] = { | |||
118 | [TOK_CONTINUE] = cstr("CONTINUE"), | 116 | [TOK_CONTINUE] = cstr("CONTINUE"), |
119 | [TOK_BREAK] = cstr("BREAK"), | 117 | [TOK_BREAK] = cstr("BREAK"), |
120 | [TOK_RETURN] = cstr("RETURN"), | 118 | [TOK_RETURN] = cstr("RETURN"), |
119 | [TOK_NIL] = cstr("NIL"), | ||
120 | [TOK_TRUE] = cstr("TRUE"), | ||
121 | [TOK_FALSE] = cstr("FALSE"), | ||
121 | 122 | ||
122 | // Arithmetic ops. | 123 | // Arithmetic ops. |
123 | [TOK_ADD] = cstr("ADD"), | 124 | [TOK_ADD] = cstr("ADD"), |
@@ -126,11 +127,12 @@ Str token_str[] = { | |||
126 | [TOK_DIV] = cstr("DIV"), | 127 | [TOK_DIV] = cstr("DIV"), |
127 | [TOK_MOD] = cstr("MOD"), | 128 | [TOK_MOD] = cstr("MOD"), |
128 | 129 | ||
129 | // Boolean ops. | 130 | // Logical ops. |
130 | [TOK_NOT] = cstr("NOT"), | 131 | [TOK_NOT] = cstr("NOT"), |
131 | [TOK_AND] = cstr("AND"), | 132 | [TOK_AND] = cstr("AND"), |
132 | [TOK_OR] = cstr("OR"), | 133 | [TOK_OR] = cstr("OR"), |
133 | [TOK_EQ] = cstr("EQ"), | 134 | [TOK_EQ] = cstr("EQ"), |
135 | [TOK_NOTEQ] = cstr("NOTEQ"), | ||
134 | [TOK_LT] = cstr("LT"), | 136 | [TOK_LT] = cstr("LT"), |
135 | [TOK_GT] = cstr("GT"), | 137 | [TOK_GT] = cstr("GT"), |
136 | [TOK_LE] = cstr("LE"), | 138 | [TOK_LE] = cstr("LE"), |
@@ -190,10 +192,14 @@ scan_peek(Scanner *scanner) { | |||
190 | } | 192 | } |
191 | 193 | ||
192 | Token | 194 | Token |
193 | emit_token(Scanner *scanner, TokenType t) { | 195 | emit_token(Scanner current, Scanner *scanner, TokenType t) { |
196 | Str val = current.str; | ||
197 | val.size = current.str.size - scanner->str.size; | ||
198 | val.size = val.size < 0 ? 0 : val.size; | ||
194 | return (Token){ | 199 | return (Token){ |
195 | .line = scanner->line + 1, | 200 | .val = val, |
196 | .col = scanner->col + 1, | 201 | .line = current.line + 1, |
202 | .col = current.col + 1, | ||
197 | .type = t, | 203 | .type = t, |
198 | }; | 204 | }; |
199 | } | 205 | } |
@@ -225,7 +231,7 @@ scan_skip_whitespace(Scanner *scanner) { | |||
225 | char c = scan_peek(scanner); | 231 | char c = scan_peek(scanner); |
226 | switch (c) { | 232 | switch (c) { |
227 | case ' ': | 233 | case ' ': |
228 | case ',': // We are currently considering commas as syntactic sugar. | 234 | case ',': // Commas are just syntactic sugar. |
229 | case '\f': | 235 | case '\f': |
230 | case '\n': | 236 | case '\n': |
231 | case '\r': | 237 | case '\r': |
@@ -244,22 +250,273 @@ scan_skip_whitespace(Scanner *scanner) { | |||
244 | } | 250 | } |
245 | } | 251 | } |
246 | 252 | ||
253 | bool | ||
254 | is_valid_split(char c) { | ||
255 | switch (c) { | ||
256 | case ';': | ||
257 | case '(': | ||
258 | case ')': | ||
259 | case '[': | ||
260 | case ']': | ||
261 | case '{': | ||
262 | case '}': | ||
263 | case '+': | ||
264 | case '-': | ||
265 | case '*': | ||
266 | case '/': | ||
267 | case '%': | ||
268 | case '!': | ||
269 | case '=': | ||
270 | case '<': | ||
271 | case '>': | ||
272 | case '~': | ||
273 | case '&': | ||
274 | case '|': | ||
275 | case ':': | ||
276 | case '.': | ||
277 | case '@': | ||
278 | case '"': | ||
279 | case ' ': | ||
280 | case ',': | ||
281 | case '\f': | ||
282 | case '\n': | ||
283 | case '\r': | ||
284 | case '\t': | ||
285 | case '\v': { | ||
286 | return true; | ||
287 | } break; | ||
288 | } | ||
289 | return false; | ||
290 | } | ||
291 | |||
292 | void | ||
293 | scan_skip_until_valid(Scanner *scanner) { | ||
294 | while (scan_has_next(scanner)) { | ||
295 | char c = scan_peek(scanner); | ||
296 | if (is_valid_split(c)) { | ||
297 | return; | ||
298 | } | ||
299 | scan_next(scanner); | ||
300 | } | ||
301 | } | ||
302 | |||
303 | Token | ||
304 | emit_token_number(Scanner *scanner) { | ||
305 | Scanner current = *scanner; | ||
306 | char c = scan_peek(scanner); | ||
307 | if (c == '+' || c == '-') { | ||
308 | scan_next(scanner); | ||
309 | if (str_has_prefix(scanner->str, cstr("0b")) || | ||
310 | str_has_prefix(scanner->str, cstr("0x"))) { | ||
311 | scan_skip_until_valid(scanner); | ||
312 | return emit_token_err( | ||
313 | &current, | ||
314 | cstr("malformed number: binary/hex numbers can't be signed")); | ||
315 | } | ||
316 | } | ||
317 | if (str_has_prefix(scanner->str, cstr("0b"))) { | ||
318 | scan_next(scanner); | ||
319 | scan_next(scanner); | ||
320 | while (scan_has_next(scanner)) { | ||
321 | char c = scan_peek(scanner); | ||
322 | if (c == '0' || c == '1') { | ||
323 | scan_next(scanner); | ||
324 | continue; | ||
325 | } | ||
326 | if (is_valid_split(c)) { | ||
327 | break; | ||
328 | } | ||
329 | scan_skip_until_valid(scanner); | ||
330 | return emit_token_err( | ||
331 | &current, cstr("malformed number: invalid binary number")); | ||
332 | } | ||
333 | } else if (str_has_prefix(scanner->str, cstr("0x"))) { | ||
334 | scan_next(scanner); | ||
335 | scan_next(scanner); | ||
336 | while (scan_has_next(scanner)) { | ||
337 | char c = scan_peek(scanner); | ||
338 | if ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || | ||
339 | (c >= 'A' && c <= 'F')) { | ||
340 | scan_next(scanner); | ||
341 | continue; | ||
342 | } | ||
343 | if (is_valid_split(c)) { | ||
344 | break; | ||
345 | } | ||
346 | scan_skip_until_valid(scanner); | ||
347 | return emit_token_err(&current, | ||
348 | cstr("malformed number: invalid hex number")); | ||
349 | } | ||
350 | } else { | ||
351 | // TODO: handle fractions, signs and exponents properly. | ||
352 | while (scan_has_next(scanner)) { | ||
353 | char c = scan_peek(scanner); | ||
354 | switch (c) { | ||
355 | case '+': | ||
356 | case '-': | ||
357 | case 'e': | ||
358 | case 'E': | ||
359 | case '.': | ||
360 | scan_next(scanner); | ||
361 | continue; | ||
362 | } | ||
363 | if (c >= '0' && c <= '9') { | ||
364 | scan_next(scanner); | ||
365 | continue; | ||
366 | } | ||
367 | break; | ||
368 | } | ||
369 | } | ||
370 | return emit_token(current, scanner, TOK_NUMBER); | ||
371 | } | ||
372 | |||
247 | Token | 373 | Token |
248 | scan_token(Scanner *scanner) { | 374 | scan_token(Scanner *scanner) { |
249 | assert(scanner); | 375 | assert(scanner); |
250 | 376 | ||
251 | scan_skip_whitespace(scanner); | 377 | scan_skip_whitespace(scanner); |
252 | if (!scan_has_next(scanner)) { | 378 | if (!scan_has_next(scanner)) { |
253 | return emit_token(scanner, TOK_EOF); | 379 | return emit_token(*scanner, scanner, TOK_EOF); |
254 | } | 380 | } |
255 | 381 | ||
256 | Scanner current = *scanner; | 382 | Scanner current = *scanner; |
257 | char c = scan_next(scanner); | 383 | char c = scan_next(scanner); |
258 | (void)c; | 384 | switch (c) { |
259 | // TODO: rest of the operations... | 385 | case '(': |
386 | return emit_token(current, scanner, TOK_LPAREN); | ||
387 | case ')': | ||
388 | return emit_token(current, scanner, TOK_RPAREN); | ||
389 | case '[': | ||
390 | return emit_token(current, scanner, TOK_LSQUARE); | ||
391 | case ']': | ||
392 | return emit_token(current, scanner, TOK_RSQUARE); | ||
393 | case '{': | ||
394 | return emit_token(current, scanner, TOK_LCURLY); | ||
395 | case '}': | ||
396 | return emit_token(current, scanner, TOK_RCURLY); | ||
397 | case '+': { | ||
398 | char p = scan_peek(scanner); | ||
399 | if (p >= '0' && p <= '9') { | ||
400 | *scanner = current; | ||
401 | return emit_token_number(scanner); | ||
402 | } | ||
403 | return emit_token(current, scanner, TOK_ADD); | ||
404 | }; | ||
405 | case '-': { | ||
406 | char p = scan_peek(scanner); | ||
407 | if (p >= '0' && p <= '9') { | ||
408 | *scanner = current; | ||
409 | return emit_token_number(scanner); | ||
410 | } | ||
411 | return emit_token(current, scanner, TOK_ADD); | ||
412 | }; | ||
413 | case '*': | ||
414 | return emit_token(current, scanner, TOK_MUL); | ||
415 | case '/': | ||
416 | return emit_token(current, scanner, TOK_DIV); | ||
417 | case '%': | ||
418 | return emit_token(current, scanner, TOK_MOD); | ||
419 | case '!': { | ||
420 | if (scan_peek(scanner) == '=') { | ||
421 | scan_next(scanner); | ||
422 | return emit_token(current, scanner, TOK_NOTEQ); | ||
423 | } | ||
424 | return emit_token(current, scanner, TOK_NOT); | ||
425 | }; | ||
426 | case '=': { | ||
427 | if (scan_peek(scanner) == '=') { | ||
428 | scan_next(scanner); | ||
429 | return emit_token(current, scanner, TOK_EQ); | ||
430 | } | ||
431 | return emit_token(current, scanner, TOK_ASSIGN); | ||
432 | }; | ||
433 | case '<': { | ||
434 | char p = scan_peek(scanner); | ||
435 | if (p == '=') { | ||
436 | scan_next(scanner); | ||
437 | return emit_token(current, scanner, TOK_LE); | ||
438 | } | ||
439 | if (p == '<') { | ||
440 | scan_next(scanner); | ||
441 | return emit_token(current, scanner, TOK_BITLSHIFT); | ||
442 | } | ||
443 | return emit_token(current, scanner, TOK_LT); | ||
444 | }; | ||
445 | case '>': { | ||
446 | char p = scan_peek(scanner); | ||
447 | if (p == '=') { | ||
448 | scan_next(scanner); | ||
449 | return emit_token(current, scanner, TOK_GE); | ||
450 | } | ||
451 | if (p == '>') { | ||
452 | scan_next(scanner); | ||
453 | return emit_token(current, scanner, TOK_BITRSHIFT); | ||
454 | } | ||
455 | return emit_token(current, scanner, TOK_GT); | ||
456 | }; | ||
457 | case '~': | ||
458 | return emit_token(current, scanner, TOK_BITNOT); | ||
459 | case '&': { | ||
460 | if (scan_peek(scanner) == '&') { | ||
461 | scan_next(scanner); | ||
462 | return emit_token(current, scanner, TOK_AND); | ||
463 | } | ||
464 | return emit_token(current, scanner, TOK_BITAND); | ||
465 | }; | ||
466 | case '|': { | ||
467 | if (scan_peek(scanner) == '|') { | ||
468 | scan_next(scanner); | ||
469 | return emit_token(current, scanner, TOK_OR); | ||
470 | } | ||
471 | return emit_token(current, scanner, TOK_BITOR); | ||
472 | }; | ||
473 | case ':': | ||
474 | return emit_token(current, scanner, TOK_COLON); | ||
475 | case '.': | ||
476 | return emit_token(current, scanner, TOK_DOT); | ||
477 | case '@': | ||
478 | return emit_token(current, scanner, TOK_AT); | ||
479 | case '"': { | ||
480 | while (scan_has_next(scanner)) { | ||
481 | c = scan_next(scanner); | ||
482 | if (c == '\\') { | ||
483 | scan_next(scanner); | ||
484 | continue; | ||
485 | } | ||
486 | if (c == '"') { | ||
487 | return emit_token(current, scanner, TOK_STRING); | ||
488 | } | ||
489 | } | ||
490 | return emit_token_err(&current, cstr("mismatched string quotes")); | ||
491 | }; | ||
492 | } | ||
493 | if (c >= '0' && c <= '9') { | ||
494 | *scanner = current; | ||
495 | return emit_token_number(scanner); | ||
496 | } | ||
260 | 497 | ||
261 | // At this point we have an error, find the next newline. | 498 | // TODO: keywords & literals |
262 | scan_skip_line(scanner); | 499 | // Basic literals. |
500 | // TOK_SYMBOL, | ||
501 | |||
502 | // // Keywords. | ||
503 | // TOK_LET, | ||
504 | // TOK_SET, | ||
505 | // TOK_FUN, | ||
506 | // TOK_STRUCT, | ||
507 | // TOK_IF, | ||
508 | // TOK_MATCH, | ||
509 | // TOK_CASE, | ||
510 | // TOK_WHILE, | ||
511 | // TOK_CONTINUE, | ||
512 | // TOK_BREAK, | ||
513 | // TOK_RETURN, | ||
514 | // TOK_NIL, | ||
515 | // TOK_TRUE, | ||
516 | // TOK_FALSE, | ||
517 | |||
518 | // At this point we have an error, skip until we find whitespace again. | ||
519 | scan_skip_until_valid(scanner); | ||
263 | return emit_token_err(&current, cstr("unexpected character")); | 520 | return emit_token_err(&current, cstr("unexpected character")); |
264 | } | 521 | } |
265 | 522 | ||
@@ -281,12 +538,13 @@ process_file(Str path) { | |||
281 | Token tok = {0}; | 538 | Token tok = {0}; |
282 | while (tok.type != TOK_EOF) { | 539 | while (tok.type != TOK_EOF) { |
283 | tok = scan_token(&scanner); | 540 | tok = scan_token(&scanner); |
284 | println("%s:%d:%d:%s %s", path, tok.line, tok.col, token_str[tok.type], | 541 | eprintln("%s:%d:%d:%s %s", path, tok.line, tok.col, token_str[tok.type], |
285 | tok.val); | 542 | tok.val); |
286 | } | 543 | } |
287 | // while (true) { | 544 | // while (true) { |
288 | // Token tok = scan_token(&scanner); | 545 | // Token tok = scan_token(&scanner); |
289 | // println("%s:%d:%d:%s %s", path, tok.line, tok.col, token_str[tok.type], | 546 | // println("%s:%d:%d:%s %s", path, tok.line, tok.col, |
547 | // token_str[tok.type], | ||
290 | // tok.val); | 548 | // tok.val); |
291 | // if (tok.type == TOK_EOF) break; | 549 | // if (tok.type == TOK_EOF) break; |
292 | // } | 550 | // } |