aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorBad Diode <bd@badd10de.dev>2024-06-15 15:13:46 +0200
committerBad Diode <bd@badd10de.dev>2024-06-15 15:13:46 +0200
commit4de23d7e60ba018d6f9bf656f6f3fca4be2473be (patch)
treeb359a023f256e00cf7dc9d106c7444ee9cd36212 /src
parent805efd71e0f5b10a6e78da08565407ec0a3649fe (diff)
downloadbdl-4de23d7e60ba018d6f9bf656f6f3fca4be2473be.tar.gz
bdl-4de23d7e60ba018d6f9bf656f6f3fca4be2473be.zip
Add many scanning functions
Diffstat (limited to 'src')
-rw-r--r--src/main.c344
1 files changed, 301 insertions, 43 deletions
diff --git a/src/main.c b/src/main.c
index 6be1e88..61d02a7 100644
--- a/src/main.c
+++ b/src/main.c
@@ -36,9 +36,6 @@ typedef enum TokenType {
36 TOK_NUMBER, 36 TOK_NUMBER,
37 TOK_SYMBOL, 37 TOK_SYMBOL,
38 TOK_STRING, 38 TOK_STRING,
39 TOK_NIL,
40 TOK_TRUE,
41 TOK_FALSE,
42 39
43 // Keywords. 40 // Keywords.
44 TOK_LET, 41 TOK_LET,
@@ -52,36 +49,40 @@ typedef enum TokenType {
52 TOK_CONTINUE, 49 TOK_CONTINUE,
53 TOK_BREAK, 50 TOK_BREAK,
54 TOK_RETURN, 51 TOK_RETURN,
52 TOK_NIL,
53 TOK_TRUE,
54 TOK_FALSE,
55 55
56 // Arithmetic ops. 56 // Arithmetic ops.
57 TOK_ADD, 57 TOK_ADD, // +
58 TOK_SUB, 58 TOK_SUB, // -
59 TOK_MUL, 59 TOK_MUL, // *
60 TOK_DIV, 60 TOK_DIV, // /
61 TOK_MOD, 61 TOK_MOD, // %
62 62
63 // Boolean ops. 63 // Logical ops.
64 TOK_NOT, 64 TOK_NOT, // !
65 TOK_AND, 65 TOK_AND, // &&
66 TOK_OR, 66 TOK_OR, // ||
67 TOK_EQ, 67 TOK_EQ, // ==
68 TOK_LT, 68 TOK_NOTEQ, // !=
69 TOK_GT, 69 TOK_LT, // <
70 TOK_LE, 70 TOK_GT, // >
71 TOK_GE, 71 TOK_LE, // <=
72 TOK_GE, // >=
72 73
73 // Bitwise ops. 74 // Bitwise ops.
74 TOK_BITNOT, 75 TOK_BITNOT, // ~
75 TOK_BITAND, 76 TOK_BITAND, // &
76 TOK_BITOR, 77 TOK_BITOR, // |
77 TOK_BITLSHIFT, 78 TOK_BITLSHIFT, // <<
78 TOK_BITRSHIFT, 79 TOK_BITRSHIFT, // >>
79 80
80 // Special ops. 81 // Special ops.
81 TOK_COLON, 82 TOK_COLON, // :
82 TOK_DOT, 83 TOK_DOT, // .
83 TOK_AT, 84 TOK_AT, // @
84 TOK_ASSIGN, 85 TOK_ASSIGN, // =
85 86
86 // End of file. 87 // End of file.
87 TOK_EOF, 88 TOK_EOF,
@@ -102,9 +103,6 @@ Str token_str[] = {
102 [TOK_NUMBER] = cstr("NUMBER"), 103 [TOK_NUMBER] = cstr("NUMBER"),
103 [TOK_SYMBOL] = cstr("SYMBOL"), 104 [TOK_SYMBOL] = cstr("SYMBOL"),
104 [TOK_STRING] = cstr("STRING"), 105 [TOK_STRING] = cstr("STRING"),
105 [TOK_NIL] = cstr("NIL"),
106 [TOK_TRUE] = cstr("TRUE"),
107 [TOK_FALSE] = cstr("FALSE"),
108 106
109 // Keywords. 107 // Keywords.
110 [TOK_LET] = cstr("LET"), 108 [TOK_LET] = cstr("LET"),
@@ -118,6 +116,9 @@ Str token_str[] = {
118 [TOK_CONTINUE] = cstr("CONTINUE"), 116 [TOK_CONTINUE] = cstr("CONTINUE"),
119 [TOK_BREAK] = cstr("BREAK"), 117 [TOK_BREAK] = cstr("BREAK"),
120 [TOK_RETURN] = cstr("RETURN"), 118 [TOK_RETURN] = cstr("RETURN"),
119 [TOK_NIL] = cstr("NIL"),
120 [TOK_TRUE] = cstr("TRUE"),
121 [TOK_FALSE] = cstr("FALSE"),
121 122
122 // Arithmetic ops. 123 // Arithmetic ops.
123 [TOK_ADD] = cstr("ADD"), 124 [TOK_ADD] = cstr("ADD"),
@@ -126,11 +127,12 @@ Str token_str[] = {
126 [TOK_DIV] = cstr("DIV"), 127 [TOK_DIV] = cstr("DIV"),
127 [TOK_MOD] = cstr("MOD"), 128 [TOK_MOD] = cstr("MOD"),
128 129
129 // Boolean ops. 130 // Logical ops.
130 [TOK_NOT] = cstr("NOT"), 131 [TOK_NOT] = cstr("NOT"),
131 [TOK_AND] = cstr("AND"), 132 [TOK_AND] = cstr("AND"),
132 [TOK_OR] = cstr("OR"), 133 [TOK_OR] = cstr("OR"),
133 [TOK_EQ] = cstr("EQ"), 134 [TOK_EQ] = cstr("EQ"),
135 [TOK_NOTEQ] = cstr("NOTEQ"),
134 [TOK_LT] = cstr("LT"), 136 [TOK_LT] = cstr("LT"),
135 [TOK_GT] = cstr("GT"), 137 [TOK_GT] = cstr("GT"),
136 [TOK_LE] = cstr("LE"), 138 [TOK_LE] = cstr("LE"),
@@ -190,10 +192,14 @@ scan_peek(Scanner *scanner) {
190} 192}
191 193
192Token 194Token
193emit_token(Scanner *scanner, TokenType t) { 195emit_token(Scanner current, Scanner *scanner, TokenType t) {
196 Str val = current.str;
197 val.size = current.str.size - scanner->str.size;
198 val.size = val.size < 0 ? 0 : val.size;
194 return (Token){ 199 return (Token){
195 .line = scanner->line + 1, 200 .val = val,
196 .col = scanner->col + 1, 201 .line = current.line + 1,
202 .col = current.col + 1,
197 .type = t, 203 .type = t,
198 }; 204 };
199} 205}
@@ -225,7 +231,7 @@ scan_skip_whitespace(Scanner *scanner) {
225 char c = scan_peek(scanner); 231 char c = scan_peek(scanner);
226 switch (c) { 232 switch (c) {
227 case ' ': 233 case ' ':
228 case ',': // We are currently considering commas as syntactic sugar. 234 case ',': // Commas are just syntactic sugar.
229 case '\f': 235 case '\f':
230 case '\n': 236 case '\n':
231 case '\r': 237 case '\r':
@@ -244,22 +250,273 @@ scan_skip_whitespace(Scanner *scanner) {
244 } 250 }
245} 251}
246 252
// Reports whether `c` is a character at which a token may end: the comment
// marker ';', brackets, operator characters, the string quote, commas and
// whitespace. Any other character (including '\0') yields false.
bool
is_valid_split(char c) {
    // Deliberately a plain char array (no terminating '\0') so that the NUL
    // character is not treated as a separator.
    static const char separators[] = {
        ';',
        '(', ')', '[', ']', '{', '}',
        '+', '-', '*', '/', '%',
        '!', '=', '<', '>',
        '~', '&', '|',
        ':', '.', '@',
        '"',
        ' ', ',', '\f', '\n', '\r', '\t', '\v',
    };
    const char *end = separators + sizeof(separators);
    for (const char *it = separators; it != end; it++) {
        if (*it == c) {
            return true;
        }
    }
    return false;
}
291
292void
293scan_skip_until_valid(Scanner *scanner) {
294 while (scan_has_next(scanner)) {
295 char c = scan_peek(scanner);
296 if (is_valid_split(c)) {
297 return;
298 }
299 scan_next(scanner);
300 }
301}
302
303Token
304emit_token_number(Scanner *scanner) {
305 Scanner current = *scanner;
306 char c = scan_peek(scanner);
307 if (c == '+' || c == '-') {
308 scan_next(scanner);
309 if (str_has_prefix(scanner->str, cstr("0b")) ||
310 str_has_prefix(scanner->str, cstr("0x"))) {
311 scan_skip_until_valid(scanner);
312 return emit_token_err(
313 &current,
314 cstr("malformed number: binary/hex numbers can't be signed"));
315 }
316 }
317 if (str_has_prefix(scanner->str, cstr("0b"))) {
318 scan_next(scanner);
319 scan_next(scanner);
320 while (scan_has_next(scanner)) {
321 char c = scan_peek(scanner);
322 if (c == '0' || c == '1') {
323 scan_next(scanner);
324 continue;
325 }
326 if (is_valid_split(c)) {
327 break;
328 }
329 scan_skip_until_valid(scanner);
330 return emit_token_err(
331 &current, cstr("malformed number: invalid binary number"));
332 }
333 } else if (str_has_prefix(scanner->str, cstr("0x"))) {
334 scan_next(scanner);
335 scan_next(scanner);
336 while (scan_has_next(scanner)) {
337 char c = scan_peek(scanner);
338 if ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') ||
339 (c >= 'A' && c <= 'F')) {
340 scan_next(scanner);
341 continue;
342 }
343 if (is_valid_split(c)) {
344 break;
345 }
346 scan_skip_until_valid(scanner);
347 return emit_token_err(&current,
348 cstr("malformed number: invalid hex number"));
349 }
350 } else {
351 // TODO: handle fractions, signs and exponents properly.
352 while (scan_has_next(scanner)) {
353 char c = scan_peek(scanner);
354 switch (c) {
355 case '+':
356 case '-':
357 case 'e':
358 case 'E':
359 case '.':
360 scan_next(scanner);
361 continue;
362 }
363 if (c >= '0' && c <= '9') {
364 scan_next(scanner);
365 continue;
366 }
367 break;
368 }
369 }
370 return emit_token(current, scanner, TOK_NUMBER);
371}
372
247Token 373Token
248scan_token(Scanner *scanner) { 374scan_token(Scanner *scanner) {
249 assert(scanner); 375 assert(scanner);
250 376
251 scan_skip_whitespace(scanner); 377 scan_skip_whitespace(scanner);
252 if (!scan_has_next(scanner)) { 378 if (!scan_has_next(scanner)) {
253 return emit_token(scanner, TOK_EOF); 379 return emit_token(*scanner, scanner, TOK_EOF);
254 } 380 }
255 381
256 Scanner current = *scanner; 382 Scanner current = *scanner;
257 char c = scan_next(scanner); 383 char c = scan_next(scanner);
258 (void)c; 384 switch (c) {
259 // TODO: rest of the operations... 385 case '(':
386 return emit_token(current, scanner, TOK_LPAREN);
387 case ')':
388 return emit_token(current, scanner, TOK_RPAREN);
389 case '[':
390 return emit_token(current, scanner, TOK_LSQUARE);
391 case ']':
392 return emit_token(current, scanner, TOK_RSQUARE);
393 case '{':
394 return emit_token(current, scanner, TOK_LCURLY);
395 case '}':
396 return emit_token(current, scanner, TOK_RCURLY);
397 case '+': {
398 char p = scan_peek(scanner);
399 if (p >= '0' && p <= '9') {
400 *scanner = current;
401 return emit_token_number(scanner);
402 }
403 return emit_token(current, scanner, TOK_ADD);
404 };
405 case '-': {
406 char p = scan_peek(scanner);
407 if (p >= '0' && p <= '9') {
408 *scanner = current;
409 return emit_token_number(scanner);
410 }
411 return emit_token(current, scanner, TOK_ADD);
412 };
413 case '*':
414 return emit_token(current, scanner, TOK_MUL);
415 case '/':
416 return emit_token(current, scanner, TOK_DIV);
417 case '%':
418 return emit_token(current, scanner, TOK_MOD);
419 case '!': {
420 if (scan_peek(scanner) == '=') {
421 scan_next(scanner);
422 return emit_token(current, scanner, TOK_NOTEQ);
423 }
424 return emit_token(current, scanner, TOK_NOT);
425 };
426 case '=': {
427 if (scan_peek(scanner) == '=') {
428 scan_next(scanner);
429 return emit_token(current, scanner, TOK_EQ);
430 }
431 return emit_token(current, scanner, TOK_ASSIGN);
432 };
433 case '<': {
434 char p = scan_peek(scanner);
435 if (p == '=') {
436 scan_next(scanner);
437 return emit_token(current, scanner, TOK_LE);
438 }
439 if (p == '<') {
440 scan_next(scanner);
441 return emit_token(current, scanner, TOK_BITLSHIFT);
442 }
443 return emit_token(current, scanner, TOK_LT);
444 };
445 case '>': {
446 char p = scan_peek(scanner);
447 if (p == '=') {
448 scan_next(scanner);
449 return emit_token(current, scanner, TOK_GE);
450 }
451 if (p == '>') {
452 scan_next(scanner);
453 return emit_token(current, scanner, TOK_BITRSHIFT);
454 }
455 return emit_token(current, scanner, TOK_GT);
456 };
457 case '~':
458 return emit_token(current, scanner, TOK_BITNOT);
459 case '&': {
460 if (scan_peek(scanner) == '&') {
461 scan_next(scanner);
462 return emit_token(current, scanner, TOK_AND);
463 }
464 return emit_token(current, scanner, TOK_BITAND);
465 };
466 case '|': {
467 if (scan_peek(scanner) == '|') {
468 scan_next(scanner);
469 return emit_token(current, scanner, TOK_OR);
470 }
471 return emit_token(current, scanner, TOK_BITOR);
472 };
473 case ':':
474 return emit_token(current, scanner, TOK_COLON);
475 case '.':
476 return emit_token(current, scanner, TOK_DOT);
477 case '@':
478 return emit_token(current, scanner, TOK_AT);
479 case '"': {
480 while (scan_has_next(scanner)) {
481 c = scan_next(scanner);
482 if (c == '\\') {
483 scan_next(scanner);
484 continue;
485 }
486 if (c == '"') {
487 return emit_token(current, scanner, TOK_STRING);
488 }
489 }
490 return emit_token_err(&current, cstr("mismatched string quotes"));
491 };
492 }
493 if (c >= '0' && c <= '9') {
494 *scanner = current;
495 return emit_token_number(scanner);
496 }
260 497
261 // At this point we have an error, find the next newline. 498 // TODO: keywords & literals
262 scan_skip_line(scanner); 499 // Basic literals.
500 // TOK_SYMBOL,
501
502 // // Keywords.
503 // TOK_LET,
504 // TOK_SET,
505 // TOK_FUN,
506 // TOK_STRUCT,
507 // TOK_IF,
508 // TOK_MATCH,
509 // TOK_CASE,
510 // TOK_WHILE,
511 // TOK_CONTINUE,
512 // TOK_BREAK,
513 // TOK_RETURN,
514 // TOK_NIL,
515 // TOK_TRUE,
516 // TOK_FALSE,
517
518 // At this point we have an error, skip until we find whitespace again.
519 scan_skip_until_valid(scanner);
263 return emit_token_err(&current, cstr("unexpected character")); 520 return emit_token_err(&current, cstr("unexpected character"));
264} 521}
265 522
@@ -281,12 +538,13 @@ process_file(Str path) {
281 Token tok = {0}; 538 Token tok = {0};
282 while (tok.type != TOK_EOF) { 539 while (tok.type != TOK_EOF) {
283 tok = scan_token(&scanner); 540 tok = scan_token(&scanner);
284 println("%s:%d:%d:%s %s", path, tok.line, tok.col, token_str[tok.type], 541 eprintln("%s:%d:%d:%s %s", path, tok.line, tok.col, token_str[tok.type],
285 tok.val); 542 tok.val);
286 } 543 }
287 // while (true) { 544 // while (true) {
288 // Token tok = scan_token(&scanner); 545 // Token tok = scan_token(&scanner);
289 // println("%s:%d:%d:%s %s", path, tok.line, tok.col, token_str[tok.type], 546 // println("%s:%d:%d:%s %s", path, tok.line, tok.col,
547 // token_str[tok.type],
290 // tok.val); 548 // tok.val);
291 // if (tok.type == TOK_EOF) break; 549 // if (tok.type == TOK_EOF) break;
292 // } 550 // }