From 893b52223d274c675272cee55768a9d5853420fb Mon Sep 17 00:00:00 2001
From: Bad Diode <bd@badd10de.dev>
Date: Sat, 15 Jun 2024 16:10:16 +0200
Subject: Finish basic lexing

---
 Makefile           |   2 +-
 src/main.c         | 163 +++++++++++++++++++++++++++++++++++------------------
 tests/literals.bad |   1 -
 3 files changed, 108 insertions(+), 58 deletions(-)

diff --git a/Makefile b/Makefile
index 1d894dd..45c1389 100644
--- a/Makefile
+++ b/Makefile
@@ -47,7 +47,7 @@ tests: $(BIN)
 	./$(BIN) tests/constants/numbers.bdl
 
 run: $(BIN)
-	$(BIN) tests/simple.bad
+	$(BIN) tests/literals.bad
 
 viz_lex: $(BIN)
 	$(BIN) -pl example.bdl
diff --git a/src/main.c b/src/main.c
index 9246092..edd70aa 100644
--- a/src/main.c
+++ b/src/main.c
@@ -25,12 +25,12 @@ typedef enum TokenType {
     TOK_UNKNOWN = 0,
 
     // Parentheses.
-    TOK_LPAREN,
-    TOK_RPAREN,
-    TOK_LSQUARE,
-    TOK_RSQUARE,
-    TOK_LCURLY,
-    TOK_RCURLY,
+    TOK_LPAREN,   // (
+    TOK_RPAREN,   // )
+    TOK_LSQUARE,  // [
+    TOK_RSQUARE,  // ]
+    TOK_LCURLY,   // {
+    TOK_RCURLY,   // }
 
     // Basic literals.
     TOK_NUMBER,
@@ -38,20 +38,20 @@ typedef enum TokenType {
     TOK_STRING,
 
     // Keywords.
-    TOK_LET,
-    TOK_SET,
-    TOK_FUN,
-    TOK_STRUCT,
-    TOK_IF,
-    TOK_MATCH,
-    TOK_CASE,
-    TOK_WHILE,
-    TOK_CONTINUE,
-    TOK_BREAK,
-    TOK_RETURN,
-    TOK_NIL,
-    TOK_TRUE,
-    TOK_FALSE,
+    TOK_BREAK,     // break
+    TOK_CASE,      // case
+    TOK_CONTINUE,  // continue
+    TOK_FALSE,     // false
+    TOK_FUN,       // fun
+    TOK_IF,        // if
+    TOK_LET,       // let
+    TOK_MATCH,     // match
+    TOK_NIL,       // nil
+    TOK_RETURN,    // return
+    TOK_SET,       // set
+    TOK_STRUCT,    // struct
+    TOK_TRUE,      // true
+    TOK_WHILE,     // while
 
     // Arithmetic ops.
     TOK_ADD,  // +
@@ -105,20 +105,20 @@ Str token_str[] = {
     [TOK_STRING] = cstr("STRING"),
 
     // Keywords.
-    [TOK_LET] = cstr("LET"),
-    [TOK_SET] = cstr("SET"),
+    [TOK_BREAK] = cstr("BREAK"),
+    [TOK_CASE] = cstr("CASE"),
+    [TOK_CONTINUE] = cstr("CONTINUE"),
+    [TOK_FALSE] = cstr("FALSE"),
     [TOK_FUN] = cstr("FUN"),
-    [TOK_STRUCT] = cstr("STRUCT"),
     [TOK_IF] = cstr("IF"),
+    [TOK_LET] = cstr("LET"),
     [TOK_MATCH] = cstr("MATCH"),
-    [TOK_CASE] = cstr("CASE"),
-    [TOK_WHILE] = cstr("WHILE"),
-    [TOK_CONTINUE] = cstr("CONTINUE"),
-    [TOK_BREAK] = cstr("BREAK"),
-    [TOK_RETURN] = cstr("RETURN"),
     [TOK_NIL] = cstr("NIL"),
+    [TOK_RETURN] = cstr("RETURN"),
+    [TOK_SET] = cstr("SET"),
+    [TOK_STRUCT] = cstr("STRUCT"),
     [TOK_TRUE] = cstr("TRUE"),
-    [TOK_FALSE] = cstr("FALSE"),
+    [TOK_WHILE] = cstr("WHILE"),
 
     // Arithmetic ops.
     [TOK_ADD] = cstr("ADD"),
@@ -319,7 +319,7 @@ emit_token_number(Scanner *scanner) {
         scan_next(scanner);
         while (scan_has_next(scanner)) {
             c = scan_peek(scanner);
-            if (c == '0' || c == '1') {
+            if (c == '0' || c == '1' || c == '_') {
                 scan_next(scanner);
                 continue;
             }
@@ -336,7 +336,7 @@ emit_token_number(Scanner *scanner) {
         while (scan_has_next(scanner)) {
             c = scan_peek(scanner);
             if ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') ||
-                (c >= 'A' && c <= 'F')) {
+                (c >= 'A' && c <= 'F') || c == '_') {
                 scan_next(scanner);
                 continue;
             }
@@ -355,7 +355,7 @@ emit_token_number(Scanner *scanner) {
                 scan_next(scanner);
                 break;
             }
-            if (c >= '0' && c <= '9') {
+            if ((c >= '0' && c <= '9') || c == '_') {
                 scan_next(scanner);
                 continue;
             }
@@ -377,7 +377,7 @@ emit_token_number(Scanner *scanner) {
                 scan_next(scanner);
                 break;
             }
-            if (c >= '0' && c <= '9') {
+            if ((c >= '0' && c <= '9') || c == '_') {
                 scan_next(scanner);
                 continue;
             }
@@ -394,7 +394,7 @@ emit_token_number(Scanner *scanner) {
         }
         while (scan_has_next(scanner)) {
             c = scan_peek(scanner);
-            if (c >= '0' && c <= '9') {
+            if ((c >= '0' && c <= '9') || c == '_') {
                 scan_next(scanner);
                 continue;
             }
@@ -539,29 +539,80 @@ scan_token(Scanner *scanner) {
         return emit_token_number(scanner);
     }
 
-    // TODO: keywords & literals
-    // Basic literals.
-    // TOK_SYMBOL,
-
-    // // Keywords.
-    // TOK_LET,
-    // TOK_SET,
-    // TOK_FUN,
-    // TOK_STRUCT,
-    // TOK_IF,
-    // TOK_MATCH,
-    // TOK_CASE,
-    // TOK_WHILE,
-    // TOK_CONTINUE,
-    // TOK_BREAK,
-    // TOK_RETURN,
-    // TOK_NIL,
-    // TOK_TRUE,
-    // TOK_FALSE,
-
-    // At this point we have an error, skip until we find whitespace again.
     scan_skip_until_valid(scanner);
-    return emit_token_err(&current, cstr("unexpected character"));
+    Str val = current.str;
+    val.size = current.str.size - scanner->str.size;
+    val.size = val.size < 0 ? 0 : val.size;
+    if (val.size == 0) {
+        return emit_token_err(&current, cstr("unexpected character"));
+    }
+    switch (val.mem[0]) {
+        case 'b': {
+            if (str_has_prefix(val, cstr("break"))) {
+                return emit_token(current, scanner, TOK_BREAK);
+            }
+        } break;
+        case 'c': {
+            if (str_has_prefix(val, cstr("case"))) {
+                return emit_token(current, scanner, TOK_CASE);
+            }
+            if (str_has_prefix(val, cstr("continue"))) {
+                return emit_token(current, scanner, TOK_CONTINUE);
+            }
+        } break;
+        case 'f': {
+            if (str_has_prefix(val, cstr("false"))) {
+                return emit_token(current, scanner, TOK_FALSE);
+            }
+            if (str_has_prefix(val, cstr("fun"))) {
+                return emit_token(current, scanner, TOK_FUN);
+            }
+        } break;
+        case 'i': {
+            if (str_has_prefix(val, cstr("if"))) {
+                return emit_token(current, scanner, TOK_IF);
+            }
+        } break;
+        case 'l': {
+            if (str_has_prefix(val, cstr("let"))) {
+                return emit_token(current, scanner, TOK_LET);
+            }
+        } break;
+        case 'm': {
+            if (str_has_prefix(val, cstr("match"))) {
+                return emit_token(current, scanner, TOK_MATCH);
+            }
+        } break;
+        case 'n': {
+            if (str_has_prefix(val, cstr("nil"))) {
+                return emit_token(current, scanner, TOK_NIL);
+            }
+        } break;
+        case 'r': {
+            if (str_has_prefix(val, cstr("return"))) {
+                return emit_token(current, scanner, TOK_RETURN);
+            }
+        } break;
+        case 's': {
+            if (str_has_prefix(val, cstr("set"))) {
+                return emit_token(current, scanner, TOK_SET);
+            }
+            if (str_has_prefix(val, cstr("struct"))) {
+                return emit_token(current, scanner, TOK_STRUCT);
+            }
+        } break;
+        case 't': {
+            if (str_has_prefix(val, cstr("true"))) {
+                return emit_token(current, scanner, TOK_TRUE);
+            }
+        } break;
+        case 'w': {
+            if (str_has_prefix(val, cstr("while"))) {
+                return emit_token(current, scanner, TOK_WHILE);
+            }
+        } break;
+    }
+    return emit_token(current, scanner, TOK_SYMBOL);
 }
 
 void
diff --git a/tests/literals.bad b/tests/literals.bad
index 673494d..e958bba 100644
--- a/tests/literals.bad
+++ b/tests/literals.bad
@@ -23,7 +23,6 @@
 2.45e5      ; we can use scientific notation
 +1.23e+6    ; +/- can be on the number and/or the exponent
 -3.14e-1    ; the exponents are always integers
--3.21e+0xff ; ... in any of its forms
 
 ; Booleans.
 true
-- 
cgit v1.2.1