From 893b52223d274c675272cee55768a9d5853420fb Mon Sep 17 00:00:00 2001 From: Bad Diode Date: Sat, 15 Jun 2024 16:10:16 +0200 Subject: Finish basic lexing --- Makefile | 2 +- src/main.c | 163 +++++++++++++++++++++++++++++++++++------------------ tests/literals.bad | 1 - 3 files changed, 108 insertions(+), 58 deletions(-) diff --git a/Makefile b/Makefile index 1d894dd..45c1389 100644 --- a/Makefile +++ b/Makefile @@ -47,7 +47,7 @@ tests: $(BIN) ./$(BIN) tests/constants/numbers.bdl run: $(BIN) - $(BIN) tests/simple.bad + $(BIN) tests/literals.bad viz_lex: $(BIN) $(BIN) -pl example.bdl diff --git a/src/main.c b/src/main.c index 9246092..edd70aa 100644 --- a/src/main.c +++ b/src/main.c @@ -25,12 +25,12 @@ typedef enum TokenType { TOK_UNKNOWN = 0, // Parentheses. - TOK_LPAREN, - TOK_RPAREN, - TOK_LSQUARE, - TOK_RSQUARE, - TOK_LCURLY, - TOK_RCURLY, + TOK_LPAREN, // ( + TOK_RPAREN, // ) + TOK_LSQUARE, // [ + TOK_RSQUARE, // ] + TOK_LCURLY, // { + TOK_RCURLY, // } // Basic literals. TOK_NUMBER, @@ -38,20 +38,20 @@ typedef enum TokenType { TOK_STRING, // Keywords. - TOK_LET, - TOK_SET, - TOK_FUN, - TOK_STRUCT, - TOK_IF, - TOK_MATCH, - TOK_CASE, - TOK_WHILE, - TOK_CONTINUE, - TOK_BREAK, - TOK_RETURN, - TOK_NIL, - TOK_TRUE, - TOK_FALSE, + TOK_BREAK, // break + TOK_CASE, // case + TOK_CONTINUE, // continue + TOK_FALSE, // false + TOK_FUN, // fun + TOK_IF, // if + TOK_LET, // let + TOK_MATCH, // match + TOK_NIL, // nil + TOK_RETURN, // return + TOK_SET, // set + TOK_STRUCT, // struct + TOK_TRUE, // true + TOK_WHILE, // while // Arithmetic ops. TOK_ADD, // + @@ -105,20 +105,20 @@ Str token_str[] = { [TOK_STRING] = cstr("STRING"), // Keywords. 
- [TOK_LET] = cstr("LET"), - [TOK_SET] = cstr("SET"), + [TOK_BREAK] = cstr("BREAK"), + [TOK_CASE] = cstr("CASE"), + [TOK_CONTINUE] = cstr("CONTINUE"), + [TOK_FALSE] = cstr("FALSE"), [TOK_FUN] = cstr("FUN"), - [TOK_STRUCT] = cstr("STRUCT"), [TOK_IF] = cstr("IF"), + [TOK_LET] = cstr("LET"), [TOK_MATCH] = cstr("MATCH"), - [TOK_CASE] = cstr("CASE"), - [TOK_WHILE] = cstr("WHILE"), - [TOK_CONTINUE] = cstr("CONTINUE"), - [TOK_BREAK] = cstr("BREAK"), - [TOK_RETURN] = cstr("RETURN"), [TOK_NIL] = cstr("NIL"), + [TOK_RETURN] = cstr("RETURN"), + [TOK_SET] = cstr("SET"), + [TOK_STRUCT] = cstr("STRUCT"), [TOK_TRUE] = cstr("TRUE"), - [TOK_FALSE] = cstr("FALSE"), + [TOK_WHILE] = cstr("WHILE"), // Arithmetic ops. [TOK_ADD] = cstr("ADD"), @@ -319,7 +319,7 @@ emit_token_number(Scanner *scanner) { scan_next(scanner); while (scan_has_next(scanner)) { c = scan_peek(scanner); - if (c == '0' || c == '1') { + if (c == '0' || c == '1' || c == '_') { scan_next(scanner); continue; } @@ -336,7 +336,7 @@ emit_token_number(Scanner *scanner) { while (scan_has_next(scanner)) { c = scan_peek(scanner); if ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || - (c >= 'A' && c <= 'F')) { + (c >= 'A' && c <= 'F') || c == '_') { scan_next(scanner); continue; } @@ -355,7 +355,7 @@ emit_token_number(Scanner *scanner) { scan_next(scanner); break; } - if (c >= '0' && c <= '9') { + if ((c >= '0' && c <= '9') || c == '_') { scan_next(scanner); continue; } @@ -377,7 +377,7 @@ emit_token_number(Scanner *scanner) { scan_next(scanner); break; } - if (c >= '0' && c <= '9') { + if ((c >= '0' && c <= '9') || c == '_') { scan_next(scanner); continue; } @@ -394,7 +394,7 @@ emit_token_number(Scanner *scanner) { } while (scan_has_next(scanner)) { c = scan_peek(scanner); - if (c >= '0' && c <= '9') { + if ((c >= '0' && c <= '9') || c == '_') { scan_next(scanner); continue; } @@ -539,29 +539,80 @@ scan_token(Scanner *scanner) { return emit_token_number(scanner); } - // TODO: keywords & literals - // Basic literals. 
- // TOK_SYMBOL, - - // // Keywords. - // TOK_LET, - // TOK_SET, - // TOK_FUN, - // TOK_STRUCT, - // TOK_IF, - // TOK_MATCH, - // TOK_CASE, - // TOK_WHILE, - // TOK_CONTINUE, - // TOK_BREAK, - // TOK_RETURN, - // TOK_NIL, - // TOK_TRUE, - // TOK_FALSE, - - // At this point we have an error, skip until we find whitespace again. scan_skip_until_valid(scanner); - return emit_token_err(&current, cstr("unexpected character")); + Str val = current.str; + val.size = current.str.size - scanner->str.size; + val.size = val.size < 0 ? 0 : val.size; + if (val.size == 0) { + return emit_token_err(&current, cstr("unexpected character")); + } + switch (val.mem[0]) { + case 'b': { + if (str_has_prefix(val, cstr("break"))) { + return emit_token(current, scanner, TOK_BREAK); + } + } break; + case 'c': { + if (str_has_prefix(val, cstr("case"))) { + return emit_token(current, scanner, TOK_CASE); + } + if (str_has_prefix(val, cstr("continue"))) { + return emit_token(current, scanner, TOK_CONTINUE); + } + } break; + case 'f': { + if (str_has_prefix(val, cstr("false"))) { + return emit_token(current, scanner, TOK_FALSE); + } + if (str_has_prefix(val, cstr("fun"))) { + return emit_token(current, scanner, TOK_FUN); + } + } break; + case 'i': { + if (str_has_prefix(val, cstr("if"))) { + return emit_token(current, scanner, TOK_IF); + } + } break; + case 'l': { + if (str_has_prefix(val, cstr("let"))) { + return emit_token(current, scanner, TOK_LET); + } + } break; + case 'm': { + if (str_has_prefix(val, cstr("match"))) { + return emit_token(current, scanner, TOK_MATCH); + } + } break; + case 'n': { + if (str_has_prefix(val, cstr("nil"))) { + return emit_token(current, scanner, TOK_NIL); + } + } break; + case 'r': { + if (str_has_prefix(val, cstr("return"))) { + return emit_token(current, scanner, TOK_RETURN); + } + } break; + case 's': { + if (str_has_prefix(val, cstr("set"))) { + return emit_token(current, scanner, TOK_SET); + } + if (str_has_prefix(val, cstr("struct"))) { + return emit_token(current, 
scanner, TOK_STRUCT); + } + } break; + case 't': { + if (str_has_prefix(val, cstr("true"))) { + return emit_token(current, scanner, TOK_TRUE); + } + } break; + case 'w': { + if (str_has_prefix(val, cstr("while"))) { + return emit_token(current, scanner, TOK_WHILE); + } + } break; + } + return emit_token(current, scanner, TOK_SYMBOL); } void diff --git a/tests/literals.bad b/tests/literals.bad index 673494d..e958bba 100644 --- a/tests/literals.bad +++ b/tests/literals.bad @@ -23,7 +23,6 @@ 2.45e5 ; we can use scientific notation +1.23e+6 ; +/- can be on the number and/or the exponent -3.14e-1 ; the exponents are always integers --3.21e+0xff ; ... in any of its forms ; Booleans. true -- cgit v1.2.1