#ifndef LEXER_C #define LEXER_C #include "badlib.h" #define LEXER_MEM GB(2) typedef enum TokenKind { TOK_UNKNOWN = 0, // Parentheses. TOK_LPAREN, // ( TOK_RPAREN, // ) TOK_LSQUARE, // [ TOK_RSQUARE, // ] TOK_LCURLY, // { TOK_RCURLY, // } // Basic literals. TOK_CHAR, TOK_NUM_INT, TOK_NUM_FLOAT, TOK_SYMBOL, TOK_STRING, // Keywords. TOK_BREAK, // break TOK_CASE, // case TOK_COND, // match TOK_CONTINUE, // continue TOK_ELSE, // else TOK_ENUM, // enum TOK_FALSE, // false TOK_FUN, // fun TOK_IF, // if TOK_LET, // let TOK_MATCH, // match TOK_NIL, // nil TOK_RETURN, // return TOK_SET, // set TOK_STRUCT, // struct TOK_TRUE, // true TOK_WHILE, // while // Arithmetic ops. TOK_ADD, // + TOK_SUB, // - TOK_MUL, // * TOK_DIV, // / TOK_MOD, // % // Logical ops. TOK_NOT, // ! TOK_AND, // && TOK_OR, // || TOK_EQ, // == TOK_NEQ, // != TOK_LT, // < TOK_GT, // > TOK_LE, // <= TOK_GE, // >= // Bitwise ops. TOK_BITNOT, // ~ TOK_BITAND, // & TOK_BITOR, // | TOK_BITLSHIFT, // << TOK_BITRSHIFT, // >> // Special ops. TOK_COLON, // : TOK_DOT, // . TOK_AT, // @ TOK_ASSIGN, // = // End of file. TOK_EOF, } TokenKind; Str token_str[] = { [TOK_UNKNOWN] = cstr("UNKNOWN"), // Parentheses. [TOK_LPAREN] = cstr("LPAREN"), [TOK_RPAREN] = cstr("RPAREN"), [TOK_LSQUARE] = cstr("LSQUARE"), [TOK_RSQUARE] = cstr("RSQUARE"), [TOK_LCURLY] = cstr("LCURLY"), [TOK_RCURLY] = cstr("RCURLY"), // Basic literals. [TOK_CHAR] = cstr("CHAR"), [TOK_NUM_INT] = cstr("INUMBER"), [TOK_NUM_FLOAT] = cstr("FNUMBER"), [TOK_SYMBOL] = cstr("SYMBOL"), [TOK_STRING] = cstr("STRING"), // Keywords. [TOK_BREAK] = cstr("BREAK"), [TOK_CASE] = cstr("CASE"), [TOK_COND] = cstr("COND"), [TOK_CONTINUE] = cstr("CONTINUE"), [TOK_ELSE] = cstr("ELSE"), [TOK_ENUM] = cstr("ENUM"), [TOK_FALSE] = cstr("FALSE"), [TOK_FUN] = cstr("FUN"), [TOK_IF] = cstr("IF"), [TOK_LET] = cstr("LET"), [TOK_MATCH] = cstr("MATCH"), [TOK_NIL] = cstr("NIL"), [TOK_RETURN] = cstr("RETURN"), [TOK_SET] = cstr("SET"), [TOK_STRUCT] = cstr("STRUCT"), [TOK_TRUE] = cstr("TRUE"), [TOK_WHILE] = cstr("WHILE"), // Arithmetic ops. [TOK_ADD] = cstr("ADD"), [TOK_SUB] = cstr("SUB"), [TOK_MUL] = cstr("MUL"), [TOK_DIV] = cstr("DIV"), [TOK_MOD] = cstr("MOD"), // Logical ops. [TOK_NOT] = cstr("NOT"), [TOK_AND] = cstr("AND"), [TOK_OR] = cstr("OR"), [TOK_EQ] = cstr("EQ"), [TOK_NEQ] = cstr("NEQ"), [TOK_LT] = cstr("LT"), [TOK_GT] = cstr("GT"), [TOK_LE] = cstr("LE"), [TOK_GE] = cstr("GE"), // Bitwise ops. [TOK_BITNOT] = cstr("BITNOT"), [TOK_BITAND] = cstr("BITAND"), [TOK_BITOR] = cstr("BITOR"), [TOK_BITLSHIFT] = cstr("BITLSHIFT"), [TOK_BITRSHIFT] = cstr("BITRSHIFT"), // Special ops. [TOK_COLON] = cstr("COLON"), [TOK_DOT] = cstr("DOT"), [TOK_AT] = cstr("AT"), [TOK_ASSIGN] = cstr("ASSIGN"), // End of file. [TOK_EOF] = cstr("EOF"), }; typedef struct Token { TokenKind kind; Str val; sz line; sz col; } Token; typedef struct Scanner { Str str; sz line; sz col; } Scanner; char scan_next(Scanner *scanner) { char c = str_next(&scanner->str); if (c == '\n') { scanner->line++; scanner->col = 0; } else { scanner->col++; } return c; } bool scan_has_next(Scanner *scanner) { return scanner->str.size; } char scan_peek(Scanner *scanner) { return str_peek(scanner->str); } void scan_skip_line(Scanner *scanner) { SearchResult newline = array_find_next(scanner->str, cstr("\n")); if (newline.found) { scanner->str.mem += newline.pos + 1; scanner->str.size -= newline.pos + 1; scanner->line++; scanner->col = 0; } } void scan_skip_whitespace(Scanner *scanner) { while (scan_has_next(scanner)) { char c = scan_peek(scanner); switch (c) { case ' ': case ',': // Commas are just syntactic sugar. case '\f': case '\n': case '\r': case '\t': case '\v': { scan_next(scanner); } break; case ';': { // Found a comment! (skip) scan_skip_line(scanner); } break; default: { return; } break; } } } bool scan_is_valid_split(char c) { switch (c) { case ';': case '(': case ')': case '[': case ']': case '{': case '}': case '+': case '-': case '*': case '/': case '%': case '!': case '=': case '<': case '>': case '~': case '&': case '|': case ':': case '.': case '@': case '"': case ' ': case ',': case '\'': case '\f': case '\n': case '\r': case '\t': case '\v': { return true; } break; } return false; } void scan_skip_until_valid(Scanner *scanner) { while (scan_has_next(scanner)) { char c = scan_peek(scanner); if (scan_is_valid_split(c)) { return; } scan_next(scanner); } } Token emit_token(Scanner current, Scanner *scanner, TokenKind t) { Str val = current.str; val.size = current.str.size - scanner->str.size; val.size = val.size < 0 ? 0 : val.size; return (Token){ .val = val, .line = current.line + 1, .col = current.col + 1, .kind = t, }; } Token emit_token_err(Scanner *scanner, Str err_msg) { return (Token){ .line = scanner->line + 1, .col = scanner->col + 1, .val = err_msg, .kind = TOK_UNKNOWN, }; } Token emit_token_number(Scanner *scanner) { Scanner current = *scanner; char c = scan_peek(scanner); if (c == '+' || c == '-') { scan_next(scanner); if (str_has_prefix(scanner->str, cstr("0b")) || str_has_prefix(scanner->str, cstr("0x"))) { scan_skip_until_valid(scanner); return emit_token_err( ¤t, cstr("malformed number: binary/hex numbers can't be signed")); } } if (str_has_prefix(scanner->str, cstr("0b"))) { scan_next(scanner); scan_next(scanner); while (scan_has_next(scanner)) { c = scan_peek(scanner); if (c == '0' || c == '1' || c == '_') { scan_next(scanner); continue; } if (scan_is_valid_split(c)) { return emit_token(current, scanner, TOK_NUM_INT); } scan_skip_until_valid(scanner); return emit_token_err( ¤t, cstr("malformed number: invalid binary number")); } } else if (str_has_prefix(scanner->str, cstr("0x"))) { scan_next(scanner); scan_next(scanner); while (scan_has_next(scanner)) { c = scan_peek(scanner); if ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F') || c == '_') { scan_next(scanner); continue; } if (scan_is_valid_split(c)) { return emit_token(current, scanner, TOK_NUM_INT); } scan_skip_until_valid(scanner); return emit_token_err(¤t, cstr("malformed number: invalid hex number")); } } else { // Integral. while (scan_has_next(scanner)) { c = scan_peek(scanner); if (c == '.') { scan_next(scanner); break; } if ((c >= '0' && c <= '9') || c == '_') { scan_next(scanner); continue; } if (scan_is_valid_split(c)) { return emit_token(current, scanner, TOK_NUM_INT); } scan_skip_until_valid(scanner); return emit_token_err(¤t, cstr("malformed number")); } c = scan_peek(scanner); if (!(c >= '0' && c <= '9')) { return emit_token_err(¤t, cstr("malformed number: no decimal digits")); } // Decimals. while (scan_has_next(scanner)) { c = scan_peek(scanner); if (c == 'e' || c == 'E') { scan_next(scanner); break; } if ((c >= '0' && c <= '9') || c == '_') { scan_next(scanner); continue; } if (scan_is_valid_split(c)) { return emit_token(current, scanner, TOK_NUM_FLOAT); } scan_skip_until_valid(scanner); return emit_token_err(¤t, cstr("malformed number")); } // Exponent. c = scan_peek(scanner); if (c == '+' || c == '-') { scan_next(scanner); } while (scan_has_next(scanner)) { c = scan_peek(scanner); if ((c >= '0' && c <= '9') || c == '_') { scan_next(scanner); continue; } if (c == '.') { scan_next(scanner); return emit_token_err( ¤t, cstr("malformed number: decimals not allowed on exponent")); } if (scan_is_valid_split(c)) { return emit_token(current, scanner, TOK_NUM_FLOAT); } scan_skip_until_valid(scanner); return emit_token_err(¤t, cstr("malformed number")); } } return emit_token_err(¤t, cstr("malformed number")); } Token scan_token(Scanner *scanner) { assert(scanner); scan_skip_whitespace(scanner); if (!scan_has_next(scanner)) { return emit_token(*scanner, scanner, TOK_EOF); } Scanner current = *scanner; char c = scan_next(scanner); switch (c) { case '(': return emit_token(current, scanner, TOK_LPAREN); case ')': return emit_token(current, scanner, TOK_RPAREN); case '[': return emit_token(current, scanner, TOK_LSQUARE); case ']': return emit_token(current, scanner, TOK_RSQUARE); case '{': return emit_token(current, scanner, TOK_LCURLY); case '}': return emit_token(current, scanner, TOK_RCURLY); case '+': { char p = scan_peek(scanner); if (p >= '0' && p <= '9') { *scanner = current; return emit_token_number(scanner); } return emit_token(current, scanner, TOK_ADD); }; case '-': { char p = scan_peek(scanner); if (p >= '0' && p <= '9') { *scanner = current; return emit_token_number(scanner); } return emit_token(current, scanner, TOK_SUB); }; case '*': return emit_token(current, scanner, TOK_MUL); case '/': return emit_token(current, scanner, TOK_DIV); case '%': return emit_token(current, scanner, TOK_MOD); case '!': { if (scan_peek(scanner) == '=') { scan_next(scanner); return emit_token(current, scanner, TOK_NEQ); } return emit_token(current, scanner, TOK_NOT); }; case '=': { if (scan_peek(scanner) == '=') { scan_next(scanner); return emit_token(current, scanner, TOK_EQ); } return emit_token(current, scanner, TOK_ASSIGN); }; case '<': { char p = scan_peek(scanner); if (p == '=') { scan_next(scanner); return emit_token(current, scanner, TOK_LE); } if (p == '<') { scan_next(scanner); return emit_token(current, scanner, TOK_BITLSHIFT); } return emit_token(current, scanner, TOK_LT); }; case '>': { char p = scan_peek(scanner); if (p == '=') { scan_next(scanner); return emit_token(current, scanner, TOK_GE); } if (p == '>') { scan_next(scanner); return emit_token(current, scanner, TOK_BITRSHIFT); } return emit_token(current, scanner, TOK_GT); }; case '~': return emit_token(current, scanner, TOK_BITNOT); case '&': { if (scan_peek(scanner) == '&') { scan_next(scanner); return emit_token(current, scanner, TOK_AND); } return emit_token(current, scanner, TOK_BITAND); }; case '|': { if (scan_peek(scanner) == '|') { scan_next(scanner); return emit_token(current, scanner, TOK_OR); } return emit_token(current, scanner, TOK_BITOR); }; case ':': return emit_token(current, scanner, TOK_COLON); case '.': return emit_token(current, scanner, TOK_DOT); case '@': return emit_token(current, scanner, TOK_AT); case '\'': { c = scan_next(scanner); if (scan_next(scanner) == '\'') { return emit_token(current, scanner, TOK_CHAR); } return emit_token_err(¤t, cstr("mismatched char quote")); }; case '"': { while (scan_has_next(scanner)) { c = scan_next(scanner); if (c == '\\') { scan_next(scanner); continue; } if (c == '"') { return emit_token(current, scanner, TOK_STRING); } } return emit_token_err(¤t, cstr("mismatched string quotes")); }; } if (c >= '0' && c <= '9') { *scanner = current; return emit_token_number(scanner); } scan_skip_until_valid(scanner); Str val = current.str; val.size = current.str.size - scanner->str.size; val.size = val.size < 0 ? 0 : val.size; if (val.size == 0) { return emit_token_err(¤t, cstr("unexpected character")); } switch (val.mem[0]) { case 'b': { if (str_has_prefix(val, cstr("break"))) { return emit_token(current, scanner, TOK_BREAK); } } break; case 'c': { if (str_has_prefix(val, cstr("case"))) { return emit_token(current, scanner, TOK_CASE); } if (str_has_prefix(val, cstr("continue"))) { return emit_token(current, scanner, TOK_CONTINUE); } if (str_has_prefix(val, cstr("cond"))) { return emit_token(current, scanner, TOK_COND); } } break; case 'e': { if (str_has_prefix(val, cstr("else"))) { return emit_token(current, scanner, TOK_ELSE); } if (str_has_prefix(val, cstr("enum"))) { return emit_token(current, scanner, TOK_ENUM); } } break; case 'f': { if (str_has_prefix(val, cstr("false"))) { return emit_token(current, scanner, TOK_FALSE); } if (str_has_prefix(val, cstr("fun"))) { return emit_token(current, scanner, TOK_FUN); } } break; case 'i': { if (str_has_prefix(val, cstr("if"))) { return emit_token(current, scanner, TOK_IF); } } break; case 'l': { if (str_has_prefix(val, cstr("let"))) { return emit_token(current, scanner, TOK_LET); } } break; case 'm': { if (str_has_prefix(val, cstr("match"))) { return emit_token(current, scanner, TOK_MATCH); } } break; case 'n': { if (str_has_prefix(val, cstr("nil"))) { return emit_token(current, scanner, TOK_NIL); } } break; case 'r': { if (str_has_prefix(val, cstr("return"))) { return emit_token(current, scanner, TOK_RETURN); } } break; case 's': { if (str_has_prefix(val, cstr("set"))) { return emit_token(current, scanner, TOK_SET); } if (str_has_prefix(val, cstr("struct"))) { return emit_token(current, scanner, TOK_STRUCT); } } break; case 't': { if (str_has_prefix(val, cstr("true"))) { return emit_token(current, scanner, TOK_TRUE); } } break; case 'w': { if (str_has_prefix(val, cstr("while"))) { return emit_token(current, scanner, TOK_WHILE); } } break; } return emit_token(current, scanner, TOK_SYMBOL); } void print_token(Token tok) { println("%d:%d\t%s %s", tok.line, tok.col, token_str[tok.kind], tok.val); } void print_tokens(Str path, Token *tokens) { for (sz i = 0; i < array_size(tokens); i++) { Token tok = tokens[i]; print("%s:", path); print_token(tok); } } #endif // LEXER_C