From e73a4c16a2269cdb2f5e7d66fb9839e4c44e14de Mon Sep 17 00:00:00 2001 From: Bad Diode Date: Fri, 29 Oct 2021 15:37:28 +0200 Subject: Prepare third compiler implementation --- src/common.h | 31 +++++++ src/darray.h | 81 ++++++++++++++++++ src/errors.c | 46 ++++++++++ src/errors.h | 48 +++++++++++ src/lexer.c | 244 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/lexer.h | 67 +++++++++++++++ src/main.c | 137 ++++++++++++++++++++++++++++++ src/string_view.c | 40 +++++++++ src/string_view.h | 25 ++++++ 9 files changed, 719 insertions(+) create mode 100644 src/common.h create mode 100755 src/darray.h create mode 100755 src/errors.c create mode 100755 src/errors.h create mode 100755 src/lexer.c create mode 100755 src/lexer.h create mode 100755 src/main.c create mode 100755 src/string_view.c create mode 100755 src/string_view.h (limited to 'src') diff --git a/src/common.h b/src/common.h new file mode 100644 index 0000000..08e78a8 --- /dev/null +++ b/src/common.h @@ -0,0 +1,31 @@ +#ifndef BDL_COMMON_H +#define BDL_COMMON_H + +#include +#include +#include +#include + +typedef uint8_t u8; +typedef uint16_t u16; +typedef uint32_t u32; +typedef uint64_t u64; +typedef int8_t s8; +typedef int16_t s16; +typedef int32_t s32; +typedef int64_t s64; +typedef volatile u8 vu8; +typedef volatile u16 vu16; +typedef volatile u32 vu32; +typedef volatile u64 vu64; +typedef volatile s8 vs8; +typedef volatile s16 vs16; +typedef volatile s32 vs32; +typedef volatile s64 vs64; + +#define KB(N) ((u64)(N) * 1024) +#define MB(N) ((u64)KB(N) * 1024) +#define GB(N) ((u64)MB(N) * 1024) +#define TB(N) ((u64)GB(N) * 1024) + +#endif // BDL_COMMON_H diff --git a/src/darray.h b/src/darray.h new file mode 100755 index 0000000..fa4e293 --- /dev/null +++ b/src/darray.h @@ -0,0 +1,81 @@ +#ifndef BDL_DARRAY_H +#define BDL_DARRAY_H + +#include + +typedef struct ArrayHeader { + size_t size; + size_t cap; +} ArrayHeader; + +// Header/Size/capacity accessors. +#define array_head(ARR) ((ArrayHeader *)((char *)(ARR) - sizeof(ArrayHeader))) +#define array_size(ARR) ((ARR) ? array_head(ARR)->size : 0) +#define array_cap(ARR) ((ARR) ? array_head(ARR)->cap : 0) + +// Initialize a dynamic array ARR with N elements. The initialization doesn't +// zero out the data, so thread carefully.. +#define array_init(ARR,N) ((ARR) = _array_reserve(N, sizeof(*(ARR)))) + +// Push a given element T to the dynamic array ARR. +#define array_push(ARR, T) \ + ((ARR) = _array_maybe_grow(ARR, sizeof(T)), \ + (ARR)[array_head(ARR)->size++] = (T)) + +// Return the last element of the array. Can be used to build stacks. +#define array_pop(ARR) (ARR)[--array_head(ARR)->size] + +// Return the value stored at the OFFSET position from the tail of the array. +#define array_peek(ARR, OFFSET) (ARR)[array_head(ARR)->size - 1 - (OFFSET)] + +// Insert N bytes from the SRC array into the ARR dynamic array. +#define array_insert(ARR, SRC, N) \ + ((ARR) = _array_insert(ARR, SRC, N, sizeof(*(ARR)))) + +// Free the memory from the original allocated position. +#define array_free(ARR) ((ARR) ? free(array_head(ARR)), (ARR) = NULL : 0) + +static inline void * +_array_reserve(size_t num_elem, size_t type_size) { + char *p = malloc(num_elem * type_size + sizeof(ArrayHeader)); + p += sizeof(ArrayHeader); + array_head(p)->size = 0; + array_head(p)->cap = num_elem; + return p; +} + +static inline void * +_array_maybe_grow(void *arr, size_t type_size) { + ArrayHeader *head = array_head(arr); + if (head->cap == head->size) { + if (head->cap == 0) { + head->cap++; + } else { + head->cap *= 2; + } + head = realloc(head, head->cap * type_size + sizeof(ArrayHeader)); + } + arr = (char *)head + sizeof(ArrayHeader); + return arr; +} + +static inline +char * _array_insert(char *arr, const char *src, size_t n_bytes, size_t type_size) { + ArrayHeader *head = array_head(arr); + size_t new_size = n_bytes + head->size; + if (new_size > head->cap * type_size) { + if (head->cap == 0) { + head->cap = 1; + } + while (new_size >= head->cap * type_size) { + head->cap *= 2; + } + head = realloc(head, head->cap * type_size + sizeof(ArrayHeader)); + } + arr = (char *)head + sizeof(ArrayHeader); + memcpy((arr + head->size), src, n_bytes); + head->size = new_size; + return arr; +} + +#endif // BDL_DARRAY_H diff --git a/src/errors.c b/src/errors.c new file mode 100755 index 0000000..11348fd --- /dev/null +++ b/src/errors.c @@ -0,0 +1,46 @@ +#include "errors.h" + +static const char* error_msgs[] = { + [ERR_UNKNOWN] = "error: something unexpected happened", + [ERR_UNMATCHED_STRING] = "error: unmatched string delimiter", + [ERR_UNBALANCED_PAREN] = "error: unbalanced parentheses", + [ERR_NOT_IMPLEMENTED] = "error: not implemented", + [ERR_EOF_REACHED] = "error: EOF reached", + [ERR_UNKNOWN_TOKEN] = "error: unknown token", + [ERR_UNKNOWN_OBJ_TYPE] = "error: can't eval unknown object type", + [ERR_NOT_A_SYMBOL] = "error: object is not a symbol", + [ERR_SYMBOL_NOT_FOUND] = "error: symbol not found", + [ERR_NOT_CALLABLE] = "error: not callable", + [ERR_NOT_ENOUGH_ARGS] = "error: not enough arguments", + [ERR_TOO_MANY_ARGS] = "error: too many arguments", + [ERR_WRONG_ARG_TYPE] = "error: wrong argument type", + [ERR_DIVISION_BY_ZERO] = "error: division by zero", + [ERR_AMBIGUOUS_PARAMS] = "error: ambiguous parameter names", +}; + +void +error_push(Errors *errors, Error error) { + if (errors->n < ERR_MAX_NUMBER) { + errors->errors[errors->n++] = error; + } +} + +void +report_errors(Errors *errors, const char *file_name) { + for (size_t i = 0; i < errors->n; i++) { + Error err = errors->errors[i]; + fprintf(stderr, "%s", file_name); + if (err.line != 0) { + fprintf(stderr, ":%ld:%ld", err.line, err.col); + } + switch (err.type) { + case ERR_TYPE_LEXER: { fprintf(stderr, ": [lexer]"); } break; + case ERR_TYPE_COMPILER: { fprintf(stderr, ": [compiler]"); } break; + case ERR_TYPE_RUNTIME: { fprintf(stderr, ": [runtime]"); } break; + case ERR_TYPE_PARSER: { fprintf(stderr, ": [parser]"); } break; + default: break; + } + fprintf(stderr, " %s\n", error_msgs[err.value]); + } + errors->n = 0; +} diff --git a/src/errors.h b/src/errors.h new file mode 100755 index 0000000..7d8e977 --- /dev/null +++ b/src/errors.h @@ -0,0 +1,48 @@ +#ifndef BDL_ERRORS_H +#define BDL_ERRORS_H + +#include "common.h" + +typedef enum ErrorType { + ERR_TYPE_LEXER, + ERR_TYPE_PARSER, + ERR_TYPE_COMPILER, + ERR_TYPE_RUNTIME, +} ErrorType; + +typedef enum ErrorValue { + ERR_UNKNOWN = 0, + ERR_UNMATCHED_STRING, + ERR_UNBALANCED_PAREN, + ERR_NOT_IMPLEMENTED, + ERR_EOF_REACHED, + ERR_UNKNOWN_TOKEN, + ERR_UNKNOWN_OBJ_TYPE, + ERR_NOT_A_SYMBOL, + ERR_SYMBOL_NOT_FOUND, + ERR_NOT_CALLABLE, + ERR_NOT_ENOUGH_ARGS, + ERR_TOO_MANY_ARGS, + ERR_WRONG_ARG_TYPE, + ERR_DIVISION_BY_ZERO, + ERR_AMBIGUOUS_PARAMS, +} ErrorValue; + +typedef struct Error { + ErrorType type; + ErrorValue value; + size_t line; + size_t col; +} Error; + +#define ERR_MAX_NUMBER 16 + +typedef struct Errors { + Error errors[ERR_MAX_NUMBER]; + size_t n; +} Errors; + +void error_push(Errors *errors, Error error); +void report_errors(Errors *errors, const char *file_name); + +#endif // BDL_ERRORS_H diff --git a/src/lexer.c b/src/lexer.c new file mode 100755 index 0000000..6a417e4 --- /dev/null +++ b/src/lexer.c @@ -0,0 +1,244 @@ +#include "lexer.h" +#include "errors.h" + +static const char* token_str[] = { + [TOKEN_UNKNOWN] = "TOKEN_UNKNOWN", + [TOKEN_LPAREN] = "TOKEN_LPAREN", + [TOKEN_RPAREN] = "TOKEN_RPAREN", + [TOKEN_FIXNUM] = "TOKEN_FIXNUM", + [TOKEN_SYMBOL] = "TOKEN_SYMBOL", + [TOKEN_STRING] = "TOKEN_STRING", + [TOKEN_NIL] = "TOKEN_NIL", + [TOKEN_TRUE] = "TOKEN_TRUE", + [TOKEN_FALSE] = "TOKEN_FALSE", + [TOKEN_EOF] = "TOKEN_EOF", +}; + +void +print_token(Token tok) { + printf("[%4ld:%-4ld] ", tok.line, tok.column); + printf("%s", token_str[tok.type]); + switch (tok.type) { + case TOKEN_FIXNUM: { + printf(" -> "); + sv_write(&tok.value); + } break; + case TOKEN_SYMBOL: { + printf(" -> "); + sv_write(&tok.value); + } break; + case TOKEN_STRING: { + printf(" -> "); + sv_write(&tok.value); + } break; + default: { + } break; + } + printf("\n"); +} + +char +scan_next(Scanner *scanner) { + char c = sv_next(&scanner->current); + if (c == '\n') { + scanner->line_number++; + scanner->col_number = 1; + } else { + scanner->col_number++; + } + scanner->offset++; + return c; +} + +char +scan_peek(const Scanner *scanner) { + return sv_peek(&scanner->current); +} + +bool +scan_has_next(const Scanner *scanner) { + return scanner->current.n != 0; +} + +void +skip_whitespace(Scanner *scanner) { + while (scan_has_next(scanner)) { + char c = scan_peek(scanner); + switch (c) { + case ' ': + case '\f': + case '\n': + case '\r': + case '\t': + case '\v': { + scan_next(scanner); + } break; + default: { + return; + } break; + } + } +} + +bool +is_delimiter(char c) { + switch (c) { + case EOF: + case '\0': + case ';': + case '"': + case '\'': + case '(': + case ')': + case ' ': + case '\f': + case '\n': + case '\r': + case '\t': + case '\v': { + return true; + } break; + } + return false; +} + +#define TOKEN_IS_KEYWORD(VAL, KEYWORD) \ + sv_equal(&(VAL), &(StringView){(KEYWORD), sizeof(KEYWORD) - 1}) + +TokenType +find_primitive_type(const StringView value) { + bool is_fixnum = true; + for (size_t i = 0; i < value.n; i++) { + char c = value.start[i]; + if (i == 0 && c == '-' && value.n > 1) { + continue; + } + if (!(c >= '0' && c <= '9')) { + is_fixnum = false; + break; + } + } + if (is_fixnum) { + return TOKEN_FIXNUM; + } + if (TOKEN_IS_KEYWORD(value, "nil")) { return TOKEN_NIL; } + if (TOKEN_IS_KEYWORD(value, "true")) { return TOKEN_TRUE; } + if (TOKEN_IS_KEYWORD(value, "false")) { return TOKEN_FALSE; } + + return TOKEN_SYMBOL; +} + +Tokens +tokenize(const StringView *sv) { + Tokens tokens = {0}; + tokens.tokens = NULL; + array_init(tokens.tokens, 1); + Scanner scanner = (Scanner){ + .current = *sv, + .line_number = 1, + .col_number = 1, + }; + + while (scan_has_next(&scanner)) { + skip_whitespace(&scanner); + size_t line = scanner.line_number; + size_t col = scanner.col_number; + size_t offset = scanner.offset; + char c = scan_next(&scanner); + switch (c) { + case ';': { + while ((c = scan_next(&scanner)) != '\n' && c != '\0') {} + } break; + case '"': { + char prev = c; + bool found = false; + size_t n = 0; + while (scan_has_next(&scanner)) { + c = scan_next(&scanner); + if (c == '"' && prev != '\\') { + found = true; + break; + } + prev = c; + n++; + } + if (!found) { + error_push(&tokens.errors, (Error){ + .type = ERR_TYPE_LEXER, + .value = ERR_UNMATCHED_STRING, + .line = line, + .col = col, + }); + return tokens; + } + Token token = (Token){ + .value = (StringView){ + .start = &sv->start[offset + 1], + .n = n, + }, + .type = TOKEN_STRING, + .line = line, + .column = col, + }; + array_push(tokens.tokens, token); + } break; + case '(': { + if (scan_peek(&scanner) == ')') { + scan_next(&scanner); + Token token = (Token){ + .type = TOKEN_NIL, + .line = line, + .column = col, + }; + array_push(tokens.tokens, token); + } else { + Token token = (Token){ + .type = TOKEN_LPAREN, + .line = line, + .column = col, + }; + array_push(tokens.tokens, token); + } + } break; + case ')': { + Token token = (Token){ + .type = TOKEN_RPAREN, + .line = line, + .column = col, + }; + array_push(tokens.tokens, token); + } break; + default: { + size_t n = 1; + while (!is_delimiter(scan_peek(&scanner))) { + scan_next(&scanner); + n++; + } + if (c == EOF || c == '\0') { + break; + } + Token token = (Token){ + .value = (StringView){ + .start = &sv->start[offset], + .n = n, + }, + .type = TOKEN_SYMBOL, + .line = line, + .column = col, + }; + token.type = find_primitive_type(token.value); + array_push(tokens.tokens, token); + } break; + } + } + + // Push EOF token. + Token token = (Token){ + .type = TOKEN_EOF, + .line = scanner.line_number, + .column = 1, + }; + array_push(tokens.tokens, token); + + return tokens; +} diff --git a/src/lexer.h b/src/lexer.h new file mode 100755 index 0000000..e56f5f2 --- /dev/null +++ b/src/lexer.h @@ -0,0 +1,67 @@ +#ifndef BDL_LEXER_H +#define BDL_LEXER_H + +#include "string_view.h" + +typedef enum TokenType { + TOKEN_UNKNOWN = 0, + + // Parentheses. + TOKEN_LPAREN, + TOKEN_RPAREN, + + // Primitive types. + TOKEN_FIXNUM, + TOKEN_SYMBOL, + TOKEN_STRING, + TOKEN_NIL, + TOKEN_TRUE, + TOKEN_FALSE, + + // End of file. + TOKEN_EOF, +} TokenType; + +typedef struct Token { + TokenType type; + StringView value; + size_t line; + size_t column; +} Token; + +typedef struct Tokens { + Token *tokens; + Errors errors; +} Tokens; + +typedef struct Scanner { + StringView current; + size_t line_number; + size_t col_number; + size_t offset; +} Scanner; + +// Print a token to standard output for debugging purposes. +void print_token(Token tok); + +// Same functionality as the ScanView pairs, but keeping track of line and +// column numbers. +char scan_next(Scanner *scanner); +char scan_peek(const Scanner *scanner); + +// Check if the current scanner still have characters left. +bool scan_has_next(const Scanner *scanner); + +// Advance the scanner until we ran out of whitespace. +void skip_whitespace(Scanner *scanner); + +// Check if a given character is a delimiter. +bool is_delimiter(char c); + +// Extract the token type from the current string. +TokenType find_primitive_type(const StringView value); + +// Generate a list of tokens from the given string. +Tokens tokenize(const StringView *sv); + +#endif // BDL_LEXER_H diff --git a/src/main.c b/src/main.c new file mode 100755 index 0000000..90860e8 --- /dev/null +++ b/src/main.c @@ -0,0 +1,137 @@ +#include +#include +#include + +#include "common.h" +#include "darray.h" +#include "string_view.c" +#include "errors.c" +#include "lexer.c" + +void +init(void) { + // STUB +} + +void +halt(void) { + // STUB +} + +void +process_source(const StringView *source, const char *file_name) { + // Read tokens. + Tokens tokens = tokenize(source); + if (tokens.errors.n != 0) { + report_errors(&tokens.errors, file_name); + exit(EXIT_FAILURE); + } + + // TODO: Parser. + // TODO: Semantic analysis. + // TODO: Optimization. + // TODO: Compilation. + + // Free resources. + array_free(tokens.tokens); +} + +void +run_file(char *file_name) { + FILE *file = fopen(file_name, "r"); + if (!file) { + fprintf(stderr, "error: couldn't open input file: %s\n", file_name); + exit(EXIT_FAILURE); + } + + // Read entire file into memory. + fseek(file, 0, SEEK_END); + size_t file_size = ftell(file); + fseek(file, 0, SEEK_SET); + + char *source = malloc(file_size + 1); + fread(source, 1, file_size, file); + source[file_size] = 0; + + StringView sv = (StringView){ + .start = source, + .n = file_size, + }; + + process_source(&sv, file_name); + + free(source); + fclose(file); +} + +#define STDIN_BUF_CAP 16 + +void +run_stdin(void) { + size_t buf_size = 0; + char *source = NULL; + array_init(source, STDIN_BUF_CAP); + + char c; + while ((c = getchar()) != EOF) { + array_push(source, c); + buf_size++; + } + + StringView sv = (StringView){ + .start = source, + .n = buf_size, + }; + + process_source(&sv, "stdin"); + + array_free(source); +} + +#ifndef BIN_NAME +#define BIN_NAME "bdl" +#endif + +void +print_usage(void) { + printf("Usage: %s [options] \n", BIN_NAME); + printf("\n"); + printf("\t-h \tShow usage.\n"); + printf("\n"); +} + +int +main(int argc, char *argv[]) { + init(); + + int option; + while ((option = getopt(argc, argv, "h")) != -1) { + switch (option) { + case 'h': { + print_usage(); + goto exit_success; + } break; + default: { + print_usage(); + return EXIT_FAILURE; + } break; + } + } + + // Run from stdin. + if (optind == argc) { + run_stdin(); + goto exit_success; + } + + // Run from file. + while (optind < argc) { + char *file_name = argv[optind]; + run_file(file_name); + optind++; + } + +exit_success: + halt(); + return EXIT_SUCCESS; +} diff --git a/src/string_view.c b/src/string_view.c new file mode 100755 index 0000000..8247bd4 --- /dev/null +++ b/src/string_view.c @@ -0,0 +1,40 @@ +#include "string_view.h" + +char +sv_next(StringView *sv) { + if (sv->n == 0) { + return '\0'; + } + char c = sv->start[0]; + sv->start++; + sv->n--; + return c; +} + +char +sv_peek(const StringView *sv) { + if (sv->n == 0) { + return '\0'; + } + return sv->start[0]; +} + +bool +sv_equal(const StringView *a, const StringView *b) { + if (a->n != b->n) { + return false; + } + for (size_t i = 0; i < a->n; i++) { + if (a->start[i] != b->start[i]) { + return false; + } + } + return true; +} + +void +sv_write(const StringView *sv) { + for (size_t i = 0; i < sv->n; i++) { + putchar(sv->start[i]); + } +} diff --git a/src/string_view.h b/src/string_view.h new file mode 100755 index 0000000..4dbbaaf --- /dev/null +++ b/src/string_view.h @@ -0,0 +1,25 @@ +#ifndef BDL_STRINGVIEW_H +#define BDL_STRINGVIEW_H + +#include "common.h" + +typedef struct StringView { + char *start; + size_t n; +} StringView; + +// Consume a character in the stream. +char sv_next(StringView *sv); + +// Check what is the current character in the stream. +char sv_peek(const StringView *sv); + +// Compare if the arguments are the same. +bool sv_equal(const StringView *a, const StringView *b); + +// Write a character to the given output stream. +void sv_write(const StringView *sv); + +#define STRING(STR) (StringView){(STR), sizeof(STR) - 1} + +#endif // BDL_STRINGVIEW_H -- cgit v1.2.1