author     Bad Diode <bd@badd10de.dev>   2021-10-11 19:33:29 +0200
committer  Bad Diode <bd@badd10de.dev>   2021-10-11 19:33:29 +0200
commit     463690390b45ddd96545ae958e2605e262966c9f (patch)
tree       a6d2b48a440dcee137b47730ca3d65e061f2d881
parent     1d0ee825c8b70e9456bebc4bf2bc8366c2e89cbd (diff)
Add a new version of the lexer for token gen
-rw-r--r--  src/bootstrap/lexer.c      307
-rwxr-xr-x  src/bootstrap/main.c        17
-rw-r--r--  src/bootstrap/read_line.c    1
3 files changed, 320 insertions, 5 deletions
diff --git a/src/bootstrap/lexer.c b/src/bootstrap/lexer.c
new file mode 100644
index 0000000..fc53d3c
--- /dev/null
+++ b/src/bootstrap/lexer.c
@@ -0,0 +1,307 @@
+typedef enum TokenType {
+    TOKEN_UNKNOWN = 0,
+    TOKEN_LPAREN,
+    TOKEN_RPAREN,
+    TOKEN_QUOTE,
+    TOKEN_TRUE,
+    TOKEN_FALSE,
+    TOKEN_NIL,
+    TOKEN_FIXNUM,
+    TOKEN_SYMBOL,
+    TOKEN_STRING,
+    TOKEN_EOF,
+} TokenType;
+
+typedef struct Token {
+    TokenType type;
+    StringView value;
+    size_t line;
+    size_t column;
+} Token;
+
+typedef struct Tokens {
+    Token *buf;
+    size_t size;
+    size_t cap;
+} Tokens;
+
+void
+print_token(Token tok) {
+    printf("LINE: %3ld COL: %3ld ", tok.line, tok.column);
+    switch (tok.type) {
+        case TOKEN_LPAREN: {
+            printf("TOKEN_LPAREN");
+        } break;
+        case TOKEN_RPAREN: {
+            printf("TOKEN_RPAREN");
+        } break;
+        case TOKEN_QUOTE: {
+            printf("TOKEN_QUOTE");
+        } break;
+        case TOKEN_TRUE: {
+            printf("TOKEN_TRUE");
+        } break;
+        case TOKEN_FALSE: {
+            printf("TOKEN_FALSE");
+        } break;
+        case TOKEN_NIL: {
+            printf("TOKEN_NIL");
+        } break;
+        case TOKEN_FIXNUM: {
+            printf("TOKEN_FIXNUM -> ");
+            sv_write(&tok.value, stdout);
+        } break;
+        case TOKEN_SYMBOL: {
+            printf("TOKEN_SYMBOL -> ");
+            sv_write(&tok.value, stdout);
+        } break;
+        case TOKEN_STRING: {
+            printf("TOKEN_STRING -> ");
+            sv_write(&tok.value, stdout);
+        } break;
+        case TOKEN_EOF: {
+            printf("TOKEN_EOF");
+        } break;
+        case TOKEN_UNKNOWN: {
+            printf("TOKEN_UNKNOWN");
+        } break;
+    }
+    printf("\n");
+}
+
+#define TOK_BUF_CAP 256
+
+void
+push_token(Tokens *tokens, Token tok) {
+    if (tokens->buf == NULL) {
+        tokens->size = 0;
+        tokens->cap = TOK_BUF_CAP;
+        tokens->buf = malloc(tokens->cap * sizeof(Token));
+    } else if (tokens->size == tokens->cap) {
+        tokens->cap *= 2;
+        tokens->buf = realloc(tokens->buf, tokens->cap * sizeof(Token));
+    }
+    tokens->buf[tokens->size++] = tok;
+}
+
+typedef struct Scanner {
+    StringView orig;
+    StringView current;
+    size_t line_number;
+    size_t col_number;
+    size_t offset;
+    size_t lexeme_n;
+    size_t lexeme_offset;
+    size_t lexeme_line_number;
+    size_t lexeme_col_number;
+} Scanner;
+
+char
+scan_next(Scanner *scanner) {
+    if (scanner->lexeme_n == 0) {
+        scanner->lexeme_line_number = scanner->line_number;
+        scanner->lexeme_col_number = scanner->col_number;
+        scanner->lexeme_offset = scanner->offset;
+    }
+    char c = sv_next(&scanner->current);
+    if (c == '\n') {
+        scanner->line_number++;
+        scanner->col_number = 1;
+    } else {
+        scanner->col_number++;
+    }
+    scanner->offset++;
+    return c;
+}
+
+char
+scan_peek(const Scanner *scanner) {
+    return sv_peek(&scanner->current);
+}
+
+bool
+scan_has_next(const Scanner *scanner) {
+    return scanner->current.n != 0;
+}
+
+bool
+scan_has_lexeme(const Scanner *scanner) {
+    return scanner->lexeme_n != 0;
+}
+
+Token
+scan_get_lexeme(Scanner *scanner) {
+    Token token = (Token){
+        .type = TOKEN_UNKNOWN,
+        .value = (StringView){
+            .start = &scanner->orig.start[scanner->lexeme_offset],
+            .n = scanner->lexeme_n,
+        },
+        .line = scanner->lexeme_line_number,
+        .column = scanner->lexeme_col_number,
+    };
+    scanner->lexeme_n = 0;
+    scanner->lexeme_line_number = scanner->line_number;
+    scanner->lexeme_col_number = scanner->col_number;
+    scanner->lexeme_offset = scanner->offset;
+    return token;
+}
+
+TokenType
+find_primitive_type(StringView value) {
+    bool is_fixnum = true;
+    for (size_t i = 0; i < value.n; i++) {
+        char c = value.start[i];
+        if (i == 0 && c == '-' && value.n > 1) {
+            continue;
+        }
+        if (!(c >= '0' && c <= '9')) {
+            is_fixnum = false;
+            break;
+        }
+    }
+    if (is_fixnum) {
+        return TOKEN_FIXNUM;
+    }
+    if (sv_equal(&value, &(StringView){"true", 4})) {
+        return TOKEN_TRUE;
+    }
+    if (sv_equal(&value, &(StringView){"false", 5})) {
+        return TOKEN_FALSE;
+    }
+    return TOKEN_SYMBOL;
+}
+
+Tokens
+tokenize(const StringView *sv) {
+    Tokens tokens = (Tokens){0};
+    Scanner scanner = (Scanner){
+        .orig = *sv,
+        .current = *sv,
+        .line_number = 1,
+        .col_number = 1,
+        .lexeme_line_number = 1,
+        .lexeme_col_number = 1,
+    };
+
+    while (scan_has_next(&scanner)) {
+        char c = scan_next(&scanner);
+        switch (c) {
+            case ' ':
+            case '\f':
+            case '\n':
+            case '\r':
+            case '\t':
+            case '\v': {
+                if (scan_has_lexeme(&scanner)) {
+                    Token token = scan_get_lexeme(&scanner);
+                    token.type = find_primitive_type(token.value);
+                    push_token(&tokens, token);
+                }
+            } break;
+            case ';': {
+                if (scan_has_lexeme(&scanner)) {
+                    Token token = scan_get_lexeme(&scanner);
+                    token.type = find_primitive_type(token.value);
+                    push_token(&tokens, token);
+                }
+                while ((c = scan_next(&scanner)) != '\n' && c != '\0') {}
+            } break;
+            case '"': {
+                if (scan_has_lexeme(&scanner)) {
+                    Token token = scan_get_lexeme(&scanner);
+                    token.type = find_primitive_type(token.value);
+                    push_token(&tokens, token);
+                    scanner.lexeme_col_number--;
+                    scanner.lexeme_offset--;
+                }
+
+                char prev = c;
+                bool found = false;
+                while (scan_has_next(&scanner)) {
+                    c = scan_next(&scanner);
+                    if (c == '"' && prev != '\\') {
+                        found = true;
+                        break;
+                    }
+                    prev = c;
+                    scanner.lexeme_n++;
+                }
+                scanner.lexeme_col_number--;
+                if (found) {
+                    Token token = scan_get_lexeme(&scanner);
+                    token.type = TOKEN_STRING;
+                    push_token(&tokens, token);
+                } else {
+                    // TODO: Report error: couldn't find the closing quotes.
+                }
+            } break;
+            case '\'': {
+                if (scan_has_lexeme(&scanner)) {
+                    Token token = scan_get_lexeme(&scanner);
+                    token.type = find_primitive_type(token.value);
+                    push_token(&tokens, token);
+                    scanner.lexeme_col_number--;
+                    scanner.lexeme_offset--;
+                }
+                Token token = scan_get_lexeme(&scanner);
+                token.type = TOKEN_QUOTE;
+                push_token(&tokens, token);
+            } break;
+            case '(': {
+                if (scan_has_lexeme(&scanner)) {
+                    Token token = scan_get_lexeme(&scanner);
+                    token.type = find_primitive_type(token.value);
+                    push_token(&tokens, token);
+                    scanner.lexeme_col_number--;
+                    scanner.lexeme_offset--;
+                }
+                scanner.lexeme_n++;
+                if (scan_peek(&scanner) == ')') {
+                    scanner.lexeme_n++;
+                    scan_next(&scanner);
+                    Token token = scan_get_lexeme(&scanner);
+                    token.type = TOKEN_NIL;
+                    push_token(&tokens, token);
+                } else {
+                    Token token = scan_get_lexeme(&scanner);
+                    token.type = TOKEN_LPAREN;
+                    push_token(&tokens, token);
+                }
+            } break;
+            case ')': {
+                if (scan_has_lexeme(&scanner)) {
+                    Token token = scan_get_lexeme(&scanner);
+                    token.type = find_primitive_type(token.value);
+                    push_token(&tokens, token);
+                    scanner.lexeme_col_number--;
+                    scanner.lexeme_offset--;
+                }
+                scanner.lexeme_n++;
+                Token token = scan_get_lexeme(&scanner);
+                token.type = TOKEN_RPAREN;
+                push_token(&tokens, token);
+            } break;
+            case EOF: {
+                break;
+            } break;
+            default: {
+                scanner.lexeme_n++;
+            } break;
+        }
+    }
+
+    // Push current lexeme if any.
+    if (scan_has_lexeme(&scanner)) {
+        Token token = scan_get_lexeme(&scanner);
+        token.type = find_primitive_type(token.value);
+        push_token(&tokens, token);
+    }
+
+    // Push EOF token.
+    Token token = scan_get_lexeme(&scanner);
+    token.type = TOKEN_EOF;
+    push_token(&tokens, token);
+
+    return tokens;
+}
diff --git a/src/bootstrap/main.c b/src/bootstrap/main.c
index 662831e..113ee48 100755
--- a/src/bootstrap/main.c
+++ b/src/bootstrap/main.c
@@ -5,10 +5,19 @@
 
 #include "string_view.c"
 #include "read_line.c"
+#include "lexer.c"
 
 void
 process_source(const StringView *source) {
-    sv_write(source, stdout);
+    Tokens tokens = tokenize(source);
+
+    // Print tokens.
+    for (size_t i = 0; i < tokens.size; i++) {
+        Token tok = tokens.buf[i];
+        print_token(tok);
+    }
+
+    free(tokens.buf);
 }
 
 #define REPL_PROMPT "bdl> "
@@ -57,13 +66,13 @@ run_file(char *file_name) {
     fclose(file);
 }
 
-#define STDIN_BUF_SIZE 16
+#define STDIN_BUF_CAP 16
 
 void
 run_stdin(void) {
     size_t buf_size = 0;
-    size_t buf_cap = STDIN_BUF_SIZE;
-    char *source = malloc(sizeof(char) * buf_cap);
+    size_t buf_cap = STDIN_BUF_CAP;
+    char *source = malloc(buf_cap * sizeof(char));
 
     char c;
     while ((c = getchar()) != EOF) {
diff --git a/src/bootstrap/read_line.c b/src/bootstrap/read_line.c
index 7612d05..603bfee 100644
--- a/src/bootstrap/read_line.c
+++ b/src/bootstrap/read_line.c
@@ -29,4 +29,3 @@ read_line(void) {
     };
     return sv;
 }
-
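
For reference, a minimal sketch of how the new tokenizer can be driven, mirroring the updated process_source in main.c. This driver is not part of the commit: it assumes the repository's string_view.c and lexer.c are included unity-build style (as main.c does) and that StringView exposes the .start/.n fields used in the diff above; the sample input and the token listing in the comments are illustrative only.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>

#include "string_view.c"
#include "lexer.c"

int
main(void) {
    // Hypothetical driver: tokenize a small expression and dump the tokens.
    char *src = "(+ 1 2)";
    StringView source = (StringView){ .start = src, .n = strlen(src) };

    Tokens tokens = tokenize(&source);
    for (size_t i = 0; i < tokens.size; i++) {
        // For this input, print_token should emit one line per token:
        //   LINE:   1 COL:   1 TOKEN_LPAREN
        //   LINE:   1 COL:   2 TOKEN_SYMBOL -> +
        //   LINE:   1 COL:   4 TOKEN_FIXNUM -> 1
        //   LINE:   1 COL:   6 TOKEN_FIXNUM -> 2
        //   LINE:   1 COL:   7 TOKEN_RPAREN
        //   LINE:   1 COL:   8 TOKEN_EOF
        print_token(tokens.buf[i]);
    }

    free(tokens.buf);
    return 0;
}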