diff options
-rw-r--r-- | src/bootstrap/lexer.c | 307 | ||||
-rwxr-xr-x | src/bootstrap/main.c | 17 | ||||
-rw-r--r-- | src/bootstrap/read_line.c | 1 |
3 files changed, 320 insertions, 5 deletions
diff --git a/src/bootstrap/lexer.c b/src/bootstrap/lexer.c new file mode 100644 index 0000000..fc53d3c --- /dev/null +++ b/src/bootstrap/lexer.c | |||
@@ -0,0 +1,307 @@ | |||
// Kinds of token produced by the lexer. The numeric values are spelled out
// explicitly; TOKEN_UNKNOWN is zero so that a zero-initialized Token has an
// unknown type.
typedef enum TokenType {
    TOKEN_UNKNOWN = 0, // Type not determined (default set by scan_get_lexeme).
    TOKEN_LPAREN  = 1, // "("
    TOKEN_RPAREN  = 2, // ")"
    TOKEN_QUOTE   = 3, // "'"
    TOKEN_TRUE    = 4, // The literal `true`.
    TOKEN_FALSE   = 5, // The literal `false`.
    TOKEN_NIL     = 6, // "()" with nothing in between.
    TOKEN_FIXNUM  = 7, // Decimal digits with an optional leading '-'.
    TOKEN_SYMBOL  = 8, // Any other bare lexeme.
    TOKEN_STRING  = 9, // Text enclosed in double quotes.
    TOKEN_EOF     = 10, // Marks the end of the token stream.
} TokenType;
14 | |||
// A single token together with the position where its lexeme starts.
typedef struct Token {
    // Kind of token.
    TokenType type;
    // View of the token text. It points into the source being scanned (see
    // scan_get_lexeme), it is not a copy.
    StringView value;
    // Line on which the lexeme starts (tokenize starts counting at 1).
    size_t line;
    // Column at which the lexeme starts (tokenize starts counting at 1).
    size_t column;
} Token;
21 | |||
// Growable array of tokens. A zero-initialized Tokens is a valid empty list:
// push_token allocates the buffer on first use. The buffer is heap allocated
// and must be released by the owner with free().
typedef struct Tokens {
    // Heap buffer holding the tokens, or NULL before the first push.
    Token *buf;
    // Number of tokens currently stored in buf.
    size_t size;
    // Number of tokens buf has room for.
    size_t cap;
} Tokens;
27 | |||
28 | void | ||
29 | print_token(Token tok) { | ||
30 | printf("LINE: %3ld COL: %3ld ", tok.line, tok.column); | ||
31 | switch (tok.type) { | ||
32 | case TOKEN_LPAREN: { | ||
33 | printf("TOKEN_LPAREN"); | ||
34 | } break; | ||
35 | case TOKEN_RPAREN: { | ||
36 | printf("TOKEN_RPAREN"); | ||
37 | } break; | ||
38 | case TOKEN_QUOTE: { | ||
39 | printf("TOKEN_QUOTE"); | ||
40 | } break; | ||
41 | case TOKEN_TRUE: { | ||
42 | printf("TOKEN_TRUE"); | ||
43 | } break; | ||
44 | case TOKEN_FALSE: { | ||
45 | printf("TOKEN_FALSE"); | ||
46 | } break; | ||
47 | case TOKEN_NIL: { | ||
48 | printf("TOKEN_NIL"); | ||
49 | } break; | ||
50 | case TOKEN_FIXNUM: { | ||
51 | printf("TOKEN_FIXNUM -> "); | ||
52 | sv_write(&tok.value, stdout); | ||
53 | } break; | ||
54 | case TOKEN_SYMBOL: { | ||
55 | printf("TOKEN_SYMBOL -> "); | ||
56 | sv_write(&tok.value, stdout); | ||
57 | } break; | ||
58 | case TOKEN_STRING: { | ||
59 | printf("TOKEN_STRING -> "); | ||
60 | sv_write(&tok.value, stdout); | ||
61 | } break; | ||
62 | case TOKEN_EOF: { | ||
63 | printf("TOKEN_EOF"); | ||
64 | } break; | ||
65 | case TOKEN_UNKNOWN: { | ||
66 | printf("TOKEN_UNKNOWN"); | ||
67 | } break; | ||
68 | } | ||
69 | printf("\n"); | ||
70 | } | ||
71 | |||
72 | #define TOK_BUF_CAP 256 | ||
73 | |||
74 | void | ||
75 | push_token(Tokens *tokens, Token tok) { | ||
76 | if (tokens->buf == NULL) { | ||
77 | tokens->size = 0; | ||
78 | tokens->cap = TOK_BUF_CAP; | ||
79 | tokens->buf = malloc(tokens->cap * sizeof(Token)); | ||
80 | } else if (tokens->size == tokens->cap) { | ||
81 | tokens->cap *= 2; | ||
82 | tokens->buf = realloc(tokens->buf, tokens->cap * sizeof(Token)); | ||
83 | } | ||
84 | tokens->buf[tokens->size++] = tok; | ||
85 | } | ||
86 | |||
// Cursor over a source text that also tracks the lexeme currently being
// accumulated by the tokenizer.
typedef struct Scanner {
    // The complete input; lexeme views are built from its `start` pointer.
    StringView orig;
    // View handed to sv_next/sv_peek; scan_has_next reports whether its
    // length is non-zero. Presumably the not yet consumed rest of the input
    // (depends on sv_next, which is defined elsewhere).
    StringView current;
    // Line of the next character to read (tokenize starts at 1).
    size_t line_number;
    // Column of the next character to read (reset to 1 after each '\n').
    size_t col_number;
    // Number of characters read so far with scan_next.
    size_t offset;
    // Length of the pending lexeme; zero means no lexeme is pending.
    size_t lexeme_n;
    // Index into `orig` where the pending lexeme starts.
    size_t lexeme_offset;
    // Line on which the pending lexeme starts.
    size_t lexeme_line_number;
    // Column at which the pending lexeme starts.
    size_t lexeme_col_number;
} Scanner;
98 | |||
99 | char | ||
100 | scan_next(Scanner *scanner) { | ||
101 | if (scanner->lexeme_n == 0) { | ||
102 | scanner->lexeme_line_number = scanner->line_number; | ||
103 | scanner->lexeme_col_number = scanner->col_number; | ||
104 | scanner->lexeme_offset = scanner->offset; | ||
105 | } | ||
106 | char c = sv_next(&scanner->current); | ||
107 | if (c == '\n') { | ||
108 | scanner->line_number++; | ||
109 | scanner->col_number = 1; | ||
110 | } else { | ||
111 | scanner->col_number++; | ||
112 | } | ||
113 | scanner->offset++; | ||
114 | return c; | ||
115 | } | ||
116 | |||
117 | char | ||
118 | scan_peek(const Scanner *scanner) { | ||
119 | return sv_peek(&scanner->current); | ||
120 | } | ||
121 | |||
122 | bool | ||
123 | scan_has_next(const Scanner *scanner) { | ||
124 | return scanner->current.n != 0; | ||
125 | } | ||
126 | |||
127 | bool | ||
128 | scan_has_lexeme(const Scanner * scanner) { | ||
129 | return scanner->lexeme_n != 0; | ||
130 | } | ||
131 | |||
132 | Token | ||
133 | scan_get_lexeme(Scanner * scanner) { | ||
134 | Token token = (Token){ | ||
135 | .type = TOKEN_UNKNOWN, | ||
136 | .value = (StringView){ | ||
137 | .start = &scanner->orig.start[scanner->lexeme_offset], | ||
138 | .n = scanner->lexeme_n, | ||
139 | }, | ||
140 | .line = scanner->lexeme_line_number, | ||
141 | .column = scanner->lexeme_col_number, | ||
142 | }; | ||
143 | scanner->lexeme_n = 0; | ||
144 | scanner->lexeme_line_number = scanner->line_number; | ||
145 | scanner->lexeme_col_number = scanner->col_number; | ||
146 | scanner->lexeme_offset = scanner->offset; | ||
147 | return token; | ||
148 | } | ||
149 | |||
150 | TokenType | ||
151 | find_primitive_type(StringView value) { | ||
152 | bool is_fixnum = true; | ||
153 | for (size_t i = 0; i < value.n; i++) { | ||
154 | char c = value.start[i]; | ||
155 | if (i == 0 && c == '-' && value.n > 1) { | ||
156 | continue; | ||
157 | } | ||
158 | if (!(c >= '0' && c <= '9')) { | ||
159 | is_fixnum = false; | ||
160 | break; | ||
161 | } | ||
162 | } | ||
163 | if (is_fixnum) { | ||
164 | return TOKEN_FIXNUM; | ||
165 | } | ||
166 | if (sv_equal(&value, &(StringView){"true", 4})) { | ||
167 | return TOKEN_TRUE; | ||
168 | } | ||
169 | if (sv_equal(&value, &(StringView){"false", 5})) { | ||
170 | return TOKEN_FALSE; | ||
171 | } | ||
172 | return TOKEN_SYMBOL; | ||
173 | } | ||
174 | |||
175 | Tokens | ||
176 | tokenize(const StringView *sv) { | ||
177 | Tokens tokens = (Tokens){0}; | ||
178 | Scanner scanner = (Scanner){ | ||
179 | .orig = *sv, | ||
180 | .current = *sv, | ||
181 | .line_number = 1, | ||
182 | .col_number = 1, | ||
183 | .lexeme_line_number = 1, | ||
184 | .lexeme_col_number = 1, | ||
185 | }; | ||
186 | |||
187 | while (scan_has_next(&scanner)) { | ||
188 | char c = scan_next(&scanner); | ||
189 | switch (c) { | ||
190 | case ' ': | ||
191 | case '\f': | ||
192 | case '\n': | ||
193 | case '\r': | ||
194 | case '\t': | ||
195 | case '\v': { | ||
196 | if (scan_has_lexeme(&scanner)) { | ||
197 | Token token = scan_get_lexeme(&scanner); | ||
198 | token.type = find_primitive_type(token.value); | ||
199 | push_token(&tokens, token); | ||
200 | } | ||
201 | } break; | ||
202 | case ';': { | ||
203 | if (scan_has_lexeme(&scanner)) { | ||
204 | Token token = scan_get_lexeme(&scanner); | ||
205 | token.type = find_primitive_type(token.value); | ||
206 | push_token(&tokens, token); | ||
207 | } | ||
208 | while ((c = scan_next(&scanner)) != '\n' && c != '\0') {} | ||
209 | } break; | ||
210 | case '"': { | ||
211 | if (scan_has_lexeme(&scanner)) { | ||
212 | Token token = scan_get_lexeme(&scanner); | ||
213 | token.type = find_primitive_type(token.value); | ||
214 | push_token(&tokens, token); | ||
215 | scanner.lexeme_col_number--; | ||
216 | scanner.lexeme_offset--; | ||
217 | } | ||
218 | |||
219 | char prev = c; | ||
220 | bool found = false; | ||
221 | while (scan_has_next(&scanner)) { | ||
222 | c = scan_next(&scanner); | ||
223 | if (c == '"' && prev != '\\') { | ||
224 | found = true; | ||
225 | break; | ||
226 | } | ||
227 | prev = c; | ||
228 | scanner.lexeme_n++; | ||
229 | } | ||
230 | scanner.lexeme_col_number--; | ||
231 | if (found) { | ||
232 | Token token = scan_get_lexeme(&scanner); | ||
233 | token.type = TOKEN_STRING; | ||
234 | push_token(&tokens, token); | ||
235 | } else { | ||
236 | // TODO: Report error: couldn't find the closing quotes. | ||
237 | } | ||
238 | } break; | ||
239 | case '\'': { | ||
240 | if (scan_has_lexeme(&scanner)) { | ||
241 | Token token = scan_get_lexeme(&scanner); | ||
242 | token.type = find_primitive_type(token.value); | ||
243 | push_token(&tokens, token); | ||
244 | scanner.lexeme_col_number--; | ||
245 | scanner.lexeme_offset--; | ||
246 | } | ||
247 | Token token = scan_get_lexeme(&scanner); | ||
248 | token.type = TOKEN_QUOTE; | ||
249 | push_token(&tokens, token); | ||
250 | } break; | ||
251 | case '(': { | ||
252 | if (scan_has_lexeme(&scanner)) { | ||
253 | Token token = scan_get_lexeme(&scanner); | ||
254 | token.type = find_primitive_type(token.value); | ||
255 | push_token(&tokens, token); | ||
256 | scanner.lexeme_col_number--; | ||
257 | scanner.lexeme_offset--; | ||
258 | } | ||
259 | scanner.lexeme_n++; | ||
260 | if (scan_peek(&scanner) == ')') { | ||
261 | scanner.lexeme_n++; | ||
262 | scan_next(&scanner); | ||
263 | Token token = scan_get_lexeme(&scanner); | ||
264 | token.type = TOKEN_NIL; | ||
265 | push_token(&tokens, token); | ||
266 | } else { | ||
267 | Token token = scan_get_lexeme(&scanner); | ||
268 | token.type = TOKEN_LPAREN; | ||
269 | push_token(&tokens, token); | ||
270 | } | ||
271 | } break; | ||
272 | case ')': { | ||
273 | if (scan_has_lexeme(&scanner)) { | ||
274 | Token token = scan_get_lexeme(&scanner); | ||
275 | token.type = find_primitive_type(token.value); | ||
276 | push_token(&tokens, token); | ||
277 | scanner.lexeme_col_number--; | ||
278 | scanner.lexeme_offset--; | ||
279 | } | ||
280 | scanner.lexeme_n++; | ||
281 | Token token = scan_get_lexeme(&scanner); | ||
282 | token.type = TOKEN_RPAREN; | ||
283 | push_token(&tokens, token); | ||
284 | } break; | ||
285 | case EOF: { | ||
286 | break; | ||
287 | } break; | ||
288 | default: { | ||
289 | scanner.lexeme_n++; | ||
290 | } break; | ||
291 | } | ||
292 | } | ||
293 | |||
294 | // Push current lexeme if any. | ||
295 | if (scan_has_lexeme(&scanner)) { | ||
296 | Token token = scan_get_lexeme(&scanner); | ||
297 | token.type = find_primitive_type(token.value); | ||
298 | push_token(&tokens, token); | ||
299 | } | ||
300 | |||
301 | // Push EOF token. | ||
302 | Token token = scan_get_lexeme(&scanner); | ||
303 | token.type = TOKEN_EOF; | ||
304 | push_token(&tokens, token); | ||
305 | |||
306 | return tokens; | ||
307 | } | ||
diff --git a/src/bootstrap/main.c b/src/bootstrap/main.c index 662831e..113ee48 100755 --- a/src/bootstrap/main.c +++ b/src/bootstrap/main.c | |||
@@ -5,10 +5,19 @@ | |||
5 | 5 | ||
6 | #include "string_view.c" | 6 | #include "string_view.c" |
7 | #include "read_line.c" | 7 | #include "read_line.c" |
8 | #include "lexer.c" | ||
8 | 9 | ||
9 | void | 10 | void |
10 | process_source(const StringView *source) { | 11 | process_source(const StringView *source) { |
11 | sv_write(source, stdout); | 12 | Tokens tokens = tokenize(source); |
13 | |||
14 | // Print tokens. | ||
15 | for (size_t i = 0; i < tokens.size; i++) { | ||
16 | Token tok = tokens.buf[i]; | ||
17 | print_token(tok); | ||
18 | } | ||
19 | |||
20 | free(tokens.buf); | ||
12 | } | 21 | } |
13 | 22 | ||
14 | #define REPL_PROMPT "bdl> " | 23 | #define REPL_PROMPT "bdl> " |
@@ -57,13 +66,13 @@ run_file(char *file_name) { | |||
57 | fclose(file); | 66 | fclose(file); |
58 | } | 67 | } |
59 | 68 | ||
60 | #define STDIN_BUF_SIZE 16 | 69 | #define STDIN_BUF_CAP 16 |
61 | 70 | ||
62 | void | 71 | void |
63 | run_stdin(void) { | 72 | run_stdin(void) { |
64 | size_t buf_size = 0; | 73 | size_t buf_size = 0; |
65 | size_t buf_cap = STDIN_BUF_SIZE; | 74 | size_t buf_cap = STDIN_BUF_CAP; |
66 | char *source = malloc(sizeof(char) * buf_cap); | 75 | char *source = malloc(buf_cap * sizeof(char)); |
67 | 76 | ||
68 | char c; | 77 | char c; |
69 | while ((c = getchar()) != EOF) { | 78 | while ((c = getchar()) != EOF) { |
diff --git a/src/bootstrap/read_line.c b/src/bootstrap/read_line.c index 7612d05..603bfee 100644 --- a/src/bootstrap/read_line.c +++ b/src/bootstrap/read_line.c | |||
@@ -29,4 +29,3 @@ read_line(void) { | |||
29 | }; | 29 | }; |
30 | return sv; | 30 | return sv; |
31 | } | 31 | } |
32 | |||