diff options
Diffstat (limited to 'src/bootstrap/lexer.c')
-rw-r--r-- | src/bootstrap/lexer.c | 215 |
1 file changed, 104 insertions, 111 deletions
diff --git a/src/bootstrap/lexer.c b/src/bootstrap/lexer.c index fc53d3c..1add4dc 100644 --- a/src/bootstrap/lexer.c +++ b/src/bootstrap/lexer.c | |||
@@ -85,24 +85,14 @@ push_token(Tokens *tokens, Token tok) { | |||
85 | } | 85 | } |
86 | 86 | ||
/*
 * Cursor state used while tokenizing a source buffer.
 *
 * `current` is a view that shrinks from the front as characters are
 * consumed; the position counters describe the NEXT unread character.
 */
typedef struct Scanner {
	StringView current;  /* Unconsumed remainder of the input. */
	size_t line_number;  /* 1-based line of the next character. */
	size_t col_number;   /* 1-based column of the next character. */
	size_t offset;       /* Byte offset into the original buffer; used to
	                      * slice token text out of the caller's input. */
} Scanner;
98 | 93 | ||
99 | char | 94 | char |
100 | scan_next(Scanner *scanner) { | 95 | scan_next(Scanner *scanner) { |
101 | if (scanner->lexeme_n == 0) { | ||
102 | scanner->lexeme_line_number = scanner->line_number; | ||
103 | scanner->lexeme_col_number = scanner->col_number; | ||
104 | scanner->lexeme_offset = scanner->offset; | ||
105 | } | ||
106 | char c = sv_next(&scanner->current); | 96 | char c = sv_next(&scanner->current); |
107 | if (c == '\n') { | 97 | if (c == '\n') { |
108 | scanner->line_number++; | 98 | scanner->line_number++; |
@@ -124,27 +114,24 @@ scan_has_next(const Scanner *scanner) { | |||
124 | return scanner->current.n != 0; | 114 | return scanner->current.n != 0; |
125 | } | 115 | } |
126 | 116 | ||
127 | bool | 117 | void |
128 | scan_has_lexeme(const Scanner * scanner) { | 118 | skip_whitespace(Scanner *scanner) { |
129 | return scanner->lexeme_n != 0; | 119 | while (scan_has_next(scanner)) { |
130 | } | 120 | char c = scan_peek(scanner); |
131 | 121 | switch (c) { | |
132 | Token | 122 | case ' ': |
133 | scan_get_lexeme(Scanner * scanner) { | 123 | case '\f': |
134 | Token token = (Token){ | 124 | case '\n': |
135 | .type = TOKEN_UNKNOWN, | 125 | case '\r': |
136 | .value = (StringView){ | 126 | case '\t': |
137 | .start = &scanner->orig.start[scanner->lexeme_offset], | 127 | case '\v': { |
138 | .n = scanner->lexeme_n, | 128 | scan_next(scanner); |
139 | }, | 129 | } break; |
140 | .line = scanner->lexeme_line_number, | 130 | default: { |
141 | .column = scanner->lexeme_col_number, | 131 | return; |
142 | }; | 132 | } break; |
143 | scanner->lexeme_n = 0; | 133 | } |
144 | scanner->lexeme_line_number = scanner->line_number; | 134 | } |
145 | scanner->lexeme_col_number = scanner->col_number; | ||
146 | scanner->lexeme_offset = scanner->offset; | ||
147 | return token; | ||
148 | } | 135 | } |
149 | 136 | ||
150 | TokenType | 137 | TokenType |
@@ -172,52 +159,51 @@ find_primitive_type(StringView value) { | |||
172 | return TOKEN_SYMBOL; | 159 | return TOKEN_SYMBOL; |
173 | } | 160 | } |
174 | 161 | ||
/*
 * Return true when `c` terminates a symbol lexeme: end-of-input
 * sentinels, comment start, string/quote delimiters, parentheses,
 * or ASCII whitespace.
 *
 * NOTE(review): comparing a char against EOF (an int, typically -1)
 * only matches when char is signed and the byte is 0xFF — confirm
 * sv_next() actually yields EOF, otherwise this arm is dead.
 */
bool
is_delimiter(char c) {
	if (c == EOF || c == '\0') {
		return true;
	}
	if (c == ';' || c == '"' || c == '\'' || c == '(' || c == ')') {
		return true;
	}
	return c == ' ' || c == '\f' || c == '\n'
	    || c == '\r' || c == '\t' || c == '\v';
}
183 | |||
175 | Tokens | 184 | Tokens |
176 | tokenize(const StringView *sv) { | 185 | tokenize(const StringView *sv) { |
177 | Tokens tokens = (Tokens){0}; | 186 | Tokens tokens = (Tokens){0}; |
178 | Scanner scanner = (Scanner){ | 187 | Scanner scanner = (Scanner){ |
179 | .orig = *sv, | ||
180 | .current = *sv, | 188 | .current = *sv, |
181 | .line_number = 1, | 189 | .line_number = 1, |
182 | .col_number = 1, | 190 | .col_number = 1, |
183 | .lexeme_line_number = 1, | ||
184 | .lexeme_col_number = 1, | ||
185 | }; | 191 | }; |
186 | 192 | ||
187 | while (scan_has_next(&scanner)) { | 193 | while (scan_has_next(&scanner)) { |
194 | skip_whitespace(&scanner); | ||
195 | size_t line = scanner.line_number; | ||
196 | size_t col = scanner.col_number; | ||
197 | size_t offset = scanner.offset; | ||
188 | char c = scan_next(&scanner); | 198 | char c = scan_next(&scanner); |
189 | switch (c) { | 199 | switch (c) { |
190 | case ' ': | ||
191 | case '\f': | ||
192 | case '\n': | ||
193 | case '\r': | ||
194 | case '\t': | ||
195 | case '\v': { | ||
196 | if (scan_has_lexeme(&scanner)) { | ||
197 | Token token = scan_get_lexeme(&scanner); | ||
198 | token.type = find_primitive_type(token.value); | ||
199 | push_token(&tokens, token); | ||
200 | } | ||
201 | } break; | ||
202 | case ';': { | 200 | case ';': { |
203 | if (scan_has_lexeme(&scanner)) { | ||
204 | Token token = scan_get_lexeme(&scanner); | ||
205 | token.type = find_primitive_type(token.value); | ||
206 | push_token(&tokens, token); | ||
207 | } | ||
208 | while ((c = scan_next(&scanner)) != '\n' && c != '\0') {} | 201 | while ((c = scan_next(&scanner)) != '\n' && c != '\0') {} |
209 | } break; | 202 | } break; |
210 | case '"': { | 203 | case '"': { |
211 | if (scan_has_lexeme(&scanner)) { | ||
212 | Token token = scan_get_lexeme(&scanner); | ||
213 | token.type = find_primitive_type(token.value); | ||
214 | push_token(&tokens, token); | ||
215 | scanner.lexeme_col_number--; | ||
216 | scanner.lexeme_offset--; | ||
217 | } | ||
218 | |||
219 | char prev = c; | 204 | char prev = c; |
220 | bool found = false; | 205 | bool found = false; |
206 | size_t n = 0; | ||
221 | while (scan_has_next(&scanner)) { | 207 | while (scan_has_next(&scanner)) { |
222 | c = scan_next(&scanner); | 208 | c = scan_next(&scanner); |
223 | if (c == '"' && prev != '\\') { | 209 | if (c == '"' && prev != '\\') { |
@@ -225,82 +211,89 @@ tokenize(const StringView *sv) { | |||
225 | break; | 211 | break; |
226 | } | 212 | } |
227 | prev = c; | 213 | prev = c; |
228 | scanner.lexeme_n++; | 214 | n++; |
229 | } | 215 | } |
230 | scanner.lexeme_col_number--; | 216 | if (!found) { |
231 | if (found) { | ||
232 | Token token = scan_get_lexeme(&scanner); | ||
233 | token.type = TOKEN_STRING; | ||
234 | push_token(&tokens, token); | ||
235 | } else { | ||
236 | // TODO: Report error: couldn't find the closing quotes. | 217 | // TODO: Report error: couldn't find the closing quotes. |
237 | } | 218 | } |
219 | Token token = (Token){ | ||
220 | .value = (StringView){ | ||
221 | .start = &sv->start[offset + 1], | ||
222 | .n = n, | ||
223 | }, | ||
224 | .type = TOKEN_STRING, | ||
225 | .line = line, | ||
226 | .column = col, | ||
227 | }; | ||
228 | push_token(&tokens, token); | ||
238 | } break; | 229 | } break; |
239 | case '\'': { | 230 | case '\'': { |
240 | if (scan_has_lexeme(&scanner)) { | 231 | Token token = (Token){ |
241 | Token token = scan_get_lexeme(&scanner); | 232 | .type = TOKEN_QUOTE, |
242 | token.type = find_primitive_type(token.value); | 233 | .line = line, |
243 | push_token(&tokens, token); | 234 | .column = col, |
244 | scanner.lexeme_col_number--; | 235 | }; |
245 | scanner.lexeme_offset--; | ||
246 | } | ||
247 | Token token = scan_get_lexeme(&scanner); | ||
248 | token.type = TOKEN_QUOTE; | ||
249 | push_token(&tokens, token); | 236 | push_token(&tokens, token); |
250 | } break; | 237 | } break; |
251 | case '(': { | 238 | case '(': { |
252 | if (scan_has_lexeme(&scanner)) { | ||
253 | Token token = scan_get_lexeme(&scanner); | ||
254 | token.type = find_primitive_type(token.value); | ||
255 | push_token(&tokens, token); | ||
256 | scanner.lexeme_col_number--; | ||
257 | scanner.lexeme_offset--; | ||
258 | } | ||
259 | scanner.lexeme_n++; | ||
260 | if (scan_peek(&scanner) == ')') { | 239 | if (scan_peek(&scanner) == ')') { |
261 | scanner.lexeme_n++; | ||
262 | scan_next(&scanner); | 240 | scan_next(&scanner); |
263 | Token token = scan_get_lexeme(&scanner); | 241 | Token token = (Token){ |
264 | token.type = TOKEN_NIL; | 242 | .type = TOKEN_NIL, |
243 | .line = line, | ||
244 | .column = col, | ||
245 | }; | ||
265 | push_token(&tokens, token); | 246 | push_token(&tokens, token); |
266 | } else { | 247 | } else { |
267 | Token token = scan_get_lexeme(&scanner); | 248 | Token token = (Token){ |
268 | token.type = TOKEN_LPAREN; | 249 | .type = TOKEN_LPAREN, |
250 | .line = line, | ||
251 | .column = col, | ||
252 | }; | ||
269 | push_token(&tokens, token); | 253 | push_token(&tokens, token); |
270 | } | 254 | } |
271 | } break; | 255 | } break; |
272 | case ')': { | 256 | case ')': { |
273 | if (scan_has_lexeme(&scanner)) { | 257 | Token token = (Token){ |
274 | Token token = scan_get_lexeme(&scanner); | 258 | .type = TOKEN_RPAREN, |
275 | token.type = find_primitive_type(token.value); | 259 | .line = line, |
276 | push_token(&tokens, token); | 260 | .column = col, |
277 | scanner.lexeme_col_number--; | 261 | }; |
278 | scanner.lexeme_offset--; | ||
279 | } | ||
280 | scanner.lexeme_n++; | ||
281 | Token token = scan_get_lexeme(&scanner); | ||
282 | token.type = TOKEN_RPAREN; | ||
283 | push_token(&tokens, token); | 262 | push_token(&tokens, token); |
284 | } break; | 263 | } break; |
285 | case EOF: { | ||
286 | break; | ||
287 | } break; | ||
288 | default: { | 264 | default: { |
289 | scanner.lexeme_n++; | 265 | size_t n = 1; |
266 | while (scan_has_next(&scanner)) { | ||
267 | c = scan_next(&scanner); | ||
268 | if (is_delimiter(c)) { | ||
269 | break; | ||
270 | } | ||
271 | n++; | ||
272 | } | ||
273 | if (c == EOF || c == '\0') { | ||
274 | break; | ||
275 | } | ||
276 | Token token = (Token){ | ||
277 | .value = (StringView){ | ||
278 | .start = &sv->start[offset], | ||
279 | .n = n, | ||
280 | }, | ||
281 | .type = TOKEN_SYMBOL, | ||
282 | .line = line, | ||
283 | .column = col, | ||
284 | }; | ||
285 | token.type = find_primitive_type(token.value); | ||
286 | push_token(&tokens, token); | ||
290 | } break; | 287 | } break; |
291 | } | 288 | } |
292 | } | 289 | } |
293 | 290 | ||
294 | // Push current lexeme if any. | ||
295 | if (scan_has_lexeme(&scanner)) { | ||
296 | Token token = scan_get_lexeme(&scanner); | ||
297 | token.type = find_primitive_type(token.value); | ||
298 | push_token(&tokens, token); | ||
299 | } | ||
300 | |||
301 | // Push EOF token. | 291 | // Push EOF token. |
302 | Token token = scan_get_lexeme(&scanner); | 292 | Token token = (Token){ |
303 | token.type = TOKEN_EOF; | 293 | .type = TOKEN_EOF, |
294 | .line = scanner.line_number, | ||
295 | .column = 1, | ||
296 | }; | ||
304 | push_token(&tokens, token); | 297 | push_token(&tokens, token); |
305 | 298 | ||
306 | return tokens; | 299 | return tokens; |