diff options
author | Bad Diode <bd@badd10de.dev> | 2022-02-01 18:36:52 +0100 |
---|---|---|
committer | Bad Diode <bd@badd10de.dev> | 2022-02-01 18:36:52 +0100 |
commit | ee1a5de91c875fb66724dc21c02333bfebe2a812 (patch) | |
tree | d3eaa226816d295bb9dc48a2aed27044832ec413 /src/lexer.c | |
parent | 3156265c7b2da8cc43fee996c0518ea274d39c8a (diff) | |
download | bdl-ee1a5de91c875fb66724dc21c02333bfebe2a812.tar.gz bdl-ee1a5de91c875fb66724dc21c02333bfebe2a812.zip |
Add new syntax to lexer and prepare refactor
Diffstat (limited to 'src/lexer.c')
-rw-r--r-- | src/lexer.c | 224 |
1 file changed, 144 insertions, 80 deletions
diff --git a/src/lexer.c b/src/lexer.c index 09c8f6c..56b670b 100644 --- a/src/lexer.c +++ b/src/lexer.c | |||
@@ -5,7 +5,11 @@ static const char* token_str[] = { | |||
5 | [TOKEN_UNKNOWN] = "TOKEN_UNKNOWN", | 5 | [TOKEN_UNKNOWN] = "TOKEN_UNKNOWN", |
6 | [TOKEN_LPAREN] = "TOKEN_LPAREN", | 6 | [TOKEN_LPAREN] = "TOKEN_LPAREN", |
7 | [TOKEN_RPAREN] = "TOKEN_RPAREN", | 7 | [TOKEN_RPAREN] = "TOKEN_RPAREN", |
8 | [TOKEN_FIXNUM] = "TOKEN_FIXNUM", | 8 | [TOKEN_LSQUARE] = "TOKEN_LSQUARE", |
9 | [TOKEN_RSQUARE] = "TOKEN_RSQUARE", | ||
10 | [TOKEN_LCURLY] = "TOKEN_LCURLY", | ||
11 | [TOKEN_RCURLY] = "TOKEN_RCURLY", | ||
12 | [TOKEN_NUMBER] = "TOKEN_NUMBER", | ||
9 | [TOKEN_SYMBOL] = "TOKEN_SYMBOL", | 13 | [TOKEN_SYMBOL] = "TOKEN_SYMBOL", |
10 | [TOKEN_STRING] = "TOKEN_STRING", | 14 | [TOKEN_STRING] = "TOKEN_STRING", |
11 | [TOKEN_NIL] = "TOKEN_NIL", | 15 | [TOKEN_NIL] = "TOKEN_NIL", |
@@ -16,6 +20,10 @@ static const char* token_str[] = { | |||
16 | [TOKEN_DEF] = "TOKEN_DEF", | 20 | [TOKEN_DEF] = "TOKEN_DEF", |
17 | [TOKEN_SET] = "TOKEN_SET", | 21 | [TOKEN_SET] = "TOKEN_SET", |
18 | [TOKEN_FUN] = "TOKEN_FUN", | 22 | [TOKEN_FUN] = "TOKEN_FUN", |
23 | [TOKEN_STRUCT] = "TOKEN_STRUCT", | ||
24 | [TOKEN_COLON] = "TOKEN_COLON", | ||
25 | [TOKEN_DOT] = "TOKEN_DOT", | ||
26 | [TOKEN_AT] = "TOKEN_AT", | ||
19 | [TOKEN_EOF] = "TOKEN_EOF", | 27 | [TOKEN_EOF] = "TOKEN_EOF", |
20 | }; | 28 | }; |
21 | 29 | ||
@@ -24,14 +32,8 @@ print_token(Token tok) { | |||
24 | printf("[%4ld:%-4ld] ", tok.line, tok.col); | 32 | printf("[%4ld:%-4ld] ", tok.line, tok.col); |
25 | printf("%s", token_str[tok.type]); | 33 | printf("%s", token_str[tok.type]); |
26 | switch (tok.type) { | 34 | switch (tok.type) { |
27 | case TOKEN_FIXNUM: { | 35 | case TOKEN_NUMBER: |
28 | printf(" -> "); | 36 | case TOKEN_SYMBOL: |
29 | sv_write(&tok.value); | ||
30 | } break; | ||
31 | case TOKEN_SYMBOL: { | ||
32 | printf(" -> "); | ||
33 | sv_write(&tok.value); | ||
34 | } break; | ||
35 | case TOKEN_STRING: { | 37 | case TOKEN_STRING: { |
36 | printf(" -> "); | 38 | printf(" -> "); |
37 | sv_write(&tok.value); | 39 | sv_write(&tok.value); |
@@ -55,6 +57,12 @@ scan_next(Scanner *scanner) { | |||
55 | return c; | 57 | return c; |
56 | } | 58 | } |
57 | 59 | ||
60 | void | ||
61 | scan_rewind(Scanner *scanner) { | ||
62 | sv_rewind(&scanner->current); | ||
63 | scanner->offset--; | ||
64 | } | ||
65 | |||
58 | char | 66 | char |
59 | scan_peek(const Scanner *scanner) { | 67 | scan_peek(const Scanner *scanner) { |
60 | return sv_peek(&scanner->current); | 68 | return sv_peek(&scanner->current); |
@@ -95,6 +103,12 @@ is_delimiter(char c) { | |||
95 | case '\'': | 103 | case '\'': |
96 | case '(': | 104 | case '(': |
97 | case ')': | 105 | case ')': |
106 | case '[': | ||
107 | case ']': | ||
108 | case '{': | ||
109 | case '}': | ||
110 | case ':': | ||
111 | case '@': | ||
98 | case ' ': | 112 | case ' ': |
99 | case '\f': | 113 | case '\f': |
100 | case '\n': | 114 | case '\n': |
@@ -110,22 +124,65 @@ is_delimiter(char c) { | |||
110 | #define TOKEN_IS_KEYWORD(VAL, KEYWORD) \ | 124 | #define TOKEN_IS_KEYWORD(VAL, KEYWORD) \ |
111 | sv_equal(&(VAL), &(StringView){(KEYWORD), sizeof(KEYWORD) - 1}) | 125 | sv_equal(&(VAL), &(StringView){(KEYWORD), sizeof(KEYWORD) - 1}) |
112 | 126 | ||
113 | TokenType | 127 | size_t |
114 | find_primitive_type(const StringView value) { | 128 | scan_number_token(Scanner *scanner) { |
115 | bool is_fixnum = true; | 129 | char first = scan_next(scanner); |
116 | for (size_t i = 0; i < value.n; i++) { | 130 | char second = scan_peek(scanner); |
117 | char c = value.start[i]; | 131 | size_t n = 1; |
118 | if (i == 0 && c == '-' && value.n > 1) { | 132 | if (first == '0' && !is_delimiter(second)) { |
119 | continue; | 133 | if (second == 'x') { |
120 | } | 134 | // Hex constant. |
121 | if (!(c >= '0' && c <= '9')) { | 135 | scan_next(scanner); |
122 | is_fixnum = false; | 136 | n++; |
123 | break; | 137 | if (is_delimiter(scan_peek(scanner))) { |
138 | return 0; | ||
139 | } | ||
140 | while (!is_delimiter(scan_peek(scanner))) { | ||
141 | char c = scan_next(scanner); | ||
142 | if (!(c >= '0' && c <= '9') && | ||
143 | !(c >= 'a' && c <= 'f') && | ||
144 | !(c >= 'A' && c <= 'F')) { | ||
145 | return 0; | ||
146 | } | ||
147 | n++; | ||
148 | } | ||
149 | return n; | ||
150 | } else if (second == 'b') { | ||
151 | // Binary constant. | ||
152 | scan_next(scanner); | ||
153 | n++; | ||
154 | if (is_delimiter(scan_peek(scanner))) { | ||
155 | return 0; | ||
156 | } | ||
157 | while (!is_delimiter(scan_peek(scanner))) { | ||
158 | char c = scan_next(scanner); | ||
159 | if (!(c == '0' || c == '1')) { | ||
160 | return 0; | ||
161 | } | ||
162 | n++; | ||
163 | } | ||
124 | } | 164 | } |
125 | } | 165 | } |
126 | if (is_fixnum) { | 166 | |
127 | return TOKEN_FIXNUM; | 167 | // Decimal number or floating point. |
168 | bool has_dot = false; | ||
169 | while (!is_delimiter(scan_peek(scanner))) { | ||
170 | char c = scan_next(scanner); | ||
171 | if (c == '.') { | ||
172 | if (has_dot) { | ||
173 | return 0; | ||
174 | } | ||
175 | has_dot = true; | ||
176 | } else if (!(c >= '0' && c <= '9')) { | ||
177 | return 0; | ||
178 | } | ||
179 | n++; | ||
128 | } | 180 | } |
181 | return n; | ||
182 | } | ||
183 | |||
184 | TokenType | ||
185 | find_token_type(const StringView value) { | ||
129 | if (TOKEN_IS_KEYWORD(value, "nil")) { return TOKEN_NIL; } | 186 | if (TOKEN_IS_KEYWORD(value, "nil")) { return TOKEN_NIL; } |
130 | if (TOKEN_IS_KEYWORD(value, "true")) { return TOKEN_TRUE; } | 187 | if (TOKEN_IS_KEYWORD(value, "true")) { return TOKEN_TRUE; } |
131 | if (TOKEN_IS_KEYWORD(value, "false")) { return TOKEN_FALSE; } | 188 | if (TOKEN_IS_KEYWORD(value, "false")) { return TOKEN_FALSE; } |
@@ -134,12 +191,20 @@ find_primitive_type(const StringView value) { | |||
134 | if (TOKEN_IS_KEYWORD(value, "def")) { return TOKEN_DEF; } | 191 | if (TOKEN_IS_KEYWORD(value, "def")) { return TOKEN_DEF; } |
135 | if (TOKEN_IS_KEYWORD(value, "set!")) { return TOKEN_SET; } | 192 | if (TOKEN_IS_KEYWORD(value, "set!")) { return TOKEN_SET; } |
136 | if (TOKEN_IS_KEYWORD(value, "fun")) { return TOKEN_FUN; } | 193 | if (TOKEN_IS_KEYWORD(value, "fun")) { return TOKEN_FUN; } |
194 | if (TOKEN_IS_KEYWORD(value, "struct")) { return TOKEN_STRUCT; } | ||
137 | 195 | ||
138 | return TOKEN_SYMBOL; | 196 | return TOKEN_SYMBOL; |
139 | } | 197 | } |
140 | 198 | ||
199 | void | ||
200 | print_tokens(Token *tokens) { | ||
201 | for (size_t i = 0; i < array_size(tokens); i++) { | ||
202 | print_token(tokens[i]); | ||
203 | } | ||
204 | } | ||
205 | |||
141 | Token * | 206 | Token * |
142 | tokenize(const StringView *sv, Errors *errors) { | 207 | tokenize(const StringView *sv) { |
143 | Token *tokens = NULL; | 208 | Token *tokens = NULL; |
144 | array_init(tokens, 1); | 209 | array_init(tokens, 1); |
145 | Scanner scanner = (Scanner){ | 210 | Scanner scanner = (Scanner){ |
@@ -153,10 +218,16 @@ tokenize(const StringView *sv, Errors *errors) { | |||
153 | size_t line = scanner.line_number; | 218 | size_t line = scanner.line_number; |
154 | size_t col = scanner.col_number; | 219 | size_t col = scanner.col_number; |
155 | size_t offset = scanner.offset; | 220 | size_t offset = scanner.offset; |
221 | Token token = (Token){ | ||
222 | .type = TOKEN_UNKNOWN, | ||
223 | .line = line, | ||
224 | .col = col, | ||
225 | }; | ||
156 | char c = scan_next(&scanner); | 226 | char c = scan_next(&scanner); |
157 | switch (c) { | 227 | switch (c) { |
158 | case ';': { | 228 | case ';': { |
159 | while ((c = scan_next(&scanner)) != '\n' && c != '\0') {} | 229 | while ((c = scan_next(&scanner)) != '\n' && c != '\0') {} |
230 | continue; | ||
160 | } break; | 231 | } break; |
161 | case '"': { | 232 | case '"': { |
162 | char prev = c; | 233 | char prev = c; |
@@ -172,73 +243,66 @@ tokenize(const StringView *sv, Errors *errors) { | |||
172 | n++; | 243 | n++; |
173 | } | 244 | } |
174 | if (!found) { | 245 | if (!found) { |
175 | error_push(errors, (Error){ | 246 | push_error(ERR_TYPE_LEXER, ERR_UNMATCHED_STRING, line, col); |
176 | .type = ERR_TYPE_LEXER, | ||
177 | .value = ERR_UNMATCHED_STRING, | ||
178 | .line = line, | ||
179 | .col = col, | ||
180 | }); | ||
181 | return tokens; | 247 | return tokens; |
182 | } | 248 | } |
183 | Token token = (Token){ | 249 | token.value = (StringView){ |
184 | .value = (StringView){ | 250 | .start = &sv->start[offset + 1], |
185 | .start = &sv->start[offset + 1], | 251 | .n = n, |
186 | .n = n, | ||
187 | }, | ||
188 | .type = TOKEN_STRING, | ||
189 | .line = line, | ||
190 | .col = col, | ||
191 | }; | ||
192 | array_push(tokens, token); | ||
193 | } break; | ||
194 | case '(': { | ||
195 | if (scan_peek(&scanner) == ')') { | ||
196 | scan_next(&scanner); | ||
197 | Token token = (Token){ | ||
198 | .type = TOKEN_NIL, | ||
199 | .line = line, | ||
200 | .col = col, | ||
201 | }; | ||
202 | array_push(tokens, token); | ||
203 | } else { | ||
204 | Token token = (Token){ | ||
205 | .type = TOKEN_LPAREN, | ||
206 | .line = line, | ||
207 | .col = col, | ||
208 | }; | ||
209 | array_push(tokens, token); | ||
210 | } | ||
211 | } break; | ||
212 | case ')': { | ||
213 | Token token = (Token){ | ||
214 | .type = TOKEN_RPAREN, | ||
215 | .line = line, | ||
216 | .col = col, | ||
217 | }; | 252 | }; |
218 | array_push(tokens, token); | 253 | token.type = TOKEN_STRING; |
219 | } break; | 254 | } break; |
255 | case '(': { token.type = TOKEN_LPAREN; } break; | ||
256 | case ')': { token.type = TOKEN_RPAREN; } break; | ||
257 | case '[': { token.type = TOKEN_LSQUARE; } break; | ||
258 | case ']': { token.type = TOKEN_RSQUARE; } break; | ||
259 | case '{': { token.type = TOKEN_LCURLY; } break; | ||
260 | case '}': { token.type = TOKEN_RCURLY; } break; | ||
261 | case ':': { token.type = TOKEN_COLON; } break; | ||
262 | case '.': { token.type = TOKEN_DOT; } break; | ||
263 | case '@': { token.type = TOKEN_AT; } break; | ||
220 | default: { | 264 | default: { |
221 | size_t n = 1; | 265 | size_t n = 1; |
222 | while (!is_delimiter(scan_peek(&scanner))) { | 266 | if (c == '-' && !is_delimiter(scan_peek(&scanner))) { |
223 | scan_next(&scanner); | 267 | n += scan_number_token(&scanner); |
224 | n++; | 268 | token.value = (StringView){ |
225 | } | ||
226 | if (c == EOF || c == '\0') { | ||
227 | break; | ||
228 | } | ||
229 | Token token = (Token){ | ||
230 | .value = (StringView){ | ||
231 | .start = &sv->start[offset], | 269 | .start = &sv->start[offset], |
232 | .n = n, | 270 | .n = n, |
233 | }, | 271 | }; |
234 | .type = TOKEN_SYMBOL, | 272 | token.type = TOKEN_NUMBER; |
235 | .line = line, | 273 | } else if (c >= '0' && c <= '9') { |
236 | .col = col, | 274 | scan_rewind(&scanner); |
237 | }; | 275 | n = scan_number_token(&scanner); |
238 | token.type = find_primitive_type(token.value); | 276 | if (n == 0) { |
239 | array_push(tokens, token); | 277 | push_error(ERR_TYPE_LEXER, ERR_MALFORMED_NUMBER, line, col); |
278 | return tokens; | ||
279 | } | ||
280 | token.value = (StringView){ | ||
281 | .start = &sv->start[offset], | ||
282 | .n = n, | ||
283 | }; | ||
284 | token.type = TOKEN_NUMBER; | ||
285 | } else { | ||
286 | while (!is_delimiter(scan_peek(&scanner))) { | ||
287 | if (scan_peek(&scanner) == '.') { | ||
288 | break; | ||
289 | } | ||
290 | c = scan_next(&scanner); | ||
291 | n++; | ||
292 | } | ||
293 | token.value = (StringView){ | ||
294 | .start = &sv->start[offset], | ||
295 | .n = n, | ||
296 | }; | ||
297 | token.type = find_token_type(token.value); | ||
298 | } | ||
240 | } break; | 299 | } break; |
241 | } | 300 | } |
301 | if (token.type == TOKEN_UNKNOWN) { | ||
302 | push_error(ERR_TYPE_LEXER, ERR_UNKNOWN_TOK_TYPE, line, col); | ||
303 | return tokens; | ||
304 | } | ||
305 | array_push(tokens, token); | ||
242 | } | 306 | } |
243 | 307 | ||
244 | // Push EOF token. | 308 | // Push EOF token. |