diff options
Diffstat (limited to 'src/treewalk/lexer.c')
-rw-r--r-- | src/treewalk/lexer.c | 257 |
1 files changed, 257 insertions, 0 deletions
diff --git a/src/treewalk/lexer.c b/src/treewalk/lexer.c new file mode 100644 index 0000000..38ca37c --- /dev/null +++ b/src/treewalk/lexer.c | |||
@@ -0,0 +1,257 @@ | |||
1 | #include "lexer.h" | ||
2 | |||
3 | void | ||
4 | print_token(Token tok) { | ||
5 | printf("LINE: %3ld COL: %3ld ", tok.line, tok.column); | ||
6 | switch (tok.type) { | ||
7 | case TOKEN_LPAREN: { | ||
8 | printf("TOKEN_LPAREN"); | ||
9 | } break; | ||
10 | case TOKEN_RPAREN: { | ||
11 | printf("TOKEN_RPAREN"); | ||
12 | } break; | ||
13 | case TOKEN_QUOTE: { | ||
14 | printf("TOKEN_QUOTE"); | ||
15 | } break; | ||
16 | case TOKEN_TRUE: { | ||
17 | printf("TOKEN_TRUE"); | ||
18 | } break; | ||
19 | case TOKEN_FALSE: { | ||
20 | printf("TOKEN_FALSE"); | ||
21 | } break; | ||
22 | case TOKEN_NIL: { | ||
23 | printf("TOKEN_NIL"); | ||
24 | } break; | ||
25 | case TOKEN_FIXNUM: { | ||
26 | printf("TOKEN_FIXNUM -> "); | ||
27 | sv_write(&tok.value, stdout); | ||
28 | } break; | ||
29 | case TOKEN_SYMBOL: { | ||
30 | printf("TOKEN_SYMBOL -> "); | ||
31 | sv_write(&tok.value, stdout); | ||
32 | } break; | ||
33 | case TOKEN_STRING: { | ||
34 | printf("TOKEN_STRING -> "); | ||
35 | sv_write(&tok.value, stdout); | ||
36 | } break; | ||
37 | case TOKEN_EOF: { | ||
38 | printf("TOKEN_EOF"); | ||
39 | } break; | ||
40 | case TOKEN_UNKNOWN: { | ||
41 | printf("TOKEN_UNKNOWN"); | ||
42 | } break; | ||
43 | } | ||
44 | printf("\n"); | ||
45 | } | ||
46 | |||
47 | char | ||
48 | scan_next(Scanner *scanner) { | ||
49 | char c = sv_next(&scanner->current); | ||
50 | if (c == '\n') { | ||
51 | scanner->line_number++; | ||
52 | scanner->col_number = 1; | ||
53 | } else { | ||
54 | scanner->col_number++; | ||
55 | } | ||
56 | scanner->offset++; | ||
57 | return c; | ||
58 | } | ||
59 | |||
60 | char | ||
61 | scan_peek(const Scanner *scanner) { | ||
62 | return sv_peek(&scanner->current); | ||
63 | } | ||
64 | |||
65 | bool | ||
66 | scan_has_next(const Scanner *scanner) { | ||
67 | return scanner->current.n != 0; | ||
68 | } | ||
69 | |||
70 | void | ||
71 | skip_whitespace(Scanner *scanner) { | ||
72 | while (scan_has_next(scanner)) { | ||
73 | char c = scan_peek(scanner); | ||
74 | switch (c) { | ||
75 | case ' ': | ||
76 | case '\f': | ||
77 | case '\n': | ||
78 | case '\r': | ||
79 | case '\t': | ||
80 | case '\v': { | ||
81 | scan_next(scanner); | ||
82 | } break; | ||
83 | default: { | ||
84 | return; | ||
85 | } break; | ||
86 | } | ||
87 | } | ||
88 | } | ||
89 | |||
/*
 * Return true when `c` is one of the characters this lexer treats as a
 * delimiter: EOF, NUL, ';', '"', '\'', '(', ')' or a whitespace character
 * (space, form feed, newline, carriage return, tab, vertical tab).
 * Every other character yields false.
 */
bool
is_delimiter(char c) {
    // End-of-input markers.
    if (c == EOF || c == '\0') {
        return true;
    }

    // Punctuation: comment start, string quote, quote mark and parentheses.
    if (c == ';' || c == '"' || c == '\'' || c == '(' || c == ')') {
        return true;
    }

    // Whitespace.
    return c == ' ' || c == '\f' || c == '\n' || c == '\r' || c == '\t' ||
           c == '\v';
}
111 | |||
112 | TokenType | ||
113 | find_primitive_type(const StringView value) { | ||
114 | bool is_fixnum = true; | ||
115 | for (size_t i = 0; i < value.n; i++) { | ||
116 | char c = value.start[i]; | ||
117 | if (i == 0 && c == '-' && value.n > 1) { | ||
118 | continue; | ||
119 | } | ||
120 | if (!(c >= '0' && c <= '9')) { | ||
121 | is_fixnum = false; | ||
122 | break; | ||
123 | } | ||
124 | } | ||
125 | if (is_fixnum) { | ||
126 | return TOKEN_FIXNUM; | ||
127 | } | ||
128 | if (sv_equal(&value, &(StringView){"true", 4})) { | ||
129 | return TOKEN_TRUE; | ||
130 | } | ||
131 | if (sv_equal(&value, &(StringView){"false", 5})) { | ||
132 | return TOKEN_FALSE; | ||
133 | } | ||
134 | return TOKEN_SYMBOL; | ||
135 | } | ||
136 | |||
137 | Token * | ||
138 | tokenize(const StringView *sv) { | ||
139 | Token *tokens = NULL; | ||
140 | array_init(tokens, 1); | ||
141 | Scanner scanner = (Scanner){ | ||
142 | .current = *sv, | ||
143 | .line_number = 1, | ||
144 | .col_number = 1, | ||
145 | }; | ||
146 | |||
147 | while (scan_has_next(&scanner)) { | ||
148 | skip_whitespace(&scanner); | ||
149 | size_t line = scanner.line_number; | ||
150 | size_t col = scanner.col_number; | ||
151 | size_t offset = scanner.offset; | ||
152 | char c = scan_next(&scanner); | ||
153 | switch (c) { | ||
154 | case ';': { | ||
155 | while ((c = scan_next(&scanner)) != '\n' && c != '\0') {} | ||
156 | } break; | ||
157 | case '"': { | ||
158 | char prev = c; | ||
159 | bool found = false; | ||
160 | size_t n = 0; | ||
161 | while (scan_has_next(&scanner)) { | ||
162 | c = scan_next(&scanner); | ||
163 | if (c == '"' && prev != '\\') { | ||
164 | found = true; | ||
165 | break; | ||
166 | } | ||
167 | prev = c; | ||
168 | n++; | ||
169 | } | ||
170 | if (!found) { | ||
171 | error_push((Error){ | ||
172 | .type = ERR_TYPE_LEXER, | ||
173 | .value = ERR_UNMATCHED_STRING, | ||
174 | .line = line, | ||
175 | .col = col, | ||
176 | }); | ||
177 | return tokens; | ||
178 | } | ||
179 | Token token = (Token){ | ||
180 | .value = (StringView){ | ||
181 | .start = &sv->start[offset + 1], | ||
182 | .n = n, | ||
183 | }, | ||
184 | .type = TOKEN_STRING, | ||
185 | .line = line, | ||
186 | .column = col, | ||
187 | }; | ||
188 | array_push(tokens, token); | ||
189 | } break; | ||
190 | case '\'': { | ||
191 | Token token = (Token){ | ||
192 | .type = TOKEN_QUOTE, | ||
193 | .line = line, | ||
194 | .column = col, | ||
195 | }; | ||
196 | array_push(tokens, token); | ||
197 | } break; | ||
198 | case '(': { | ||
199 | if (scan_peek(&scanner) == ')') { | ||
200 | scan_next(&scanner); | ||
201 | Token token = (Token){ | ||
202 | .type = TOKEN_NIL, | ||
203 | .line = line, | ||
204 | .column = col, | ||
205 | }; | ||
206 | array_push(tokens, token); | ||
207 | } else { | ||
208 | Token token = (Token){ | ||
209 | .type = TOKEN_LPAREN, | ||
210 | .line = line, | ||
211 | .column = col, | ||
212 | }; | ||
213 | array_push(tokens, token); | ||
214 | } | ||
215 | } break; | ||
216 | case ')': { | ||
217 | Token token = (Token){ | ||
218 | .type = TOKEN_RPAREN, | ||
219 | .line = line, | ||
220 | .column = col, | ||
221 | }; | ||
222 | array_push(tokens, token); | ||
223 | } break; | ||
224 | default: { | ||
225 | size_t n = 1; | ||
226 | while (!is_delimiter(scan_peek(&scanner))) { | ||
227 | scan_next(&scanner); | ||
228 | n++; | ||
229 | } | ||
230 | if (c == EOF || c == '\0') { | ||
231 | break; | ||
232 | } | ||
233 | Token token = (Token){ | ||
234 | .value = (StringView){ | ||
235 | .start = &sv->start[offset], | ||
236 | .n = n, | ||
237 | }, | ||
238 | .type = TOKEN_SYMBOL, | ||
239 | .line = line, | ||
240 | .column = col, | ||
241 | }; | ||
242 | token.type = find_primitive_type(token.value); | ||
243 | array_push(tokens, token); | ||
244 | } break; | ||
245 | } | ||
246 | } | ||
247 | |||
248 | // Push EOF token. | ||
249 | Token token = (Token){ | ||
250 | .type = TOKEN_EOF, | ||
251 | .line = scanner.line_number, | ||
252 | .column = 1, | ||
253 | }; | ||
254 | array_push(tokens, token); | ||
255 | |||
256 | return tokens; | ||
257 | } | ||