aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBad Diode <bd@badd10de.dev>2021-10-11 20:59:05 +0200
committerBad Diode <bd@badd10de.dev>2021-10-11 20:59:05 +0200
commit2627e81de26667b7bc9d88304473e2a234fee7fe (patch)
treeb8cbff397c2bd6d9ab42c5ef7d20182a2ec4b492
parent463690390b45ddd96545ae958e2605e262966c9f (diff)
downloadbdl-2627e81de26667b7bc9d88304473e2a234fee7fe.tar.gz
bdl-2627e81de26667b7bc9d88304473e2a234fee7fe.zip
Simplify lexer code
-rw-r--r--src/bootstrap/lexer.c215
1 file changed, 104 insertions(+), 111 deletions(-)
diff --git a/src/bootstrap/lexer.c b/src/bootstrap/lexer.c
index fc53d3c..1add4dc 100644
--- a/src/bootstrap/lexer.c
+++ b/src/bootstrap/lexer.c
@@ -85,24 +85,14 @@ push_token(Tokens *tokens, Token tok) {
85} 85}
86 86
// Cursor over the input being tokenized. Tracks both the remaining text
// and the human-readable position (line/column) for token metadata.
typedef struct Scanner {
    // Remaining unread input; shrinks as characters are consumed.
    StringView current;
    // 1-based line of the next unread character.
    size_t line_number;
    // 1-based column of the next unread character.
    size_t col_number;
    // Bytes consumed since the start of the original input.
    size_t offset;
} Scanner;
98 93
99char 94char
100scan_next(Scanner *scanner) { 95scan_next(Scanner *scanner) {
101 if (scanner->lexeme_n == 0) {
102 scanner->lexeme_line_number = scanner->line_number;
103 scanner->lexeme_col_number = scanner->col_number;
104 scanner->lexeme_offset = scanner->offset;
105 }
106 char c = sv_next(&scanner->current); 96 char c = sv_next(&scanner->current);
107 if (c == '\n') { 97 if (c == '\n') {
108 scanner->line_number++; 98 scanner->line_number++;
@@ -124,27 +114,24 @@ scan_has_next(const Scanner *scanner) {
124 return scanner->current.n != 0; 114 return scanner->current.n != 0;
125} 115}
126 116
127bool 117void
128scan_has_lexeme(const Scanner * scanner) { 118skip_whitespace(Scanner *scanner) {
129 return scanner->lexeme_n != 0; 119 while (scan_has_next(scanner)) {
130} 120 char c = scan_peek(scanner);
131 121 switch (c) {
132Token 122 case ' ':
133scan_get_lexeme(Scanner * scanner) { 123 case '\f':
134 Token token = (Token){ 124 case '\n':
135 .type = TOKEN_UNKNOWN, 125 case '\r':
136 .value = (StringView){ 126 case '\t':
137 .start = &scanner->orig.start[scanner->lexeme_offset], 127 case '\v': {
138 .n = scanner->lexeme_n, 128 scan_next(scanner);
139 }, 129 } break;
140 .line = scanner->lexeme_line_number, 130 default: {
141 .column = scanner->lexeme_col_number, 131 return;
142 }; 132 } break;
143 scanner->lexeme_n = 0; 133 }
144 scanner->lexeme_line_number = scanner->line_number; 134 }
145 scanner->lexeme_col_number = scanner->col_number;
146 scanner->lexeme_offset = scanner->offset;
147 return token;
148} 135}
149 136
150TokenType 137TokenType
@@ -172,52 +159,51 @@ find_primitive_type(StringView value) {
172 return TOKEN_SYMBOL; 159 return TOKEN_SYMBOL;
173} 160}
174 161
// Return true if `c` terminates a symbol lexeme: end-of-input markers,
// comment/string/quote/paren punctuation, or any whitespace character.
// NOTE(review): comparing a `char` against EOF only behaves as intended
// where `char` is signed — TODO confirm for all target platforms.
bool
is_delimiter(char c) {
    if (c == EOF || c == '\0') {
        return true;
    }
    if (c == ';' || c == '"' || c == '\'' || c == '(' || c == ')') {
        return true;
    }
    return c == ' ' || c == '\f' || c == '\n' ||
           c == '\r' || c == '\t' || c == '\v';
}
183
175Tokens 184Tokens
176tokenize(const StringView *sv) { 185tokenize(const StringView *sv) {
177 Tokens tokens = (Tokens){0}; 186 Tokens tokens = (Tokens){0};
178 Scanner scanner = (Scanner){ 187 Scanner scanner = (Scanner){
179 .orig = *sv,
180 .current = *sv, 188 .current = *sv,
181 .line_number = 1, 189 .line_number = 1,
182 .col_number = 1, 190 .col_number = 1,
183 .lexeme_line_number = 1,
184 .lexeme_col_number = 1,
185 }; 191 };
186 192
187 while (scan_has_next(&scanner)) { 193 while (scan_has_next(&scanner)) {
194 skip_whitespace(&scanner);
195 size_t line = scanner.line_number;
196 size_t col = scanner.col_number;
197 size_t offset = scanner.offset;
188 char c = scan_next(&scanner); 198 char c = scan_next(&scanner);
189 switch (c) { 199 switch (c) {
190 case ' ':
191 case '\f':
192 case '\n':
193 case '\r':
194 case '\t':
195 case '\v': {
196 if (scan_has_lexeme(&scanner)) {
197 Token token = scan_get_lexeme(&scanner);
198 token.type = find_primitive_type(token.value);
199 push_token(&tokens, token);
200 }
201 } break;
202 case ';': { 200 case ';': {
203 if (scan_has_lexeme(&scanner)) {
204 Token token = scan_get_lexeme(&scanner);
205 token.type = find_primitive_type(token.value);
206 push_token(&tokens, token);
207 }
208 while ((c = scan_next(&scanner)) != '\n' && c != '\0') {} 201 while ((c = scan_next(&scanner)) != '\n' && c != '\0') {}
209 } break; 202 } break;
210 case '"': { 203 case '"': {
211 if (scan_has_lexeme(&scanner)) {
212 Token token = scan_get_lexeme(&scanner);
213 token.type = find_primitive_type(token.value);
214 push_token(&tokens, token);
215 scanner.lexeme_col_number--;
216 scanner.lexeme_offset--;
217 }
218
219 char prev = c; 204 char prev = c;
220 bool found = false; 205 bool found = false;
206 size_t n = 0;
221 while (scan_has_next(&scanner)) { 207 while (scan_has_next(&scanner)) {
222 c = scan_next(&scanner); 208 c = scan_next(&scanner);
223 if (c == '"' && prev != '\\') { 209 if (c == '"' && prev != '\\') {
@@ -225,82 +211,89 @@ tokenize(const StringView *sv) {
225 break; 211 break;
226 } 212 }
227 prev = c; 213 prev = c;
228 scanner.lexeme_n++; 214 n++;
229 } 215 }
230 scanner.lexeme_col_number--; 216 if (!found) {
231 if (found) {
232 Token token = scan_get_lexeme(&scanner);
233 token.type = TOKEN_STRING;
234 push_token(&tokens, token);
235 } else {
236 // TODO: Report error: couldn't find the closing quotes. 217 // TODO: Report error: couldn't find the closing quotes.
237 } 218 }
219 Token token = (Token){
220 .value = (StringView){
221 .start = &sv->start[offset + 1],
222 .n = n,
223 },
224 .type = TOKEN_STRING,
225 .line = line,
226 .column = col,
227 };
228 push_token(&tokens, token);
238 } break; 229 } break;
239 case '\'': { 230 case '\'': {
240 if (scan_has_lexeme(&scanner)) { 231 Token token = (Token){
241 Token token = scan_get_lexeme(&scanner); 232 .type = TOKEN_QUOTE,
242 token.type = find_primitive_type(token.value); 233 .line = line,
243 push_token(&tokens, token); 234 .column = col,
244 scanner.lexeme_col_number--; 235 };
245 scanner.lexeme_offset--;
246 }
247 Token token = scan_get_lexeme(&scanner);
248 token.type = TOKEN_QUOTE;
249 push_token(&tokens, token); 236 push_token(&tokens, token);
250 } break; 237 } break;
251 case '(': { 238 case '(': {
252 if (scan_has_lexeme(&scanner)) {
253 Token token = scan_get_lexeme(&scanner);
254 token.type = find_primitive_type(token.value);
255 push_token(&tokens, token);
256 scanner.lexeme_col_number--;
257 scanner.lexeme_offset--;
258 }
259 scanner.lexeme_n++;
260 if (scan_peek(&scanner) == ')') { 239 if (scan_peek(&scanner) == ')') {
261 scanner.lexeme_n++;
262 scan_next(&scanner); 240 scan_next(&scanner);
263 Token token = scan_get_lexeme(&scanner); 241 Token token = (Token){
264 token.type = TOKEN_NIL; 242 .type = TOKEN_NIL,
243 .line = line,
244 .column = col,
245 };
265 push_token(&tokens, token); 246 push_token(&tokens, token);
266 } else { 247 } else {
267 Token token = scan_get_lexeme(&scanner); 248 Token token = (Token){
268 token.type = TOKEN_LPAREN; 249 .type = TOKEN_LPAREN,
250 .line = line,
251 .column = col,
252 };
269 push_token(&tokens, token); 253 push_token(&tokens, token);
270 } 254 }
271 } break; 255 } break;
272 case ')': { 256 case ')': {
273 if (scan_has_lexeme(&scanner)) { 257 Token token = (Token){
274 Token token = scan_get_lexeme(&scanner); 258 .type = TOKEN_RPAREN,
275 token.type = find_primitive_type(token.value); 259 .line = line,
276 push_token(&tokens, token); 260 .column = col,
277 scanner.lexeme_col_number--; 261 };
278 scanner.lexeme_offset--;
279 }
280 scanner.lexeme_n++;
281 Token token = scan_get_lexeme(&scanner);
282 token.type = TOKEN_RPAREN;
283 push_token(&tokens, token); 262 push_token(&tokens, token);
284 } break; 263 } break;
285 case EOF: {
286 break;
287 } break;
288 default: { 264 default: {
289 scanner.lexeme_n++; 265 size_t n = 1;
266 while (scan_has_next(&scanner)) {
267 c = scan_next(&scanner);
268 if (is_delimiter(c)) {
269 break;
270 }
271 n++;
272 }
273 if (c == EOF || c == '\0') {
274 break;
275 }
276 Token token = (Token){
277 .value = (StringView){
278 .start = &sv->start[offset],
279 .n = n,
280 },
281 .type = TOKEN_SYMBOL,
282 .line = line,
283 .column = col,
284 };
285 token.type = find_primitive_type(token.value);
286 push_token(&tokens, token);
290 } break; 287 } break;
291 } 288 }
292 } 289 }
293 290
294 // Push current lexeme if any.
295 if (scan_has_lexeme(&scanner)) {
296 Token token = scan_get_lexeme(&scanner);
297 token.type = find_primitive_type(token.value);
298 push_token(&tokens, token);
299 }
300
301 // Push EOF token. 291 // Push EOF token.
302 Token token = scan_get_lexeme(&scanner); 292 Token token = (Token){
303 token.type = TOKEN_EOF; 293 .type = TOKEN_EOF,
294 .line = scanner.line_number,
295 .column = 1,
296 };
304 push_token(&tokens, token); 297 push_token(&tokens, token);
305 298
306 return tokens; 299 return tokens;