author    Bad Diode <bd@badd10de.dev>  2022-02-01 18:36:52 +0100
committer Bad Diode <bd@badd10de.dev>  2022-02-01 18:36:52 +0100
commit    ee1a5de91c875fb66724dc21c02333bfebe2a812 (patch)
tree      d3eaa226816d295bb9dc48a2aed27044832ec413 /src/lexer.c
parent    3156265c7b2da8cc43fee996c0518ea274d39c8a (diff)
Add new syntax to lexer and prepare refactor
Diffstat (limited to 'src/lexer.c')
-rw-r--r--  src/lexer.c | 224
1 file changed, 144 insertions(+), 80 deletions(-)
diff --git a/src/lexer.c b/src/lexer.c
index 09c8f6c..56b670b 100644
--- a/src/lexer.c
+++ b/src/lexer.c
@@ -5,7 +5,11 @@ static const char* token_str[] = {
     [TOKEN_UNKNOWN] = "TOKEN_UNKNOWN",
     [TOKEN_LPAREN] = "TOKEN_LPAREN",
     [TOKEN_RPAREN] = "TOKEN_RPAREN",
-    [TOKEN_FIXNUM] = "TOKEN_FIXNUM",
+    [TOKEN_LSQUARE] = "TOKEN_LSQUARE",
+    [TOKEN_RSQUARE] = "TOKEN_RSQUARE",
+    [TOKEN_LCURLY] = "TOKEN_LCURLY",
+    [TOKEN_RCURLY] = "TOKEN_RCURLY",
+    [TOKEN_NUMBER] = "TOKEN_NUMBER",
     [TOKEN_SYMBOL] = "TOKEN_SYMBOL",
     [TOKEN_STRING] = "TOKEN_STRING",
     [TOKEN_NIL] = "TOKEN_NIL",
@@ -16,6 +20,10 @@ static const char* token_str[] = {
     [TOKEN_DEF] = "TOKEN_DEF",
     [TOKEN_SET] = "TOKEN_SET",
     [TOKEN_FUN] = "TOKEN_FUN",
+    [TOKEN_STRUCT] = "TOKEN_STRUCT",
+    [TOKEN_COLON] = "TOKEN_COLON",
+    [TOKEN_DOT] = "TOKEN_DOT",
+    [TOKEN_AT] = "TOKEN_AT",
     [TOKEN_EOF] = "TOKEN_EOF",
 };
 
@@ -24,14 +32,8 @@ print_token(Token tok) {
     printf("[%4ld:%-4ld] ", tok.line, tok.col);
     printf("%s", token_str[tok.type]);
     switch (tok.type) {
-        case TOKEN_FIXNUM: {
-            printf(" -> ");
-            sv_write(&tok.value);
-        } break;
-        case TOKEN_SYMBOL: {
-            printf(" -> ");
-            sv_write(&tok.value);
-        } break;
+        case TOKEN_NUMBER:
+        case TOKEN_SYMBOL:
         case TOKEN_STRING: {
            printf(" -> ");
            sv_write(&tok.value);
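
Note that the three value-carrying token types now share one printing arm via C case fallthrough: TOKEN_NUMBER and TOKEN_SYMBOL fall through into the TOKEN_STRING case, which prints the token's backing StringView.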
@@ -55,6 +57,12 @@ scan_next(Scanner *scanner) {
     return c;
 }
 
+void
+scan_rewind(Scanner *scanner) {
+    sv_rewind(&scanner->current);
+    scanner->offset--;
+}
+
 char
 scan_peek(const Scanner *scanner) {
     return sv_peek(&scanner->current);
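
scan_rewind steps the scanner back by one character. It exists for the number path in tokenize below: the default branch only discovers it is looking at a number after consuming the first digit, so it rewinds before handing the literal to scan_number_token. A minimal sketch of that pairing, using only names from this diff:

    // Back up after consuming a digit so scan_number_token()
    // can re-read the literal from its first character.
    char c = scan_next(&scanner);
    if (c >= '0' && c <= '9') {
        scan_rewind(&scanner);
        size_t n = scan_number_token(&scanner);
        // n == 0 signals a malformed number literal.
    }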
@@ -95,6 +103,12 @@ is_delimiter(char c) {
         case '\'':
         case '(':
         case ')':
+        case '[':
+        case ']':
+        case '{':
+        case '}':
+        case ':':
+        case '@':
         case ' ':
         case '\f':
         case '\n':
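
With '[', ']', '{', '}', ':' and '@' added as delimiters, punctuation now terminates a symbol instead of being absorbed into it; once the tokenize changes below land, an input such as foo:bar should lex as a symbol, a colon, and another symbol. Note that '.' is deliberately not a delimiter: scan_number_token needs to consume the dot in floating-point literals, so the symbol loop in tokenize stops at '.' explicitly instead.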
@@ -110,22 +124,65 @@ is_delimiter(char c) {
 #define TOKEN_IS_KEYWORD(VAL, KEYWORD) \
     sv_equal(&(VAL), &(StringView){(KEYWORD), sizeof(KEYWORD) - 1})
 
-TokenType
-find_primitive_type(const StringView value) {
-    bool is_fixnum = true;
-    for (size_t i = 0; i < value.n; i++) {
-        char c = value.start[i];
-        if (i == 0 && c == '-' && value.n > 1) {
-            continue;
-        }
-        if (!(c >= '0' && c <= '9')) {
-            is_fixnum = false;
-            break;
-        }
-    }
-    if (is_fixnum) {
-        return TOKEN_FIXNUM;
-    }
+size_t
+scan_number_token(Scanner *scanner) {
+    char first = scan_next(scanner);
+    char second = scan_peek(scanner);
+    size_t n = 1;
+    if (first == '0' && !is_delimiter(second)) {
+        if (second == 'x') {
+            // Hex constant.
+            scan_next(scanner);
+            n++;
+            if (is_delimiter(scan_peek(scanner))) {
+                return 0;
+            }
+            while (!is_delimiter(scan_peek(scanner))) {
+                char c = scan_next(scanner);
+                if (!(c >= '0' && c <= '9') &&
+                    !(c >= 'a' && c <= 'f') &&
+                    !(c >= 'A' && c <= 'F')) {
+                    return 0;
+                }
+                n++;
+            }
+            return n;
+        } else if (second == 'b') {
+            // Binary constant.
+            scan_next(scanner);
+            n++;
+            if (is_delimiter(scan_peek(scanner))) {
+                return 0;
+            }
+            while (!is_delimiter(scan_peek(scanner))) {
+                char c = scan_next(scanner);
+                if (!(c == '0' || c == '1')) {
+                    return 0;
+                }
+                n++;
+            }
+        }
+    }
+
+    // Decimal number or floating point.
+    bool has_dot = false;
+    while (!is_delimiter(scan_peek(scanner))) {
+        char c = scan_next(scanner);
+        if (c == '.') {
+            if (has_dot) {
+                return 0;
+            }
+            has_dot = true;
+        } else if (!(c >= '0' && c <= '9')) {
+            return 0;
+        }
+        n++;
+    }
+    return n;
+}
+
+TokenType
+find_token_type(const StringView value) {
     if (TOKEN_IS_KEYWORD(value, "nil")) { return TOKEN_NIL; }
     if (TOKEN_IS_KEYWORD(value, "true")) { return TOKEN_TRUE; }
     if (TOKEN_IS_KEYWORD(value, "false")) { return TOKEN_FALSE; }
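
In short, the new number grammar is: a leading 0x selects hex digits, 0b selects binary digits, and everything else must be decimal digits with at most one '.'; scan_number_token returns the literal's length in characters, with 0 signalling a malformed literal. The following standalone sketch re-implements the same rules over a NUL-terminated string purely for illustration; valid_number and is_hex are hypothetical helpers, not part of this codebase:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    static bool is_hex(char c) {
        return (c >= '0' && c <= '9') ||
               (c >= 'a' && c <= 'f') ||
               (c >= 'A' && c <= 'F');
    }

    // Length of the literal in characters, or 0 if malformed,
    // mirroring the scan_number_token convention above.
    static size_t valid_number(const char *s) {
        size_t n = 0;
        if (s[0] == '0' && (s[1] == 'x' || s[1] == 'b')) {
            char base = s[1];
            n = 2;
            if (s[n] == '\0') {
                return 0;  // a bare "0x" or "0b" is malformed
            }
            while (s[n] != '\0') {
                bool ok = (base == 'x') ? is_hex(s[n])
                                        : (s[n] == '0' || s[n] == '1');
                if (!ok) {
                    return 0;
                }
                n++;
            }
            return n;
        }
        bool has_dot = false;  // decimal integer or float
        while (s[n] != '\0') {
            if (s[n] == '.') {
                if (has_dot) {
                    return 0;  // at most one decimal point
                }
                has_dot = true;
            } else if (!(s[n] >= '0' && s[n] <= '9')) {
                return 0;
            }
            n++;
        }
        return n;
    }

    int main(void) {
        const char *samples[] = { "42", "3.14", "0xff", "0b101", "0x", "1.2.3" };
        for (size_t i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
            printf("%-6s -> %zu\n", samples[i], valid_number(samples[i]));
        }
        return 0;  // expected lengths: 2, 4, 4, 5, 0, 0
    }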
@@ -134,12 +191,20 @@ find_primitive_type(const StringView value) {
     if (TOKEN_IS_KEYWORD(value, "def")) { return TOKEN_DEF; }
     if (TOKEN_IS_KEYWORD(value, "set!")) { return TOKEN_SET; }
     if (TOKEN_IS_KEYWORD(value, "fun")) { return TOKEN_FUN; }
+    if (TOKEN_IS_KEYWORD(value, "struct")) { return TOKEN_STRUCT; }
 
     return TOKEN_SYMBOL;
 }
 
+void
+print_tokens(Token *tokens) {
+    for (size_t i = 0; i < array_size(tokens); i++) {
+        print_token(tokens[i]);
+    }
+}
+
 Token *
-tokenize(const StringView *sv, Errors *errors) {
+tokenize(const StringView *sv) {
     Token *tokens = NULL;
     array_init(tokens, 1);
     Scanner scanner = (Scanner){
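
Besides adding the print_tokens debugging helper, this hunk changes the public entry point: tokenize no longer threads an Errors accumulator through its signature, and failures are reported via push_error (see the following hunks). A usage sketch, assuming the project's StringView layout and dynamic-array helpers behave as used elsewhere in this diff:

    StringView src = { .start = "(def x 0xff)", .n = 12 };
    Token *tokens = tokenize(&src);  // errors now surface through push_error()
    print_tokens(tokens);            // debug dump, one "[line:col] TOKEN_*" per token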
@@ -153,10 +218,16 @@ tokenize(const StringView *sv, Errors *errors) {
         size_t line = scanner.line_number;
         size_t col = scanner.col_number;
         size_t offset = scanner.offset;
+        Token token = (Token){
+            .type = TOKEN_UNKNOWN,
+            .line = line,
+            .col = col,
+        };
         char c = scan_next(&scanner);
         switch (c) {
             case ';': {
                 while ((c = scan_next(&scanner)) != '\n' && c != '\0') {}
+                continue;
             } break;
             case '"': {
                 char prev = c;
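
Each loop iteration now starts from a single Token initialized to TOKEN_UNKNOWN, so the cases below only fill in the type and value, and one shared array_push at the bottom of the loop (end of the next hunk) emits the token; anything still marked TOKEN_UNKNOWN there is reported as a lexer error. The added continue exists for exactly that reason: a comment produces no token, so it must skip the shared push.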
@@ -172,73 +243,66 @@ tokenize(const StringView *sv, Errors *errors) {
                     n++;
                 }
                 if (!found) {
-                    error_push(errors, (Error){
-                        .type = ERR_TYPE_LEXER,
-                        .value = ERR_UNMATCHED_STRING,
-                        .line = line,
-                        .col = col,
-                    });
+                    push_error(ERR_TYPE_LEXER, ERR_UNMATCHED_STRING, line, col);
                     return tokens;
                 }
-                Token token = (Token){
-                    .value = (StringView){
-                        .start = &sv->start[offset + 1],
-                        .n = n,
-                    },
-                    .type = TOKEN_STRING,
-                    .line = line,
-                    .col = col,
-                };
-                array_push(tokens, token);
-            } break;
-            case '(': {
-                if (scan_peek(&scanner) == ')') {
-                    scan_next(&scanner);
-                    Token token = (Token){
-                        .type = TOKEN_NIL,
-                        .line = line,
-                        .col = col,
-                    };
-                    array_push(tokens, token);
-                } else {
-                    Token token = (Token){
-                        .type = TOKEN_LPAREN,
-                        .line = line,
-                        .col = col,
-                    };
-                    array_push(tokens, token);
-                }
-            } break;
-            case ')': {
-                Token token = (Token){
-                    .type = TOKEN_RPAREN,
-                    .line = line,
-                    .col = col,
-                };
-                array_push(tokens, token);
+                token.value = (StringView){
+                    .start = &sv->start[offset + 1],
+                    .n = n,
+                };
+                token.type = TOKEN_STRING;
             } break;
+            case '(': { token.type = TOKEN_LPAREN; } break;
+            case ')': { token.type = TOKEN_RPAREN; } break;
+            case '[': { token.type = TOKEN_LSQUARE; } break;
+            case ']': { token.type = TOKEN_RSQUARE; } break;
+            case '{': { token.type = TOKEN_LCURLY; } break;
+            case '}': { token.type = TOKEN_RCURLY; } break;
+            case ':': { token.type = TOKEN_COLON; } break;
+            case '.': { token.type = TOKEN_DOT; } break;
+            case '@': { token.type = TOKEN_AT; } break;
             default: {
                 size_t n = 1;
-                while (!is_delimiter(scan_peek(&scanner))) {
-                    scan_next(&scanner);
-                    n++;
-                }
-                if (c == EOF || c == '\0') {
-                    break;
-                }
-                Token token = (Token){
-                    .value = (StringView){
-                        .start = &sv->start[offset],
-                        .n = n,
-                    },
-                    .type = TOKEN_SYMBOL,
-                    .line = line,
-                    .col = col,
-                };
-                token.type = find_primitive_type(token.value);
-                array_push(tokens, token);
+                if (c == '-' && !is_delimiter(scan_peek(&scanner))) {
+                    n += scan_number_token(&scanner);
+                    token.value = (StringView){
+                        .start = &sv->start[offset],
+                        .n = n,
+                    };
+                    token.type = TOKEN_NUMBER;
+                } else if (c >= '0' && c <= '9') {
+                    scan_rewind(&scanner);
+                    n = scan_number_token(&scanner);
+                    if (n == 0) {
+                        push_error(ERR_TYPE_LEXER, ERR_MALFORMED_NUMBER, line, col);
+                        return tokens;
+                    }
+                    token.value = (StringView){
+                        .start = &sv->start[offset],
+                        .n = n,
+                    };
+                    token.type = TOKEN_NUMBER;
+                } else {
+                    while (!is_delimiter(scan_peek(&scanner))) {
+                        if (scan_peek(&scanner) == '.') {
+                            break;
+                        }
+                        c = scan_next(&scanner);
+                        n++;
+                    }
+                    token.value = (StringView){
+                        .start = &sv->start[offset],
+                        .n = n,
+                    };
+                    token.type = find_token_type(token.value);
+                }
             } break;
         }
+        if (token.type == TOKEN_UNKNOWN) {
+            push_error(ERR_TYPE_LEXER, ERR_UNKNOWN_TOK_TYPE, line, col);
+            return tokens;
+        }
+        array_push(tokens, token);
     }
 
     // Push EOF token.
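
Taken together: where the old lexer would have scanned an input like {x:0b101} as one opaque TOKEN_SYMBOL (braces and colons were not delimiters), it should now produce TOKEN_LCURLY, TOKEN_SYMBOL "x", TOKEN_COLON, TOKEN_NUMBER "0b101", TOKEN_RCURLY, followed by the TOKEN_EOF pushed after the loop.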