diff options
author | Bad Diode <bd@badd10de.dev> | 2023-04-17 09:15:11 +0200 |
---|---|---|
committer | Bad Diode <bd@badd10de.dev> | 2023-04-17 09:15:11 +0200 |
commit | 54bf0529ab9e2af55f63890b819ece37ab854194 (patch) | |
tree | efbd46f676794537de9c20c59cc56ae9ff6ca829 | |
parent | fc83417ad04f19bfea4313cd811facf21d8dec9b (diff) | |
download | gba-link-cable-tester-54bf0529ab9e2af55f63890b819ece37ab854194.tar.gz gba-link-cable-tester-54bf0529ab9e2af55f63890b819ece37ab854194.zip |
Add big lookup tables (LUTs) for decoding 1bpp rows
While the performance increase from using these tables can be high, they
consume drastically more memory than the smaller tables used for per-nibble
decoding. For now I'm using the big tables, but a compile-time switch
allows selecting the small ones instead.
-rw-r--r-- | src/renderer_m0.c | 196 |
1 files changed, 174 insertions, 22 deletions
diff --git a/src/renderer_m0.c b/src/renderer_m0.c index d66785a..45923e8 100644 --- a/src/renderer_m0.c +++ b/src/renderer_m0.c | |||
@@ -69,11 +69,13 @@ draw_pixel(size_t x, size_t y, u8 clr) { | |||
69 | size_t tile_y = y / 8; | 69 | size_t tile_y = y / 8; |
70 | size_t start_col = x % 8; | 70 | size_t start_col = x % 8; |
71 | size_t start_row = y % 8; | 71 | size_t start_row = y % 8; |
72 | size_t pos = start_row + (tile_x + tile_y * 32) * 8; | 72 | u32 *dst = &backbuf[start_row + (tile_x + tile_y * 32) * 8]; |
73 | 73 | ||
74 | // Update backbuffer. | 74 | // Update backbuffer. |
75 | size_t shift = start_col * sizeof(u32); | 75 | size_t shift = start_col * sizeof(u32); |
76 | backbuf[pos] = (backbuf[pos] & ~(0xF << shift)) | clr << shift; | 76 | u32 mask = 0xF << shift; |
77 | u32 row = clr << shift; | ||
78 | *dst = (*dst & ~mask) | row; | ||
77 | 79 | ||
78 | // Mark tile as dirty. | 80 | // Mark tile as dirty. |
79 | dirty_tiles[tile_y] |= 1 << tile_x; | 81 | dirty_tiles[tile_y] |= 1 << tile_x; |
@@ -100,11 +102,11 @@ draw_hline(size_t x0, size_t x1, size_t y0, u8 clr) { | |||
100 | // partial row updates, rows in the middle can write the entire | 102 | // partial row updates, rows in the middle can write the entire |
101 | // row. | 103 | // row. |
102 | size_t dtx = tile_x1 - tile_x0; | 104 | size_t dtx = tile_x1 - tile_x0; |
103 | u32 *dst = &backbuf[tile_x0 * 8 + tile_y * 8 * 32 + start_row]; | 105 | u32 *dst = &backbuf[start_row + (tile_x0 + tile_y * 32) * 8]; |
104 | if (dtx < 1) { | 106 | if (dtx < 1) { |
105 | u32 mask = 0xFFFFFFFF; | 107 | size_t shift_left = start_col * 4; |
106 | mask >>= (7 - end_col - dtx) * 4; | 108 | size_t shift_right = (7 - end_col) * 4; |
107 | mask &= 0xFFFFFFFF << start_col * 4; | 109 | u32 mask = (0xFFFFFFFF >> shift_right) & (0xFFFFFFFF << shift_left); |
108 | u32 row = (0x11111111 * clr) & mask; | 110 | u32 row = (0x11111111 * clr) & mask; |
109 | *dst = (*dst & ~mask) | row; | 111 | *dst = (*dst & ~mask) | row; |
110 | } else { | 112 | } else { |
@@ -139,7 +141,7 @@ draw_vline(size_t x0, size_t y0, size_t y1, u8 clr) { | |||
139 | 141 | ||
140 | size_t shift_left = start_col * 4; | 142 | size_t shift_left = start_col * 4; |
141 | 143 | ||
142 | u32 *dst = &backbuf[tile_x * 8 + tile_y * 8 * 32 + start_row0]; | 144 | u32 *dst = &backbuf[start_row0 + (tile_x + tile_y * 32) * 8]; |
143 | u32 mask = 0x0000000F << shift_left; | 145 | u32 mask = 0x0000000F << shift_left; |
144 | u32 row = (0x11111111 * clr) & mask; | 146 | u32 row = (0x11111111 * clr) & mask; |
145 | u32 dty = tile_y1 - tile_y0; | 147 | u32 dty = tile_y1 - tile_y0; |
@@ -176,22 +178,31 @@ draw_line(size_t x0, size_t y0, size_t x1, size_t y1, u8 clr) { | |||
176 | MAYBE_SWAP(y0, y1); | 178 | MAYBE_SWAP(y0, y1); |
177 | draw_vline(x0, y0, y1, clr); | 179 | draw_vline(x0, y0, y1, clr); |
178 | } else { | 180 | } else { |
179 | #if 0 | 181 | #if 1 |
180 | // Diagonal line. | ||
181 | int dx = x0 > x1 ? x0 - x1 : x1 - x0; | 182 | int dx = x0 > x1 ? x0 - x1 : x1 - x0; |
182 | int dy = y0 > y1 ? y1 - y0 : y0 - y1; | 183 | int dy = y0 > y1 ? y0 - y1 : y1 - y0; |
183 | int x_step = x0 < x1 ? 1 : -1; | 184 | int x_step = x0 > x1 ? -1 : 1; |
184 | int y_step = y0 < y1 ? 1 : -1; | 185 | int y_step = y0 > y1 ? -1 : 1; |
185 | int err = dx + dy; | 186 | if (dx >= dy) { |
186 | while (!(x0 == x1 && y0 == y1)) { | 187 | int diff = 2 * dy - dx; |
187 | draw_pixel(x0, y0, clr); | 188 | for (int i = 0; i < dx + 1; i++) { |
188 | int diff = 2 * err; | 189 | draw_pixel(x0, y0, clr); |
189 | if (diff >= dy) { | 190 | if (diff >= 0) { |
190 | err += dy; | 191 | diff -= 2 * dx; |
192 | y0 += y_step; | ||
193 | } | ||
194 | diff += 2 * dy; | ||
191 | x0 += x_step; | 195 | x0 += x_step; |
192 | } | 196 | } |
193 | if (diff <= dx) { | 197 | } else { |
194 | err += dx; | 198 | int diff = 2 * dx - dy; |
199 | for (int i = 0; i < dy + 1; i++) { | ||
200 | draw_pixel(x0, y0, clr); | ||
201 | if (diff >= 0) { | ||
202 | diff -= 2 * dy; | ||
203 | x0 += x_step; | ||
204 | } | ||
205 | diff += 2 * dx; | ||
195 | y0 += y_step; | 206 | y0 += y_step; |
196 | } | 207 | } |
197 | } | 208 | } |
@@ -402,6 +413,130 @@ flip_buffer(void) { | |||
402 | // } | 413 | // } |
403 | } | 414 | } |
404 | 415 | ||
416 | #define DEC_BIG_LUT 1 | ||
417 | |||
418 | #if DEC_BIG_LUT | ||
419 | static u32 dec_byte_flip_x[256] = { | ||
420 | 0x00000000, 0x00000001, 0x00000010, 0x00000011, 0x00000100, | ||
421 | 0x00000101, 0x00000110, 0x00000111, 0x00001000, 0x00001001, | ||
422 | 0x00001010, 0x00001011, 0x00001100, 0x00001101, 0x00001110, | ||
423 | 0x00001111, 0x00010000, 0x00010001, 0x00010010, 0x00010011, | ||
424 | 0x00010100, 0x00010101, 0x00010110, 0x00010111, 0x00011000, | ||
425 | 0x00011001, 0x00011010, 0x00011011, 0x00011100, 0x00011101, | ||
426 | 0x00011110, 0x00011111, 0x00100000, 0x00100001, 0x00100010, | ||
427 | 0x00100011, 0x00100100, 0x00100101, 0x00100110, 0x00100111, | ||
428 | 0x00101000, 0x00101001, 0x00101010, 0x00101011, 0x00101100, | ||
429 | 0x00101101, 0x00101110, 0x00101111, 0x00110000, 0x00110001, | ||
430 | 0x00110010, 0x00110011, 0x00110100, 0x00110101, 0x00110110, | ||
431 | 0x00110111, 0x00111000, 0x00111001, 0x00111010, 0x00111011, | ||
432 | 0x00111100, 0x00111101, 0x00111110, 0x00111111, 0x01000000, | ||
433 | 0x01000001, 0x01000010, 0x01000011, 0x01000100, 0x01000101, | ||
434 | 0x01000110, 0x01000111, 0x01001000, 0x01001001, 0x01001010, | ||
435 | 0x01001011, 0x01001100, 0x01001101, 0x01001110, 0x01001111, | ||
436 | 0x01010000, 0x01010001, 0x01010010, 0x01010011, 0x01010100, | ||
437 | 0x01010101, 0x01010110, 0x01010111, 0x01011000, 0x01011001, | ||
438 | 0x01011010, 0x01011011, 0x01011100, 0x01011101, 0x01011110, | ||
439 | 0x01011111, 0x01100000, 0x01100001, 0x01100010, 0x01100011, | ||
440 | 0x01100100, 0x01100101, 0x01100110, 0x01100111, 0x01101000, | ||
441 | 0x01101001, 0x01101010, 0x01101011, 0x01101100, 0x01101101, | ||
442 | 0x01101110, 0x01101111, 0x01110000, 0x01110001, 0x01110010, | ||
443 | 0x01110011, 0x01110100, 0x01110101, 0x01110110, 0x01110111, | ||
444 | 0x01111000, 0x01111001, 0x01111010, 0x01111011, 0x01111100, | ||
445 | 0x01111101, 0x01111110, 0x01111111, 0x10000000, 0x10000001, | ||
446 | 0x10000010, 0x10000011, 0x10000100, 0x10000101, 0x10000110, | ||
447 | 0x10000111, 0x10001000, 0x10001001, 0x10001010, 0x10001011, | ||
448 | 0x10001100, 0x10001101, 0x10001110, 0x10001111, 0x10010000, | ||
449 | 0x10010001, 0x10010010, 0x10010011, 0x10010100, 0x10010101, | ||
450 | 0x10010110, 0x10010111, 0x10011000, 0x10011001, 0x10011010, | ||
451 | 0x10011011, 0x10011100, 0x10011101, 0x10011110, 0x10011111, | ||
452 | 0x10100000, 0x10100001, 0x10100010, 0x10100011, 0x10100100, | ||
453 | 0x10100101, 0x10100110, 0x10100111, 0x10101000, 0x10101001, | ||
454 | 0x10101010, 0x10101011, 0x10101100, 0x10101101, 0x10101110, | ||
455 | 0x10101111, 0x10110000, 0x10110001, 0x10110010, 0x10110011, | ||
456 | 0x10110100, 0x10110101, 0x10110110, 0x10110111, 0x10111000, | ||
457 | 0x10111001, 0x10111010, 0x10111011, 0x10111100, 0x10111101, | ||
458 | 0x10111110, 0x10111111, 0x11000000, 0x11000001, 0x11000010, | ||
459 | 0x11000011, 0x11000100, 0x11000101, 0x11000110, 0x11000111, | ||
460 | 0x11001000, 0x11001001, 0x11001010, 0x11001011, 0x11001100, | ||
461 | 0x11001101, 0x11001110, 0x11001111, 0x11010000, 0x11010001, | ||
462 | 0x11010010, 0x11010011, 0x11010100, 0x11010101, 0x11010110, | ||
463 | 0x11010111, 0x11011000, 0x11011001, 0x11011010, 0x11011011, | ||
464 | 0x11011100, 0x11011101, 0x11011110, 0x11011111, 0x11100000, | ||
465 | 0x11100001, 0x11100010, 0x11100011, 0x11100100, 0x11100101, | ||
466 | 0x11100110, 0x11100111, 0x11101000, 0x11101001, 0x11101010, | ||
467 | 0x11101011, 0x11101100, 0x11101101, 0x11101110, 0x11101111, | ||
468 | 0x11110000, 0x11110001, 0x11110010, 0x11110011, 0x11110100, | ||
469 | 0x11110101, 0x11110110, 0x11110111, 0x11111000, 0x11111001, | ||
470 | 0x11111010, 0x11111011, 0x11111100, 0x11111101, 0x11111110, | ||
471 | 0x11111111 | ||
472 | }; | ||
473 | |||
474 | static u32 dec_byte[256] = { | ||
475 | 0x00000000, 0x10000000, 0x01000000, 0x11000000, 0x00100000, | ||
476 | 0x10100000, 0x01100000, 0x11100000, 0x00010000, 0x10010000, | ||
477 | 0x01010000, 0x11010000, 0x00110000, 0x10110000, 0x01110000, | ||
478 | 0x11110000, 0x00001000, 0x10001000, 0x01001000, 0x11001000, | ||
479 | 0x00101000, 0x10101000, 0x01101000, 0x11101000, 0x00011000, | ||
480 | 0x10011000, 0x01011000, 0x11011000, 0x00111000, 0x10111000, | ||
481 | 0x01111000, 0x11111000, 0x00000100, 0x10000100, 0x01000100, | ||
482 | 0x11000100, 0x00100100, 0x10100100, 0x01100100, 0x11100100, | ||
483 | 0x00010100, 0x10010100, 0x01010100, 0x11010100, 0x00110100, | ||
484 | 0x10110100, 0x01110100, 0x11110100, 0x00001100, 0x10001100, | ||
485 | 0x01001100, 0x11001100, 0x00101100, 0x10101100, 0x01101100, | ||
486 | 0x11101100, 0x00011100, 0x10011100, 0x01011100, 0x11011100, | ||
487 | 0x00111100, 0x10111100, 0x01111100, 0x11111100, 0x00000010, | ||
488 | 0x10000010, 0x01000010, 0x11000010, 0x00100010, 0x10100010, | ||
489 | 0x01100010, 0x11100010, 0x00010010, 0x10010010, 0x01010010, | ||
490 | 0x11010010, 0x00110010, 0x10110010, 0x01110010, 0x11110010, | ||
491 | 0x00001010, 0x10001010, 0x01001010, 0x11001010, 0x00101010, | ||
492 | 0x10101010, 0x01101010, 0x11101010, 0x00011010, 0x10011010, | ||
493 | 0x01011010, 0x11011010, 0x00111010, 0x10111010, 0x01111010, | ||
494 | 0x11111010, 0x00000110, 0x10000110, 0x01000110, 0x11000110, | ||
495 | 0x00100110, 0x10100110, 0x01100110, 0x11100110, 0x00010110, | ||
496 | 0x10010110, 0x01010110, 0x11010110, 0x00110110, 0x10110110, | ||
497 | 0x01110110, 0x11110110, 0x00001110, 0x10001110, 0x01001110, | ||
498 | 0x11001110, 0x00101110, 0x10101110, 0x01101110, 0x11101110, | ||
499 | 0x00011110, 0x10011110, 0x01011110, 0x11011110, 0x00111110, | ||
500 | 0x10111110, 0x01111110, 0x11111110, 0x00000001, 0x10000001, | ||
501 | 0x01000001, 0x11000001, 0x00100001, 0x10100001, 0x01100001, | ||
502 | 0x11100001, 0x00010001, 0x10010001, 0x01010001, 0x11010001, | ||
503 | 0x00110001, 0x10110001, 0x01110001, 0x11110001, 0x00001001, | ||
504 | 0x10001001, 0x01001001, 0x11001001, 0x00101001, 0x10101001, | ||
505 | 0x01101001, 0x11101001, 0x00011001, 0x10011001, 0x01011001, | ||
506 | 0x11011001, 0x00111001, 0x10111001, 0x01111001, 0x11111001, | ||
507 | 0x00000101, 0x10000101, 0x01000101, 0x11000101, 0x00100101, | ||
508 | 0x10100101, 0x01100101, 0x11100101, 0x00010101, 0x10010101, | ||
509 | 0x01010101, 0x11010101, 0x00110101, 0x10110101, 0x01110101, | ||
510 | 0x11110101, 0x00001101, 0x10001101, 0x01001101, 0x11001101, | ||
511 | 0x00101101, 0x10101101, 0x01101101, 0x11101101, 0x00011101, | ||
512 | 0x10011101, 0x01011101, 0x11011101, 0x00111101, 0x10111101, | ||
513 | 0x01111101, 0x11111101, 0x00000011, 0x10000011, 0x01000011, | ||
514 | 0x11000011, 0x00100011, 0x10100011, 0x01100011, 0x11100011, | ||
515 | 0x00010011, 0x10010011, 0x01010011, 0x11010011, 0x00110011, | ||
516 | 0x10110011, 0x01110011, 0x11110011, 0x00001011, 0x10001011, | ||
517 | 0x01001011, 0x11001011, 0x00101011, 0x10101011, 0x01101011, | ||
518 | 0x11101011, 0x00011011, 0x10011011, 0x01011011, 0x11011011, | ||
519 | 0x00111011, 0x10111011, 0x01111011, 0x11111011, 0x00000111, | ||
520 | 0x10000111, 0x01000111, 0x11000111, 0x00100111, 0x10100111, | ||
521 | 0x01100111, 0x11100111, 0x00010111, 0x10010111, 0x01010111, | ||
522 | 0x11010111, 0x00110111, 0x10110111, 0x01110111, 0x11110111, | ||
523 | 0x00001111, 0x10001111, 0x01001111, 0x11001111, 0x00101111, | ||
524 | 0x10101111, 0x01101111, 0x11101111, 0x00011111, 0x10011111, | ||
525 | 0x01011111, 0x11011111, 0x00111111, 0x10111111, 0x01111111, | ||
526 | 0x11111111 | ||
527 | }; | ||
528 | |||
529 | IWRAM_CODE | ||
530 | static inline | ||
531 | u32 | ||
532 | decode_1bpp(u8 row, u8 flip_x) { | ||
533 | if (flip_x) { | ||
534 | return dec_byte_flip_x[row]; | ||
535 | } | ||
536 | return dec_byte[row]; | ||
537 | } | ||
538 | |||
539 | #else | ||
405 | static u16 dec_nibble[] = { | 540 | static u16 dec_nibble[] = { |
406 | 0x0000, 0x1000, 0x0100, 0x1100, | 541 | 0x0000, 0x1000, 0x0100, 0x1100, |
407 | 0x0010, 0x1010, 0x0110, 0x1110, | 542 | 0x0010, 0x1010, 0x0110, 0x1110, |
@@ -427,6 +562,7 @@ decode_1bpp(u8 row, u8 flip_x) { | |||
427 | u16 *lut = dec_nibble; | 562 | u16 *lut = dec_nibble; |
428 | return (u32)lut[(row >> 0) & 0xF] << 16 | (u32)lut[(row >> 4) & 0xF]; | 563 | return (u32)lut[(row >> 0) & 0xF] << 16 | (u32)lut[(row >> 4) & 0xF]; |
429 | } | 564 | } |
565 | #endif | ||
430 | 566 | ||
431 | IWRAM_CODE | 567 | IWRAM_CODE |
432 | static inline | 568 | static inline |
@@ -441,18 +577,34 @@ draw_2bpp_row(size_t x, size_t y, u8 a, u8 b, u8 flip_x) { | |||
441 | size_t shift_left = start_col * 4; | 577 | size_t shift_left = start_col * 4; |
442 | size_t shift_right = (8 - start_col) * 4; | 578 | size_t shift_right = (8 - start_col) * 4; |
443 | 579 | ||
444 | u32 *dst = &backbuf[tile_x * 8 + tile_y * 8 * 32 + start_row]; | 580 | u32 *dst = &backbuf[start_row + (tile_x + tile_y * 32) * 8]; |
581 | #if DEC_BIG_LUT | ||
582 | u32 *lut = dec_byte; | ||
583 | if (flip_x) { | ||
584 | lut = dec_byte_flip_x; | ||
585 | } | ||
586 | #endif | ||
445 | if (start_col == 0) { | 587 | if (start_col == 0) { |
588 | #if DEC_BIG_LUT | ||
589 | u32 clr_a = lut[a]; | ||
590 | u32 clr_b = lut[b]; | ||
591 | #else | ||
446 | u32 clr_a = decode_1bpp(a, flip_x); | 592 | u32 clr_a = decode_1bpp(a, flip_x); |
447 | u32 clr_b = decode_1bpp(b, flip_x); | 593 | u32 clr_b = decode_1bpp(b, flip_x); |
594 | #endif | ||
448 | u32 mask_a = (clr_a * 0xF); | 595 | u32 mask_a = (clr_a * 0xF); |
449 | u32 mask_b = (clr_b * 0xF); | 596 | u32 mask_b = (clr_b * 0xF); |
450 | u32 mask = (mask_a | mask_b); | 597 | u32 mask = (mask_a | mask_b); |
451 | u32 color = clr_a + (clr_b << 1); | 598 | u32 color = clr_a + (clr_b << 1); |
452 | dst[0] = (dst[0] & ~mask) | color; | 599 | dst[0] = (dst[0] & ~mask) | color; |
453 | } else { | 600 | } else { |
601 | #if DEC_BIG_LUT | ||
602 | u32 clr_a = lut[a]; | ||
603 | u32 clr_b = lut[b]; | ||
604 | #else | ||
454 | u32 clr_a = decode_1bpp(a, flip_x); | 605 | u32 clr_a = decode_1bpp(a, flip_x); |
455 | u32 clr_b = decode_1bpp(b, flip_x); | 606 | u32 clr_b = decode_1bpp(b, flip_x); |
607 | #endif | ||
456 | u32 mask_a = (clr_a * 0xF); | 608 | u32 mask_a = (clr_a * 0xF); |
457 | u32 mask_b = (clr_b * 0xF); | 609 | u32 mask_b = (clr_b * 0xF); |
458 | u32 mask = (mask_a | mask_b); | 610 | u32 mask = (mask_a | mask_b); |
@@ -477,7 +629,7 @@ draw_1bpp_row(size_t x, size_t y, u8 a, u8 clr, u8 flip_x) { | |||
477 | size_t shift_left = start_col * 4; | 629 | size_t shift_left = start_col * 4; |
478 | size_t shift_right = (8 - start_col) * 4; | 630 | size_t shift_right = (8 - start_col) * 4; |
479 | 631 | ||
480 | u32 *dst = &backbuf[tile_x * 8 + tile_y * 8 * 32]; | 632 | u32 *dst = &backbuf[(tile_x + tile_y * 32) * 8]; |
481 | dst += start_row; | 633 | dst += start_row; |
482 | if (start_col == 0) { | 634 | if (start_col == 0) { |
483 | u32 color = decode_1bpp(a, flip_x); | 635 | u32 color = decode_1bpp(a, flip_x); |