summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBad Diode <bd@badd10de.dev>2023-04-17 09:15:11 +0200
committerBad Diode <bd@badd10de.dev>2023-04-17 09:15:11 +0200
commit54bf0529ab9e2af55f63890b819ece37ab854194 (patch)
treeefbd46f676794537de9c20c59cc56ae9ff6ca829
parentfc83417ad04f19bfea4313cd811facf21d8dec9b (diff)
downloadgba-link-cable-tester-54bf0529ab9e2af55f63890b819ece37ab854194.tar.gz
gba-link-cable-tester-54bf0529ab9e2af55f63890b819ece37ab854194.zip
Add big lut tables for decoding 1bpp rows
While the performance increase from using these tables can be high, they consume drastically more memory than the smaller tables used with per-nibble decoding. For now I'm using the big tables, but a compile-time switch allows selecting the small ones instead.
-rw-r--r--src/renderer_m0.c196
1 files changed, 174 insertions, 22 deletions
diff --git a/src/renderer_m0.c b/src/renderer_m0.c
index d66785a..45923e8 100644
--- a/src/renderer_m0.c
+++ b/src/renderer_m0.c
@@ -69,11 +69,13 @@ draw_pixel(size_t x, size_t y, u8 clr) {
69 size_t tile_y = y / 8; 69 size_t tile_y = y / 8;
70 size_t start_col = x % 8; 70 size_t start_col = x % 8;
71 size_t start_row = y % 8; 71 size_t start_row = y % 8;
72 size_t pos = start_row + (tile_x + tile_y * 32) * 8; 72 u32 *dst = &backbuf[start_row + (tile_x + tile_y * 32) * 8];
73 73
74 // Update backbuffer. 74 // Update backbuffer.
75 size_t shift = start_col * sizeof(u32); 75 size_t shift = start_col * sizeof(u32);
76 backbuf[pos] = (backbuf[pos] & ~(0xF << shift)) | clr << shift; 76 u32 mask = 0xF << shift;
77 u32 row = clr << shift;
78 *dst = (*dst & ~mask) | row;
77 79
78 // Mark tile as dirty. 80 // Mark tile as dirty.
79 dirty_tiles[tile_y] |= 1 << tile_x; 81 dirty_tiles[tile_y] |= 1 << tile_x;
@@ -100,11 +102,11 @@ draw_hline(size_t x0, size_t x1, size_t y0, u8 clr) {
100 // partial row updates, rows in the middle can write the entire 102 // partial row updates, rows in the middle can write the entire
101 // row. 103 // row.
102 size_t dtx = tile_x1 - tile_x0; 104 size_t dtx = tile_x1 - tile_x0;
103 u32 *dst = &backbuf[tile_x0 * 8 + tile_y * 8 * 32 + start_row]; 105 u32 *dst = &backbuf[start_row + (tile_x0 + tile_y * 32) * 8];
104 if (dtx < 1) { 106 if (dtx < 1) {
105 u32 mask = 0xFFFFFFFF; 107 size_t shift_left = start_col * 4;
106 mask >>= (7 - end_col - dtx) * 4; 108 size_t shift_right = (7 - end_col) * 4;
107 mask &= 0xFFFFFFFF << start_col * 4; 109 u32 mask = (0xFFFFFFFF >> shift_right) & (0xFFFFFFFF << shift_left);
108 u32 row = (0x11111111 * clr) & mask; 110 u32 row = (0x11111111 * clr) & mask;
109 *dst = (*dst & ~mask) | row; 111 *dst = (*dst & ~mask) | row;
110 } else { 112 } else {
@@ -139,7 +141,7 @@ draw_vline(size_t x0, size_t y0, size_t y1, u8 clr) {
139 141
140 size_t shift_left = start_col * 4; 142 size_t shift_left = start_col * 4;
141 143
142 u32 *dst = &backbuf[tile_x * 8 + tile_y * 8 * 32 + start_row0]; 144 u32 *dst = &backbuf[start_row0 + (tile_x + tile_y * 32) * 8];
143 u32 mask = 0x0000000F << shift_left; 145 u32 mask = 0x0000000F << shift_left;
144 u32 row = (0x11111111 * clr) & mask; 146 u32 row = (0x11111111 * clr) & mask;
145 u32 dty = tile_y1 - tile_y0; 147 u32 dty = tile_y1 - tile_y0;
@@ -176,22 +178,31 @@ draw_line(size_t x0, size_t y0, size_t x1, size_t y1, u8 clr) {
176 MAYBE_SWAP(y0, y1); 178 MAYBE_SWAP(y0, y1);
177 draw_vline(x0, y0, y1, clr); 179 draw_vline(x0, y0, y1, clr);
178 } else { 180 } else {
179#if 0 181#if 1
180 // Diagonal line.
181 int dx = x0 > x1 ? x0 - x1 : x1 - x0; 182 int dx = x0 > x1 ? x0 - x1 : x1 - x0;
182 int dy = y0 > y1 ? y1 - y0 : y0 - y1; 183 int dy = y0 > y1 ? y0 - y1 : y1 - y0;
183 int x_step = x0 < x1 ? 1 : -1; 184 int x_step = x0 > x1 ? -1 : 1;
184 int y_step = y0 < y1 ? 1 : -1; 185 int y_step = y0 > y1 ? -1 : 1;
185 int err = dx + dy; 186 if (dx >= dy) {
186 while (!(x0 == x1 && y0 == y1)) { 187 int diff = 2 * dy - dx;
187 draw_pixel(x0, y0, clr); 188 for (int i = 0; i < dx + 1; i++) {
188 int diff = 2 * err; 189 draw_pixel(x0, y0, clr);
189 if (diff >= dy) { 190 if (diff >= 0) {
190 err += dy; 191 diff -= 2 * dx;
192 y0 += y_step;
193 }
194 diff += 2 * dy;
191 x0 += x_step; 195 x0 += x_step;
192 } 196 }
193 if (diff <= dx) { 197 } else {
194 err += dx; 198 int diff = 2 * dx - dy;
199 for (int i = 0; i < dy + 1; i++) {
200 draw_pixel(x0, y0, clr);
201 if (diff >= 0) {
202 diff -= 2 * dy;
203 x0 += x_step;
204 }
205 diff += 2 * dx;
195 y0 += y_step; 206 y0 += y_step;
196 } 207 }
197 } 208 }
@@ -402,6 +413,130 @@ flip_buffer(void) {
402 // } 413 // }
403} 414}
404 415
416#define DEC_BIG_LUT 1
417
418#if DEC_BIG_LUT
419static u32 dec_byte_flip_x[256] = {
420 0x00000000, 0x00000001, 0x00000010, 0x00000011, 0x00000100,
421 0x00000101, 0x00000110, 0x00000111, 0x00001000, 0x00001001,
422 0x00001010, 0x00001011, 0x00001100, 0x00001101, 0x00001110,
423 0x00001111, 0x00010000, 0x00010001, 0x00010010, 0x00010011,
424 0x00010100, 0x00010101, 0x00010110, 0x00010111, 0x00011000,
425 0x00011001, 0x00011010, 0x00011011, 0x00011100, 0x00011101,
426 0x00011110, 0x00011111, 0x00100000, 0x00100001, 0x00100010,
427 0x00100011, 0x00100100, 0x00100101, 0x00100110, 0x00100111,
428 0x00101000, 0x00101001, 0x00101010, 0x00101011, 0x00101100,
429 0x00101101, 0x00101110, 0x00101111, 0x00110000, 0x00110001,
430 0x00110010, 0x00110011, 0x00110100, 0x00110101, 0x00110110,
431 0x00110111, 0x00111000, 0x00111001, 0x00111010, 0x00111011,
432 0x00111100, 0x00111101, 0x00111110, 0x00111111, 0x01000000,
433 0x01000001, 0x01000010, 0x01000011, 0x01000100, 0x01000101,
434 0x01000110, 0x01000111, 0x01001000, 0x01001001, 0x01001010,
435 0x01001011, 0x01001100, 0x01001101, 0x01001110, 0x01001111,
436 0x01010000, 0x01010001, 0x01010010, 0x01010011, 0x01010100,
437 0x01010101, 0x01010110, 0x01010111, 0x01011000, 0x01011001,
438 0x01011010, 0x01011011, 0x01011100, 0x01011101, 0x01011110,
439 0x01011111, 0x01100000, 0x01100001, 0x01100010, 0x01100011,
440 0x01100100, 0x01100101, 0x01100110, 0x01100111, 0x01101000,
441 0x01101001, 0x01101010, 0x01101011, 0x01101100, 0x01101101,
442 0x01101110, 0x01101111, 0x01110000, 0x01110001, 0x01110010,
443 0x01110011, 0x01110100, 0x01110101, 0x01110110, 0x01110111,
444 0x01111000, 0x01111001, 0x01111010, 0x01111011, 0x01111100,
445 0x01111101, 0x01111110, 0x01111111, 0x10000000, 0x10000001,
446 0x10000010, 0x10000011, 0x10000100, 0x10000101, 0x10000110,
447 0x10000111, 0x10001000, 0x10001001, 0x10001010, 0x10001011,
448 0x10001100, 0x10001101, 0x10001110, 0x10001111, 0x10010000,
449 0x10010001, 0x10010010, 0x10010011, 0x10010100, 0x10010101,
450 0x10010110, 0x10010111, 0x10011000, 0x10011001, 0x10011010,
451 0x10011011, 0x10011100, 0x10011101, 0x10011110, 0x10011111,
452 0x10100000, 0x10100001, 0x10100010, 0x10100011, 0x10100100,
453 0x10100101, 0x10100110, 0x10100111, 0x10101000, 0x10101001,
454 0x10101010, 0x10101011, 0x10101100, 0x10101101, 0x10101110,
455 0x10101111, 0x10110000, 0x10110001, 0x10110010, 0x10110011,
456 0x10110100, 0x10110101, 0x10110110, 0x10110111, 0x10111000,
457 0x10111001, 0x10111010, 0x10111011, 0x10111100, 0x10111101,
458 0x10111110, 0x10111111, 0x11000000, 0x11000001, 0x11000010,
459 0x11000011, 0x11000100, 0x11000101, 0x11000110, 0x11000111,
460 0x11001000, 0x11001001, 0x11001010, 0x11001011, 0x11001100,
461 0x11001101, 0x11001110, 0x11001111, 0x11010000, 0x11010001,
462 0x11010010, 0x11010011, 0x11010100, 0x11010101, 0x11010110,
463 0x11010111, 0x11011000, 0x11011001, 0x11011010, 0x11011011,
464 0x11011100, 0x11011101, 0x11011110, 0x11011111, 0x11100000,
465 0x11100001, 0x11100010, 0x11100011, 0x11100100, 0x11100101,
466 0x11100110, 0x11100111, 0x11101000, 0x11101001, 0x11101010,
467 0x11101011, 0x11101100, 0x11101101, 0x11101110, 0x11101111,
468 0x11110000, 0x11110001, 0x11110010, 0x11110011, 0x11110100,
469 0x11110101, 0x11110110, 0x11110111, 0x11111000, 0x11111001,
470 0x11111010, 0x11111011, 0x11111100, 0x11111101, 0x11111110,
471 0x11111111
472};
473
474static u32 dec_byte[256] = {
475 0x00000000, 0x10000000, 0x01000000, 0x11000000, 0x00100000,
476 0x10100000, 0x01100000, 0x11100000, 0x00010000, 0x10010000,
477 0x01010000, 0x11010000, 0x00110000, 0x10110000, 0x01110000,
478 0x11110000, 0x00001000, 0x10001000, 0x01001000, 0x11001000,
479 0x00101000, 0x10101000, 0x01101000, 0x11101000, 0x00011000,
480 0x10011000, 0x01011000, 0x11011000, 0x00111000, 0x10111000,
481 0x01111000, 0x11111000, 0x00000100, 0x10000100, 0x01000100,
482 0x11000100, 0x00100100, 0x10100100, 0x01100100, 0x11100100,
483 0x00010100, 0x10010100, 0x01010100, 0x11010100, 0x00110100,
484 0x10110100, 0x01110100, 0x11110100, 0x00001100, 0x10001100,
485 0x01001100, 0x11001100, 0x00101100, 0x10101100, 0x01101100,
486 0x11101100, 0x00011100, 0x10011100, 0x01011100, 0x11011100,
487 0x00111100, 0x10111100, 0x01111100, 0x11111100, 0x00000010,
488 0x10000010, 0x01000010, 0x11000010, 0x00100010, 0x10100010,
489 0x01100010, 0x11100010, 0x00010010, 0x10010010, 0x01010010,
490 0x11010010, 0x00110010, 0x10110010, 0x01110010, 0x11110010,
491 0x00001010, 0x10001010, 0x01001010, 0x11001010, 0x00101010,
492 0x10101010, 0x01101010, 0x11101010, 0x00011010, 0x10011010,
493 0x01011010, 0x11011010, 0x00111010, 0x10111010, 0x01111010,
494 0x11111010, 0x00000110, 0x10000110, 0x01000110, 0x11000110,
495 0x00100110, 0x10100110, 0x01100110, 0x11100110, 0x00010110,
496 0x10010110, 0x01010110, 0x11010110, 0x00110110, 0x10110110,
497 0x01110110, 0x11110110, 0x00001110, 0x10001110, 0x01001110,
498 0x11001110, 0x00101110, 0x10101110, 0x01101110, 0x11101110,
499 0x00011110, 0x10011110, 0x01011110, 0x11011110, 0x00111110,
500 0x10111110, 0x01111110, 0x11111110, 0x00000001, 0x10000001,
501 0x01000001, 0x11000001, 0x00100001, 0x10100001, 0x01100001,
502 0x11100001, 0x00010001, 0x10010001, 0x01010001, 0x11010001,
503 0x00110001, 0x10110001, 0x01110001, 0x11110001, 0x00001001,
504 0x10001001, 0x01001001, 0x11001001, 0x00101001, 0x10101001,
505 0x01101001, 0x11101001, 0x00011001, 0x10011001, 0x01011001,
506 0x11011001, 0x00111001, 0x10111001, 0x01111001, 0x11111001,
507 0x00000101, 0x10000101, 0x01000101, 0x11000101, 0x00100101,
508 0x10100101, 0x01100101, 0x11100101, 0x00010101, 0x10010101,
509 0x01010101, 0x11010101, 0x00110101, 0x10110101, 0x01110101,
510 0x11110101, 0x00001101, 0x10001101, 0x01001101, 0x11001101,
511 0x00101101, 0x10101101, 0x01101101, 0x11101101, 0x00011101,
512 0x10011101, 0x01011101, 0x11011101, 0x00111101, 0x10111101,
513 0x01111101, 0x11111101, 0x00000011, 0x10000011, 0x01000011,
514 0x11000011, 0x00100011, 0x10100011, 0x01100011, 0x11100011,
515 0x00010011, 0x10010011, 0x01010011, 0x11010011, 0x00110011,
516 0x10110011, 0x01110011, 0x11110011, 0x00001011, 0x10001011,
517 0x01001011, 0x11001011, 0x00101011, 0x10101011, 0x01101011,
518 0x11101011, 0x00011011, 0x10011011, 0x01011011, 0x11011011,
519 0x00111011, 0x10111011, 0x01111011, 0x11111011, 0x00000111,
520 0x10000111, 0x01000111, 0x11000111, 0x00100111, 0x10100111,
521 0x01100111, 0x11100111, 0x00010111, 0x10010111, 0x01010111,
522 0x11010111, 0x00110111, 0x10110111, 0x01110111, 0x11110111,
523 0x00001111, 0x10001111, 0x01001111, 0x11001111, 0x00101111,
524 0x10101111, 0x01101111, 0x11101111, 0x00011111, 0x10011111,
525 0x01011111, 0x11011111, 0x00111111, 0x10111111, 0x01111111,
526 0x11111111
527};
528
529IWRAM_CODE
530static inline
531u32
532decode_1bpp(u8 row, u8 flip_x) {
533 if (flip_x) {
534 return dec_byte_flip_x[row];
535 }
536 return dec_byte[row];
537}
538
539#else
405static u16 dec_nibble[] = { 540static u16 dec_nibble[] = {
406 0x0000, 0x1000, 0x0100, 0x1100, 541 0x0000, 0x1000, 0x0100, 0x1100,
407 0x0010, 0x1010, 0x0110, 0x1110, 542 0x0010, 0x1010, 0x0110, 0x1110,
@@ -427,6 +562,7 @@ decode_1bpp(u8 row, u8 flip_x) {
427 u16 *lut = dec_nibble; 562 u16 *lut = dec_nibble;
428 return (u32)lut[(row >> 0) & 0xF] << 16 | (u32)lut[(row >> 4) & 0xF]; 563 return (u32)lut[(row >> 0) & 0xF] << 16 | (u32)lut[(row >> 4) & 0xF];
429} 564}
565#endif
430 566
431IWRAM_CODE 567IWRAM_CODE
432static inline 568static inline
@@ -441,18 +577,34 @@ draw_2bpp_row(size_t x, size_t y, u8 a, u8 b, u8 flip_x) {
441 size_t shift_left = start_col * 4; 577 size_t shift_left = start_col * 4;
442 size_t shift_right = (8 - start_col) * 4; 578 size_t shift_right = (8 - start_col) * 4;
443 579
444 u32 *dst = &backbuf[tile_x * 8 + tile_y * 8 * 32 + start_row]; 580 u32 *dst = &backbuf[start_row + (tile_x + tile_y * 32) * 8];
581#if DEC_BIG_LUT
582 u32 *lut = dec_byte;
583 if (flip_x) {
584 lut = dec_byte_flip_x;
585 }
586#endif
445 if (start_col == 0) { 587 if (start_col == 0) {
588#if DEC_BIG_LUT
589 u32 clr_a = lut[a];
590 u32 clr_b = lut[b];
591#else
446 u32 clr_a = decode_1bpp(a, flip_x); 592 u32 clr_a = decode_1bpp(a, flip_x);
447 u32 clr_b = decode_1bpp(b, flip_x); 593 u32 clr_b = decode_1bpp(b, flip_x);
594#endif
448 u32 mask_a = (clr_a * 0xF); 595 u32 mask_a = (clr_a * 0xF);
449 u32 mask_b = (clr_b * 0xF); 596 u32 mask_b = (clr_b * 0xF);
450 u32 mask = (mask_a | mask_b); 597 u32 mask = (mask_a | mask_b);
451 u32 color = clr_a + (clr_b << 1); 598 u32 color = clr_a + (clr_b << 1);
452 dst[0] = (dst[0] & ~mask) | color; 599 dst[0] = (dst[0] & ~mask) | color;
453 } else { 600 } else {
601#if DEC_BIG_LUT
602 u32 clr_a = lut[a];
603 u32 clr_b = lut[b];
604#else
454 u32 clr_a = decode_1bpp(a, flip_x); 605 u32 clr_a = decode_1bpp(a, flip_x);
455 u32 clr_b = decode_1bpp(b, flip_x); 606 u32 clr_b = decode_1bpp(b, flip_x);
607#endif
456 u32 mask_a = (clr_a * 0xF); 608 u32 mask_a = (clr_a * 0xF);
457 u32 mask_b = (clr_b * 0xF); 609 u32 mask_b = (clr_b * 0xF);
458 u32 mask = (mask_a | mask_b); 610 u32 mask = (mask_a | mask_b);
@@ -477,7 +629,7 @@ draw_1bpp_row(size_t x, size_t y, u8 a, u8 clr, u8 flip_x) {
477 size_t shift_left = start_col * 4; 629 size_t shift_left = start_col * 4;
478 size_t shift_right = (8 - start_col) * 4; 630 size_t shift_right = (8 - start_col) * 4;
479 631
480 u32 *dst = &backbuf[tile_x * 8 + tile_y * 8 * 32]; 632 u32 *dst = &backbuf[(tile_x + tile_y * 32) * 8];
481 dst += start_row; 633 dst += start_row;
482 if (start_col == 0) { 634 if (start_col == 0) {
483 u32 color = decode_1bpp(a, flip_x); 635 u32 color = decode_1bpp(a, flip_x);